In [1]:
!pip install ipykernel
!pip install datasets
!pip install git-lfs
!pip install torch==2.1.0
!pip install --upgrade torchaudio torchdata torchtext torchvision
!pip install accelerate
!pip install evaluate
!pip install tensorflow
!pip install sentence-transformers
!pip install transformers==4.31.0
!pip install sentence-transformers==2.2.2
!pip install pinecone-client==2.2.2
!pip install datasets==2.14.0
!pip install accelerate==0.21.0
!pip install einops==0.6.1
!pip install langchain==0.0.240
!pip install xformers==0.0.20
!pip install bitsandbytes==0.41.0




In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from huggingface_hub import notebook_login
from datasets import Dataset
from datasets import load_dataset
import transformers

from evaluate import load


import torch

from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from google.colab import drive
import tensorflow as tf

from torch import cuda, bfloat16
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os
import pinecone
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from transformers import StoppingCriteria, StoppingCriteriaList
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
import time
import sys
import json
from secrets import secrets 





In [3]:
drive.mount('/content/drive')
filepath = '/content/drive/My Drive/dataspeak/notebooks/dataspeak'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df_best_score = pd.read_csv(filepath, encoding='iso-8859-1')
#print(df_best_score.sample(10))


In [5]:
#df_best_score.info()

In [6]:
small_train_dataset, small_eval_dataset = train_test_split(df_best_score, test_size=0.02, train_size=0.05, random_state=54321)

small_train_dataset = small_train_dataset.reset_index(drop=True)
small_eval_dataset = small_eval_dataset.reset_index(drop=True)


In [7]:
train_title = small_train_dataset['title']
train_body_questions = small_train_dataset['body_question']
train_body_answers = small_train_dataset['body_answer']

#eval_title = small_eval_dataset['title']
#eval_body_questions = small_eval_dataset['body_question']
#eval_body_answers = small_eval_dataset['body_answer']

In [8]:
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

In [9]:
docs = [

    train_body_answers
]

embeddings = embed_model.embed_documents(docs)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")

We have 1 doc embeddings, each with a dimensionality of 384.


In [10]:
pinecone_api = secrets.get('PINECONE_API') 
pinecone_env = secrets.get('PINECONE_ENVIRON')
pinecone.init(
    api_key=os.environ.get(pinecone_api) or pinecone_api,
    environment=os.environ.get(pinecone_env) or pinecone_env
)

In [11]:
index_name = 'dataspeak-qa'

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine'
    )


In [12]:
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.30728,
 'namespaces': {'': {'vector_count': 30728}},
 'total_vector_count': 30728}

In [13]:
# Create or retrieve the Index
index_info = index.describe_index_stats()

if index_info['total_vector_count'] == 0:
    batch_size = 32
    max_metadata_size = 39000

    for i in range(0, len(small_train_dataset), batch_size):
        i_end = min(len(small_train_dataset), i + batch_size)
        batch = small_train_dataset.iloc[i:i_end]
        ids = [f"{x['id_question']}-{x['id_answer']}" for i, x in batch.iterrows()]
        texts = [(x['body_answer']) for i, x in batch.iterrows()]
        embeds = embed_model.embed_documents(texts)
        metadata = [
            {'text': x['body_answer']}
            for i, x in batch.iterrows()
        ]

        metadata_json = json.dumps(metadata, ensure_ascii=False)
        metadata_size = sys.getsizeof(metadata_json)

        if metadata_size > max_metadata_size:

            truncated_metadata = metadata[:20000] # Truncate text

            truncated_metadata_json = json.dumps(truncated_metadata, ensure_ascii=False)
            truncated_metadata_size = sys.getsizeof(truncated_metadata_json)

            index.upsert(vectors=zip(ids, embeds, truncated_metadata))
        else:
            index.upsert(vectors=zip(ids, embeds, metadata))
else:
  print('Vectors already exist. Please use existing index or start over.')

Vectors already exist. Please use existing index or start over.


In [14]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.30728,
 'namespaces': {'': {'vector_count': 30728}},
 'total_vector_count': 30728}

In [15]:
model_id = 'meta-llama/Llama-2-13b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [16]:
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [17]:
hf_key = secrets.get('HUGGING_FACE_TOKEN')

hf_auth = hf_key
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

In [18]:
cache_dir = './content/drive/My Drive/dataspeak/'

In [19]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    #cache_dir = cache_dir,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
model.eval()

print(f"Model loaded on {device}")

Model loaded on cuda:0


In [21]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    #cache_dir=cache_dir,
    use_auth_token=hf_auth
)



In [22]:
stop_list = ['\nContext:', '\n```\n', '\nAnswer:']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([    1, 29871,    13,  2677, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0'),
 tensor([    1, 29871,    13, 22550, 29901], device='cuda:0')]

In [23]:
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [24]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,
    task='text-generation',
    temperature=0.0,
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [25]:
llm = HuggingFacePipeline(pipeline=generate_text)

In [26]:
text_field = 'text'

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

In [27]:
query = 'tell me about python'

search_result = retriever=vectorstore.similarity_search(query,
    k=5  # returns top 5 most relevant chunks of text
)

print(search_result)

tags=['Pinecone'] metadata=None vectorstore=<langchain.vectorstores.pinecone.Pinecone object at 0x7e403c6fbf10> search_type='similarity' search_kwargs={}


In [28]:
question_answer = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=vectorstore.as_retriever(k=5)
)



In [29]:
chat_history = []
def chatting(input):
  query = input
  result = question_answer({'query': query, 'chat_history': chat_history})
  chat_history.append(result['result'])
  print(result['result'])

  return result['result']



 Scipy is a scientific computing library for Python."


In [None]:
chatting('What is scipy?')

In [None]:
chatting('What is scikit-learn and how is it helpful?')

In [None]:
chatting('tell me about python')