In [None]:
# First, uninstall any existing installations of torch
!pip uninstall -y torch torchaudio torchvision

# Install the latest stable version of PyTorch with the appropriate CUDA version
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121


In [None]:
!pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  pinecone-client==2.2.2 \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain==0.0.240 \
  xformers==0.0.20 \
  bitsandbytes==0.41.0

In [None]:

from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Identifier of the sentence-embedding model to load.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device string is supplied both when constructing the model and
# when encoding; texts are encoded with a batch size of 32.
embedding_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

In [None]:

import os
import pinecone
from google.colab import userdata

# API key from app.pinecone.io and environment from the Pinecone console.
# Each setting is read from the process environment first; when the variable
# is unset or empty, the Colab secret with the same name is used instead.
# (The lookup key must be the variable *name*, not the secret value itself.)
PINECONE_API_KEY = (
    os.environ.get('PINECONE_API_KEY') or userdata.get('PINECONE_API_KEY')
)
PINECONE_ENVIRONMENT = (
    os.environ.get('PINECONE_ENVIRONMENT') or userdata.get('PINECONE_ENVIRONMENT')
)

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT,
)

In [None]:

import time

# Name of the Pinecone index used by this notebook.
index_name = 'nlp-chatbot'
# Records whether this run had to create the index.
createdIndex = False

index_missing = index_name not in pinecone.list_indexes()
if index_missing:
    # 384 presumably matches the output size of the embedding model
    # configured earlier -- TODO confirm.
    pinecone.create_index(index_name, dimension=384)
    createdIndex = True
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Handle used for all subsequent operations on the index.
index = pinecone.Index(index_name)

In [None]:
from langchain.vectorstores import Pinecone
# Metadata key under which each entry's text is stored
text_field = 'text'

# Build the LangChain Pinecone vectorstore from the index handle, the
# query-embedding callable and the text key
query_embedder = embedding_model.embed_query
vectorstore = Pinecone(index, query_embedder, text_field)

In [1]:
def _iter_qa_records(json_data):
    """Yield one ``(vector_id, text, metadata)`` tuple per question in ``json_data``."""
    for item in json_data['data']:
        title = item['title']
        for para in item['paragraphs']:
            context = para['context']
            for qas in para['qas']:
                question = qas['question']
                answers = [answer['text'] for answer in qas['answers']]

                # Combine title, context, question, and answers into a single text for embedding
                full_text = f"Title: {title}\nContext: {context}\nQuestion: {question}\nAnswers: {', '.join(answers)}"

                # A fresh dict per record: the store may add its own keys to it.
                metadata = {
                    "id": qas['id'],
                    "title": title,
                    "context": context,
                }
                yield qas['id'], full_text, metadata


def _add_batch(store, batch):
    """Send one batch of ``(vector_id, text, metadata)`` tuples to ``store.add_texts``."""
    ids, texts, metadatas = (list(column) for column in zip(*batch))
    store.add_texts(texts, metadatas, ids=ids)


# Function to add data to Pinecone with metadata
def add_data_to_pinecone(json_data, store=None, batch_size=100):
    """Add every question/answer entry of ``json_data`` to a vector store.

    The input is walked as ``data -> paragraphs -> qas``. For each question a
    single text combining title, context, question and answers is built and
    stored together with metadata holding the question id, title and context.

    Entries are sent in batches instead of one call per question, and each
    question's own ``id`` is passed as the vector id, so running the function
    again overwrites the existing entries rather than adding duplicates under
    new random ids.

    Args:
        json_data: Mapping with a ``'data'`` list; each item has ``'title'``
            and ``'paragraphs'``, each paragraph has ``'context'`` and
            ``'qas'``, and each qa has ``'id'``, ``'question'`` and
            ``'answers'`` (a list of mappings with a ``'text'`` key).
        store: Object exposing ``add_texts(texts, metadatas, ids=...)``.
            Defaults to the notebook-level ``vectorstore``.
        batch_size: Maximum number of entries per ``add_texts`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If an expected key is missing from ``json_data``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Resolved lazily so the global is only required when no store is given.
    target = vectorstore if store is None else store

    batch = []
    for record in _iter_qa_records(json_data):
        batch.append(record)
        if len(batch) >= batch_size:
            _add_batch(target, batch)
            batch = []
    # Flush the final, possibly partial, batch.
    if batch:
        _add_batch(target, batch)


In [None]:
import json

# Location of the dataset file to load.
file_path = '/content/sample_data/dev-v1.1.json'
# Read with an explicit UTF-8 encoding so decoding does not depend on the
# platform's default locale.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Add every entry of the loaded dataset to the vector store.
add_data_to_pinecone(data)

In [None]:

# Query the vector store with a sample question. The query is passed as plain
# text; the previous literal wrapped it in an extra pair of single quotes,
# which became part of the text being embedded.
vectorstore.similarity_search(
    "Which NFL team represented the AFC at Super Bowl 50?",  # the search query
    k=3  # number of most similar entries to return
)