In [1]:
import os
import pandas as pd
import openai
import uuid
from tqdm import tqdm
from pinecone import Pinecone, PodSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter #to split the text into smaller chunks
from langchain.embeddings.openai import OpenAIEmbeddings #to convert text to embeddings for Pinecone vector DB

In [2]:
#configuration: credentials come from the environment, index/namespace names are fixed
#(each credential is None when its environment variable is not set)
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_ENVIRONMENT")

#pinecone index to use, and the namespace inside it that holds the healthhub vectors
pinecone_index_name = 'test'
pinecone_namespace = 'healthhub'

In [3]:
#pinecone client and pod spec, both built from the environment-provided settings
pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)
spec = PodSpec(environment=pinecone_environment)

#fail fast when the configured index is not listed; otherwise keep a handle to it
if pinecone_index_name not in pc.list_indexes().names():
    raise Exception('Pinecone Index not set up or running')
index = pc.Index(pinecone_index_name)

#text splitter: larger chunk sizes retain more context, but mean more expensive API calls to OpenAI
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)

#embedding model used to turn each text chunk into a vector for the Pinecone vector DB
embed = OpenAIEmbeddings(openai_api_key=openai_api_key, model='text-embedding-ada-002')

  warn_deprecated(


In [4]:
#function to read healthhub data into pinecone
def healthhub_data_into_pinecone(csv_path: str = "documents/healthhub_data.csv", batch_size: int = 71) -> None:
    """Chunk, embed and upsert every row of the HealthHub CSV into Pinecone.

    For each row the ``title`` and ``text`` columns are joined with a single
    space, split with the module-level ``text_splitter``, embedded with the
    module-level ``embed`` object and upserted into ``index`` under
    ``pinecone_namespace``. Every vector gets a fresh random ``uuid4`` hex id,
    so calling this function again inserts new copies rather than overwriting
    the earlier ones.

    Args:
        csv_path: Path of the CSV file to ingest. It must contain ``title``
            and ``text`` columns.
        batch_size: Number of pending vectors that triggers an upsert. The
            default of 71 keeps the flush threshold this function has always
            used; it can be raised towards the ~100 vectors per request that
            the original comment cites as the recommendation.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the CSV has no ``title`` or ``text`` column.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Read both columns as strings so that numeric-looking cells can still be
    # concatenated below.
    df = pd.read_csv(csv_path, dtype={"title": str, "text": str})

    # Empty cells are read as NaN; concatenating NaN with a str raises
    # TypeError and would abort the whole (multi-hour) ingestion part-way
    # through. Treat a missing title/text as an empty string instead.
    titles = df["title"].fillna("")
    bodies = df["text"].fillna("")

    vectors = []

    # Iterate over the values directly instead of df[col][i], which is a
    # label-based lookup and only works with the default RangeIndex.
    for title, body in tqdm(zip(titles, bodies), total=len(df)):
        text = title + " " + body

        chunks = text_splitter.split_text(text) #splitting the text into smaller chunks
        if not chunks:
            # Nothing to embed for this row (e.g. both columns were empty).
            continue

        # Hand all chunks of the row to the embedder in one call rather than
        # issuing one embed_query call per chunk.
        embeddings = embed.embed_documents(chunks)

        for chunk, values in zip(chunks, embeddings):
            vectors.append(
                {
                    'metadata': {
                        'text': chunk
                    },
                    'id': uuid.uuid4().hex,
                    'values': values
                }
            )

            # Upsert in batches so no single request grows without bound.
            if len(vectors) >= batch_size:
                index.upsert(vectors, namespace=pinecone_namespace)
                vectors = []

    # Flush whatever is left over after the last full batch.
    if vectors:
        index.upsert(vectors, namespace=pinecone_namespace)
    

In [5]:
#healthhub_data_into_pinecone() #uncomment to insert all the healthhub data into pinecone (slow: the recorded run below took over 3 hours for 974 rows; every run inserts new vectors with fresh random ids)
#index.delete(delete_all=True, namespace=pinecone_namespace) #uncomment to delete all healthhub vectors in this namespace of the index

100%|██████████| 974/974 [3:11:05<00:00, 11.77s/it]  
