In [None]:
!pip install -qU bs4 tiktoken openai langchain pinecone-client[grpc] pypdf[full]

In [None]:
import os
from google.colab import drive

# Attach Google Drive to the Colab VM (forcing a fresh mount if one already exists).
drive.mount('/content/gdrive', force_remount=True)

root_dir = "/content/gdrive/My Drive/"
pdf_folder_path = f'{root_dir}Super Doctor/Pharmbot_fulldoc'

# Sanity check: show what the source folder contains before loading from it.
folder_entries = os.listdir(pdf_folder_path)
print(folder_entries)

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
# Load the PDFs found under pdf_folder_path into LangChain documents.
# NOTE(review): presumably this yields one document per PDF page - confirm
# against the loader's documentation.
loader = PyPDFDirectoryLoader(pdf_folder_path)
dataset = loader.load()

In [None]:
# Reduce each loaded document to a plain dict holding its source reference and
# its text. The 'rtdocs/' -> 'https://' substitution only affects sources that
# actually contain 'rtdocs/'; other source strings pass through unchanged.
data = [
    {
        'reference': doc.metadata['source'].replace('rtdocs/', 'https://'),
        'text': doc.page_content,
    }
    for doc in dataset
]

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return how many tokens ``text`` occupies under the module-level tokenizer.

    ``disallowed_special=()`` is passed so that no special-token string found
    in the text is rejected by the encoder.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [None]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators in order of preference: paragraph break, line break, space, and
# finally the empty string.
split_separators = ["\n\n", "\n", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=split_separators,
    length_function=tiktoken_len,  # sizes below are measured with tiktoken_len, i.e. in tokens
    chunk_size=1000,
    chunk_overlap=100,
)

In [None]:

from uuid import uuid4
from tqdm.auto import tqdm

chunks = []

# Split every document's text into pieces and record each piece together with
# a fresh unique id, its position within the document, and the document's
# source reference.
for record in tqdm(data):
    texts = text_splitter.split_text(record['text'])
    chunks.extend(
        {
            'id': str(uuid4()),
            'text': chunk_text,
            'chunk': chunk_no,
            'reference': record['reference'],
        }
        for chunk_no, chunk_text in enumerate(texts)
    )

Initialize Embedding Model

In [None]:
import os
from getpass import getpass

import openai

# Keep the secret out of the notebook: reuse OPENAI_API_KEY when it is already
# set in the environment, otherwise prompt for it without echoing the input.
openai_api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
# Export it so that clients created later with OpenAI() pick it up.
os.environ['OPENAI_API_KEY'] = openai_api_key

embed_model = "text-embedding-ada-002"

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone, PodSpec

# Keep the secret out of the notebook: reuse PINECONE_API_KEY when it is set,
# otherwise prompt once and remember it for the rest of the session.
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')
os.environ['PINECONE_API_KEY'] = pinecone_api_key

pc = Pinecone(api_key=pinecone_api_key)

index_name = 'sdp-cl'

# Create the index only when it does not exist yet, so that re-running this
# cell does not fail on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the length of the vectors upserted later;
        # assumes the embedding model produces 1536-dimensional vectors - confirm.
        dimension=1536,
        metric='cosine',
        spec=PodSpec(
            environment='gcp-starter',
        ),
    )

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

In [None]:
from tqdm.auto import tqdm
import datetime
from time import sleep
from openai import OpenAI, RateLimitError
client = OpenAI()

batch_size = 100  # how many embeddings we create and insert at once
max_embed_attempts = 10  # give up after this many rate-limited attempts per batch
retry_wait_seconds = 5  # pause between rate-limited attempts


def _embed_with_retry(batch_texts):
    """Embed ``batch_texts`` with ``embed_model``, retrying only on rate limits.

    Returns the response of ``client.embeddings.create``. A ``RateLimitError``
    is retried after ``retry_wait_seconds`` up to ``max_embed_attempts`` times
    and then re-raised; any other exception propagates immediately, so a bad
    key or bad input fails fast instead of looping forever.
    """
    for attempt in range(1, max_embed_attempts + 1):
        try:
            return client.embeddings.create(input=batch_texts, model=embed_model)
        except RateLimitError:
            if attempt == max_embed_attempts:
                raise
            sleep(retry_wait_seconds)


for i in tqdm(range(0, len(chunks), batch_size)):
    # slicing clamps at the end of the list, so the last batch may be shorter
    meta_batch = chunks[i:i + batch_size]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings
    res = _embed_with_retry(texts)
    embeds = [record.embedding for record in res.data]
    # keep only the fields that should be stored as vector metadata
    meta_batch = [{
        'text': x['text'],
        'chunk': x['chunk'],
        'reference': x['reference']
    } for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

Retrieval

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone, PodSpec

# Keep the secret out of the notebook: reuse PINECONE_API_KEY when it is set,
# otherwise prompt once and remember it for the rest of the session.
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')
os.environ['PINECONE_API_KEY'] = pinecone_api_key

pc = Pinecone(api_key=pinecone_api_key)

index_name = 'sdp-cl'

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00022,
 'namespaces': {'': {'vector_count': 22}},
 'total_vector_count': 22}

In [None]:
from openai import OpenAI
client = OpenAI()
query = "xxx"
#clindoc_query=str("")

# Embed the question with embed_model so it can be compared against the index.
query_embedding = client.embeddings.create(model=embed_model, input=[query])
xq = query_embedding.data[0].embedding

# Ask Pinecone for the five closest vectors, returning their stored metadata too.
res = index.query(vector=xq, top_k=5, include_metadata=True)

Retrieval Augmented Generation

In [None]:
# get list of retrieved text
# Collect the stored text of every retrieved match.
contexts = []
for match in res['matches']:
    contexts.append(match['metadata']['text'])

# Prompt layout: the retrieved passages separated by '---' rules, then a
# '-----' rule, then the original question.
context_block = "\n\n---\n\n".join(contexts)
augmented_query = context_block + "\n\n-----\n\n" + query

In [None]:
# Show the full prompt (retrieved contexts followed by the question) for inspection.
print(augmented_query)

In [None]:
# openai>=1.0 (the version this notebook's `from openai import OpenAI` requires)
# removed the module-level `openai.ChatCompletion` API; chat completions are
# created through the client object instead.
chat_response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": 'xxx'},
        {"role": "user", "content": augmented_query},
    ],
)
# The v1 response is a model object rather than a dict; convert it so that
# dict-style access such as res['choices'][0]['message']['content'] keeps working.
res = chat_response.model_dump()

In [None]:
from IPython.display import Markdown

# Pull the reply text of the first choice and render it as Markdown.
answer_text = res['choices'][0]['message']['content']
display(Markdown(answer_text))