Preprocessing

In [None]:
!pip install -qU bs4 tiktoken openai langchain pinecone-client[grpc] pypdf[full]

In [None]:
pdf_folder_path = "sample-location" # placeholder: directory holding the clinical PDF documents to index

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
# Load the PDFs found under pdf_folder_path. Each returned item exposes
# .metadata['source'] and .page_content, which the next cell reads.
loader = PyPDFDirectoryLoader(pdf_folder_path)
dataset = loader.load()

In [None]:
# Keep only the two fields used downstream: where the text came from
# ('reference') and the extracted text itself. Any 'rtdocs/' fragment in the
# source path is rewritten to 'https://'.
data = [
    {
        'reference': doc.metadata['source'].replace('rtdocs/', 'https://'),
        'text': doc.page_content,
    }
    for doc in dataset
]

In [None]:
import tiktoken

# Shared encoder used to measure text length in tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens that ``text`` encodes to.

    ``disallowed_special=()`` is passed so that no special-token marker in the
    input is treated as disallowed; such text is encoded like ordinary text.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration. Lengths are measured with tiktoken_len, so
# chunk_size and chunk_overlap are token counts rather than character counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=tiktoken_len,
    # Separators run from coarsest to finest: paragraph, line, space, then any character.
    separators=["\n\n", "\n", " ", ""]
)

In [None]:
from uuid import uuid4
from tqdm.auto import tqdm

chunks = []

# Split every record into token-bounded pieces. Each piece gets a fresh random
# id, its position within the record ('chunk'), and the record's reference.
for record in tqdm(data):
    texts = text_splitter.split_text(record['text'])
    chunks.extend(
        {
            'id': str(uuid4()),
            'text': text,
            'chunk': i,
            'reference': record['reference'],
        }
        for i, text in enumerate(texts)
    )

Embedding Model

In [None]:
import openai

import os

# Read the OpenAI key from the environment instead of pasting it into the
# notebook, where it could be committed or shared by accident. If the variable
# is unset, this falls back to the original empty placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Embedding model used for both the document chunks and the query.
embed_model = "text-embedding-ada-002"

Vector Storage

In [None]:
import pinecone

import os

index_name = 'sample-vs'

# Read the Pinecone key from the environment instead of hard-coding it in the
# notebook. If the variable is unset, this falls back to the original empty
# placeholder.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", ""),
    environment="gcp-starter"
)

# Create the index only on first run. The dimension must equal the length of
# the vectors produced by the embedding model (1536 for text-embedding-ada-002).
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric='cosine'
    )

index = pinecone.Index(index_name)
index.describe_index_stats()

In [None]:
from tqdm.auto import tqdm
import datetime
from time import sleep
from openai import OpenAI
# A bare OpenAI() does not read openai.api_key, so the key configured in the
# "Embedding Model" cell is passed explicitly. When it is empty, None lets the
# client fall back to the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=openai.api_key or None)

batch_size = 100   # chunks embedded and upserted per request
max_retries = 5    # extra attempts after the first failure before giving up
retry_delay = 5    # seconds to wait between attempts

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    meta_batch = chunks[i:i_end]
    ids_batch = [x['id'] for x in meta_batch]
    texts = [x['text'] for x in meta_batch]

    # Retry transient failures a bounded number of times. Catching Exception
    # (not a bare except) keeps Ctrl-C working. Re-raising after the last
    # attempt surfaces permanent errors such as a bad key instead of looping
    # forever.
    for attempt in range(max_retries + 1):
        try:
            res = client.embeddings.create(input=texts, model=embed_model)
            break
        except Exception:
            if attempt == max_retries:
                raise
            sleep(retry_delay)

    embeds = [record.embedding for record in res.data]
    # Metadata stored next to each vector; the id is passed separately.
    metadata_batch = [{
        'text': x['text'],
        'chunk': x['chunk'],
        'reference': x['reference']
    } for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, metadata_batch))
    index.upsert(vectors=to_upsert)

Retrieval Agent

In [None]:
import pinecone

import os

index_name = 'sample-vs'

# Read the Pinecone key from the environment instead of hard-coding it in the
# notebook. If the variable is unset, this falls back to the original empty
# placeholder.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", ""),
    environment="gcp-starter"
)

# Connect to the already-populated index and show its stats as a sanity check.
index = pinecone.Index(index_name)
index.describe_index_stats()

In [None]:
from openai import OpenAI
import os

# Take the key from the environment. An explicit empty string would override
# the client's own environment lookup and send requests with no usable key.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

query = ""  # clinical query

# Embed the query with the same model that was used for the document chunks.
query_embedding = client.embeddings.create(
    input=[query],
    model=embed_model
)

xq = query_embedding.data[0].embedding
# `res` holds the search result that the next section reads. The vector is
# passed by keyword so the call does not depend on parameter order.
res = index.query(vector=xq, top_k=10, include_metadata=True)

Response Generation

In [None]:
# Text of every chunk returned by the vector search.
contexts = [match['metadata']['text'] for match in res['matches']]
# Retrieved passages first (separated by ---), then a ----- divider, then the question.
context_block = "\n\n---\n\n".join(contexts)
augmented_query = "\n\n-----\n\n".join([context_block, query])

In [None]:
# Show the full prompt (retrieved contexts followed by the query) before it is sent to the model.
print(augmented_query)

LLM Integration (GPT-4)

In [None]:
from openai import OpenAI
client = OpenAI()

# The system prompt is left blank as a placeholder. The user turn carries the
# retrieved contexts plus the question.
chat_messages = [
    {"role": "system", "content": ""},  # System Prompt
    {"role": "user", "content": augmented_query},
]

response = client.chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
)

In [None]:
# The v1 client returns a response object, not a dict: use attribute access,
# as the embedding cells already do with res.data[0].embedding.
response.choices[0].message.content