In [1]:
# Importing required libraries
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os
import torch

  from tqdm.autonotebook import tqdm


In [2]:
# Pinecone configuration.
# The API key is a secret and must not be stored in the notebook, so it is read
# from the PINECONE_API_KEY environment variable. Set it before starting
# Jupyter (e.g. `export PINECONE_API_KEY=...`).
# NOTE: the key that used to be hardcoded in this cell has been exposed and
# should be revoked and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    # Warn instead of raising so the cell itself still runs on a fresh kernel;
    # the Pinecone calls further down cannot authenticate without the key.
    print("WARNING: PINECONE_API_KEY is not set; Pinecone calls will fail until it is provided.")
PINECONE_INDEX_NAME = "medi-chat-2024"

### Extract data and create embedding vectors

In [3]:
# Creating a loader to load pdf data
def load_pdf_data(data_path):
    """Load the PDF files found in ``data_path``.

    Args:
        data_path: Directory handed to ``DirectoryLoader`` as its ``path``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the files matching
        ``*.pdf``, each file being read with ``PyPDFLoader``.
    """
    loader_options = {
        "glob": "*.pdf",
        "loader_cls": PyPDFLoader,
        "show_progress": True,       # progress bar while loading
        "use_multithreading": True,
    }
    pdf_loader = DirectoryLoader(path=data_path, **loader_options)
    return pdf_loader.load()

In [6]:
# Load the PDFs under ../data (relative to the notebook's working directory).
pdf_docs = load_pdf_data("../data")

100%|██████████| 1/1 [00:14<00:00, 14.01s/it]


In [7]:
# Spot-check one loaded document to see what the extracted text looks like.
pdf_docs[40]

Document(page_content='The symptoms of CO poisoning in order of increas-\ning severity include:\n• headache\n• shortness of breath\n• dizziness\n• fatigue• mental confusion and difficulty thinking\n• loss of fine hand-eye coordination\n• nausea and vomiting• rapid heart rate\n• hallucinations\n• inability to execute voluntary movements accurately• collapse\n• lowered body temperature ( hypothermia )\n• coma• convulsions• seriously low blood pressure\n• cardiac and respiratory failure\n• death\nIn some cases, the skin, mucous membranes, and\nnails of a person with CO poisoning are cherry red orbright pink. Because the color change doesn’t alwaysoccur, it is an unreliable symptom to rely on for diagnosis.\nAlthough most CO poisoning is acute, or sudden, it is\npossible to suffer from chronic CO poisoning. This condi-tion exists when a person is exposed to low levels of the gasover a period of days to months. Symptoms are often vagueand include (in order of frequency) fatigue, headache,di

In [8]:
# Splitting the data into chunks
def get_text_chunks(data, chunk_size=500, chunk_overlap=30):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split (e.g. the output of ``load_pdf_data``).
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value this
            notebook has always used.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 30.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    # Size and overlap are parameters so that different chunkings can be tried
    # without editing this function; the defaults keep existing calls unchanged.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [9]:
# Split the loaded documents into chunks, then display one chunk as a sanity check.
doc_chunks = get_text_chunks(pdf_docs)
doc_chunks[14]

Document(page_content='Volume 5: T-Z ........................................ 3237\nOrganizations ............................................ 3603\nGeneral Index ............................................ 3625\nGALE ENCYCLOPEDIA OF MEDICINE 2 VCONTENTS', metadata={'source': '..\\data\\medical-book.pdf', 'page': 3})

In [10]:
# Initializing the embedding model
def get_hugging_face_embedding():
    """Create the Hugging Face embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [11]:
# Build the embedding model once; it is passed to the vector store helper below.
minilm_embedding = get_hugging_face_embedding()

In [12]:
# Export the Pinecone API key to the process environment. The PineconeVectorStore
# calls further down are given no key explicitly, so presumably they read it from
# PINECONE_API_KEY — confirm against the langchain_pinecone documentation.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [13]:
import time

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Vector size of the sentence-transformers/all-MiniLM-L6-v2 embeddings used in
# this notebook; the index dimension has to match the embedding model's output.
EMBEDDING_DIMENSION = 384
# Upper bound (seconds) on how long to wait for a freshly created index.
INDEX_READY_TIMEOUT_S = 120

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = PINECONE_INDEX_NAME

# Create the serverless index only if it does not exist yet; re-running the
# cell against an existing index is a no-op.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

    # A new index may not be ready to serve requests as soon as create_index
    # returns. Wait until it reports ready so that the following cells
    # (describe_index_stats / upsert) do not hit a half-initialised index.
    ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)


In [14]:
# Function to create a vector store in pinecone
# vectorstore = PineconeVectorStore.from_documents(doc_chunks, embedding=minilm_embedding, index_name=PINECONE_INDEX_NAME)
# Getting the vector details
def get_pinecone_vectorestore(doc_chunks, embedding, index_name):
    """Return a Pinecone-backed vector store for ``index_name``.

    The index statistics are fetched through the module-level ``pc`` client and
    printed. If the index reports zero vectors, ``doc_chunks`` are embedded and
    uploaded via ``PineconeVectorStore.from_documents``; otherwise the existing
    index is wrapped as-is and ``doc_chunks`` is not used.

    Args:
        doc_chunks: Document chunks to upload when the index is empty.
        embedding: Embedding model passed to the vector store.
        index_name: Name of the Pinecone index.

    Returns:
        A ``PineconeVectorStore`` bound to ``index_name``.
    """
    index_stats = pc.Index(index_name).describe_index_stats()
    print(index_stats)

    index_has_vectors = index_stats['total_vector_count'] != 0
    if index_has_vectors:
        # Already populated: attach to it without re-uploading anything.
        return PineconeVectorStore(index_name=index_name, embedding=embedding)

    # Empty index: embed and upload the chunks now.
    return PineconeVectorStore.from_documents(
        doc_chunks,
        embedding=embedding,
        index_name=index_name,
    )

In [15]:
# Attach to the Pinecone index, uploading the chunks only if the index is still empty.
# The helper prints the index statistics.
vectorstore = get_pinecone_vectorestore(doc_chunks=doc_chunks, embedding=minilm_embedding, index_name=PINECONE_INDEX_NAME)

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 8462}},
 'total_vector_count': 8462}


In [16]:
# Sanity-check retrieval: run a similarity search for a sample question and
# display the documents that come back.
ex_query = "What is Melanoma"
similar_search = vectorstore.similarity_search(query=ex_query)
similar_search

  attn_output = torch.nn.functional.scaled_dot_product_attention(


[Document(page_content='Intraocular melanoma is a rare cancer overall, yet it\nis the most common eye cancer seen in adults. It is whencancer cells are found in the uvea of the eye. The uveaincludes the iris (the colored portion of eye), the ciliarybody (an eye muscle that focuses the lens) and thechoroid (found in the back of the eye next to the retina).Intraocular cancer of the iris usually grows slowly', metadata={'page': 646.0, 'source': 'data\\medical-book.pdf'}),
 Document(page_content='pigment cells (melanocytes).\n• Sarcomas are cancers of the supporting tissues of the\nbody, such as bone, muscle and blood vessels.\n• Cancers of the blood and lymph glands are called\nleukemias and lymphomas respectively.\n• Gliomas are cancers of the nerve tissue.\nCauses and symptoms\nThe major risk factors for cancer are: tobacco, alco-\nhol, diet, sexual and reproductive behavior, infectiousagents, family history, occupation, environment and pol-lution.', metadata={'page': 20.0, 'source': 'd

In [44]:
# Prompt text sent to the LLM. {context} and {question} are placeholders; they
# are declared as the template's input variables where the PromptTemplate is built.
llm_prompt = """You are a knowledgeable assistant. Based on the provided context, provide precise to the point answer.
Do not repeat the same line of response if the answer is not known. Just mention that you do not know the answer.
Context: {context}
Question: {question}
Answer:
"""


In [45]:
# Wrap the raw prompt text in a PromptTemplate and package it as the keyword
# arguments that are later passed to RetrievalQA.from_chain_type as chain_type_kwargs.
prompt_template = PromptTemplate(template=llm_prompt, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": prompt_template}


In [46]:
# Instantiate the llama-2 LLM model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Model running on {device}")
# The model path uses forward slashes only: the previous "../model\llama-..."
# contained the invalid escape sequence "\l" and only resolved on Windows.
# NOTE(review): ctransformers normally enables GPU offload through
# config['gpu_layers']; confirm that the `device` keyword below has any effect.
llm = CTransformers(model="../model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    device=device,
                    config={'max_new_tokens': 200,
                            'temperature': 0.3,
                            # Retrieved context + question + 200 new tokens can
                            # exceed a small default context window, which is the
                            # likely cause of garbled, repeating answers. Ask for
                            # a larger window explicitly.
                            'context_length': 2048}
                    )

Model running on cuda


In [47]:
# Build the question-answering chain: documents come from the Pinecone vector
# store's retriever, the LLM answers using the custom prompt supplied through
# chain_type_kwargs, and return_source_documents=True asks the chain to return
# the retrieved documents together with the answer.
qa_obj = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

#### Note: the retriever passed to `RetrievalQA.from_chain_type` caused several errors until LangChain was upgraded to version 0.1.10 (any version > 0.1.0 should suffice).

In [48]:
# Function to print results cleanly (fixing the repeating statement)
def print_result(result):
    """Print the answer held under the ``'result'`` key of ``result``.

    Nothing is printed when the key is absent. Source documents are
    deliberately not printed.

    Args:
        result: Mapping returned by the QA chain.
    """
    if 'result' not in result:
        return
    answer = result['result']
    print("Result: ", answer)

In [49]:
# Question answering session: a single question is read from the user, run
# through the QA chain, and the answer is printed. The surrounding loop is
# currently disabled.
# while True:
user_input = input("Ask your query related to general medicine and disease: ")
result = qa_obj.invoke({"query": user_input})
print_result(result)
# print("Result: ", result['result'])

Result:  Breast cancer is a st cancer is a s t cancer is ast cancer is a st cancer is a st cancer is astal mass cancer occurs when we can beast cancer is a s tcancerhaease cancer,  Breast cancer is a st cancer is ast cancer is ast cancer is a s t cancer is a s t cancer is a s t cancer is a s t cancer is ast cancer is ast cancer is aSt Cancerrast cancer is a st cancer is a st cancer is ast cancer is ast cancer is a s t cancer is a s t cancer is a s t cancer is a st cancer is ast cancer is ast cancer is a st cancer is a st cancer is ast cancer is ast cancer,
Breast cancer is aSt Cancerrast cancer is ast cancer is ast cancer is a s t cancer is a s t cancer is a st cancer is a s tcancerha
