In [62]:
# Importing required libraries
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os

In [63]:
# Adding api key
PINECONE_API_KEY = "48846a07-d3ce-48fb-b187-16360f241270"   # Not required in latest code
PINECONE_INDEX_NAME = "chatbot-medicine"

### Extract data and create embedding vectors

In [64]:
# Creating a loader to load pdf data
def load_pdf_data(data_path):
    loader = DirectoryLoader(
        path=data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
        use_multithreading=True
    )

    docs = loader.load()
    return docs

In [65]:
pdf_docs = load_pdf_data("data")

100%|██████████| 1/1 [00:16<00:00, 16.22s/it]


In [66]:
pdf_docs[40]

Document(page_content='The symptoms of CO poisoning in order of increas-\ning severity include:\n• headache\n• shortness of breath\n• dizziness\n• fatigue• mental confusion and difficulty thinking\n• loss of fine hand-eye coordination\n• nausea and vomiting• rapid heart rate\n• hallucinations\n• inability to execute voluntary movements accurately• collapse\n• lowered body temperature ( hypothermia )\n• coma• convulsions• seriously low blood pressure\n• cardiac and respiratory failure\n• death\nIn some cases, the skin, mucous membranes, and\nnails of a person with CO poisoning are cherry red orbright pink. Because the color change doesn’t alwaysoccur, it is an unreliable symptom to rely on for diagnosis.\nAlthough most CO poisoning is acute, or sudden, it is\npossible to suffer from chronic CO poisoning. This condi-tion exists when a person is exposed to low levels of the gasover a period of days to months. Symptoms are often vagueand include (in order of frequency) fatigue, headache,di

In [67]:
# Splitting the data into chunks
def get_text_chunks(data):
    # Initialize the text splitter class
    extracted_chunks = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=30,
    )
    # Get the split text/chunks using split_documents
    doc_split = extracted_chunks.split_documents(data)
    return doc_split

In [68]:
doc_chunks = get_text_chunks(pdf_docs)
doc_chunks[14]

Document(page_content='The Gale Encyclopedia of Medicine 2 is a medical ref-', metadata={'source': 'data\\medical-book.pdf', 'page': 4})

In [69]:
# Initializing the embedding model
def get_hugging_face_embedding():
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding

In [70]:
minilm_embedding = get_hugging_face_embedding()

In [71]:
# Adding pinecone api key to script environment
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [72]:
# Creating a vector store in pinecone
vectorstore = PineconeVectorStore.from_documents(doc_chunks, embedding=minilm_embedding, index_name=PINECONE_INDEX_NAME)

In [73]:
# vectorstore_local = vectorstore

In [74]:
ex_query = "What is Acne"
similar_search = vectorstore.similarity_search(query=ex_query)
similar_search

[Document(page_content='latex items such as glovesand condoms; and formaldehyde. Many people find thatthey are allergic to the nickel in inexpensive jewelry. ACDis usually confined to the area of skin that comes in contactwith the allergen, typically the hands or face. Symptomsrange from mild to severe and resemble those of ICD; apatch test may be needed to determine which kind of con-tact dermatitis a person is suffering from.', metadata={'page': 297.0, 'source': 'data\\medical-book.pdf'}),
 Document(page_content='repeated exposure to an allergen (an allergy-causing sub-stance) triggers an immune response that inflames theskin. Tens of thousands of drugs, pesticides, cosmetics,food additives, commercial chemicals, and other sub-stances have been identified as potential allergens. Fewerthan 30, however, are responsible the majority of ACDcases. Common culprits include poison ivy, poison oak,and poison sumac; fragrances and preservatives in cosmet-ics and personal care products; latex i

In [75]:
# Adding LLM prompt engineering
llm_prompt = """You are a knowledgeable assistant. Based on the provided context, answer the following question succinctly and clearly.
Context: {context}
Question: {question}
Answer:
"""


In [76]:
# Creating a prompt template
prompt_template = PromptTemplate(template=llm_prompt, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": prompt_template}


In [86]:
# Instantiate llama-2 llm model
llm = CTransformers(model="model\llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={'max_new_tokens':100,
                            'temperature': 0.3}
                    )

In [87]:
# Creating question-ans obj
qa_obj = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

#### Was getting several issues with RetrievalQA.from_chain_type retriever. Issue got resolved after upgraded langchain version to 0.1.10 (any version > 0.1.0 will suffice)

In [88]:
# Function to print results cleanly (fixing the repeating statement)
def print_result(result):
    if 'result' in result:
        print("Result: ", result['result'])
    if 'source_documents' in result:
        print("\nSource Documents:")
        for doc in result['source_documents']:
            print(f"- {doc.metadata['source']}")

In [89]:
# Question answring session
try:
    while True:
        user_input = input("Ask your query related to general medicine and disease: ")
        result = qa_obj.invoke({"query": user_input})
        print_result(result)
        # print("Result: ", result['result'])
except KeyboardInterrupt:
    print("Query ended good bye !")

Result:  The best way of course, The ayur dermat into the following remedy of course,
Herbalmild cases of course of course of course of course of course of course of course of course of course of course of course of course of course of course,
Ayogaroma)  Herbalancedotes, an experienced herbalmild case of course of course of course of course of course of course of course of course of course of course of course of course of course

Source Documents:
- data\medical-book.pdf
- data\medical-book.pdf
- data\medical-book.pdf
- data\medical-book.pdf
Query ended good bye !
