In [184]:
# Importing required libraries
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os

In [185]:
# Adding api key
PINECONE_API_KEY = "3f0e285f-3731-453e-9c68-d4a4563070b5"
PINECONE_INDEX_NAME = "medi-chat-2024"

### Extract data and create embedding vectors

In [186]:
# Creating a loader to load pdf data
def load_pdf_data(data_path):
    loader = DirectoryLoader(
        path=data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
        use_multithreading=True
    )

    docs = loader.load()
    return docs

In [187]:
pdf_docs = load_pdf_data("data")

100%|██████████| 1/1 [00:14<00:00, 14.61s/it]


In [188]:
pdf_docs[40]

Document(page_content='The symptoms of CO poisoning in order of increas-\ning severity include:\n• headache\n• shortness of breath\n• dizziness\n• fatigue• mental confusion and difficulty thinking\n• loss of fine hand-eye coordination\n• nausea and vomiting• rapid heart rate\n• hallucinations\n• inability to execute voluntary movements accurately• collapse\n• lowered body temperature ( hypothermia )\n• coma• convulsions• seriously low blood pressure\n• cardiac and respiratory failure\n• death\nIn some cases, the skin, mucous membranes, and\nnails of a person with CO poisoning are cherry red orbright pink. Because the color change doesn’t alwaysoccur, it is an unreliable symptom to rely on for diagnosis.\nAlthough most CO poisoning is acute, or sudden, it is\npossible to suffer from chronic CO poisoning. This condi-tion exists when a person is exposed to low levels of the gasover a period of days to months. Symptoms are often vagueand include (in order of frequency) fatigue, headache,di

In [189]:
# Splitting the data into chunks
def get_text_chunks(data):
    # Initialize the text splitter class
    extracted_chunks = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=30,
    )
    # Get the split text/chunks using split_documents
    doc_split = extracted_chunks.split_documents(data)
    return doc_split

In [190]:
doc_chunks = get_text_chunks(pdf_docs)
doc_chunks[14]

Document(page_content='Volume 5: T-Z ........................................ 3237\nOrganizations ............................................ 3603\nGeneral Index ............................................ 3625\nGALE ENCYCLOPEDIA OF MEDICINE 2 VCONTENTS', metadata={'source': 'data\\medical-book.pdf', 'page': 3})

In [191]:
# Initializing the embedding model
def get_hugging_face_embedding():
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding

In [192]:
minilm_embedding = get_hugging_face_embedding()

In [193]:
# Adding pinecone api key to script environment
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [194]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = PINECONE_INDEX_NAME

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 


In [195]:
# Function to create a vector store in pinecone
# vectorstore = PineconeVectorStore.from_documents(doc_chunks, embedding=minilm_embedding, index_name=PINECONE_INDEX_NAME)
# Getting the vector details
vector_details = pc.Index(index_name).describe_index_stats()
print(vector_details)
def get_pinecone_vectorestore(doc_chunks, embedding, index_name):
    if vector_details['total_vector_count'] == 0:
        vectorstore = PineconeVectorStore.from_documents(doc_chunks, embedding=embedding, index_name=index_name)
    else:
        vectorstore = PineconeVectorStore(index_name=index_name, embedding=embedding)

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}


In [196]:
vectorstore = get_pinecone_vectorestore(doc_chunks=doc_chunks, embedding=minilm_embedding, index_name=PINECONE_INDEX_NAME)

In [197]:
ex_query = "What is Acne"
similar_search = vectorstore.similarity_search(query=ex_query)
similar_search

AttributeError: 'NoneType' object has no attribute 'similarity_search'

In [None]:
# Adding LLM prompt engineering
llm_prompt = """You are a knowledgeable assistant. Based on the provided context, answer the following question succinctly and clearly.
Context: {context}
Question: {question}
Answer:
"""


In [None]:
# Creating a prompt template
prompt_template = PromptTemplate(template=llm_prompt, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": prompt_template}


In [None]:
# Instantiate llama-2 llm model
llm = CTransformers(model="model\llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={'max_new_tokens':500,
                            'temperature': 0.3}
                    )

In [None]:
# Creating question-ans obj
qa_obj = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

#### Was getting several issues with RetrievalQA.from_chain_type retriever. Issue got resolved after upgraded langchain version to 0.1.10 (any version > 0.1.0 will suffice)

In [None]:
# Function to print results cleanly (fixing the repeating statement)
def print_result(result):
    if 'result' in result:
        print("Result: ", result['result'])
    if 'source_documents' in result:
        print("\nSource Documents:")
        for doc in result['source_documents']:
            print(f"- {doc.metadata['source']}")

In [None]:
# Question answring session
# while True:
user_input = input("Ask your query related to general medicine and disease: ")
result = qa_obj.invoke({"query": user_input})
print_result(result)
# print("Result: ", result['result'])

KeyboardInterrupt: 