In [10]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os

In [11]:
# Pinecone API Credentials
PINECONE_API_KEY = "pcsk_29fdB4_7hYK4c2LD1KDztd1cyuyyaSFnDTx2f3meEFy755scd77xnqF7Tt1aDg5vBttCXr"
PINECONE_API_ENV = "quickstart"

In [12]:
# Ensure the environment variables are set
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["PINECONE_ENV"] = PINECONE_API_ENV

In [13]:
# Function to load multiple PDFs
def load_pdf(directory: str):
    loader = DirectoryLoader(directory, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [14]:
# Load all PDFs in the "data" folder (make sure all books are inside this folder)
extracted_data = load_pdf("data/")

In [15]:
# Split the extracted text into chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks = text_split(extracted_data)
print("Total text chunks:", len(text_chunks))

Total text chunks: 36138


In [None]:
# Download embeddings model
def download_hugging_face_embeddings():
    embeddings =  HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embeddings()

# Initialize Pinecone
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "medical-chatbot"

In [17]:
# Check if the index exists, otherwise create it
if index_name not in pc.list_indexes().names():
    pc.create_index(name=index_name, dimension=384, metric="cosine")


In [20]:

# Connect to the index
index = pc.Index(index_name)

# Store text embeddings in Pinecone
docsearch = Pinecone.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
    namespace="medical"
)

In [21]:
# Querying Pinecone Index
docsearch = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
    namespace="medical"
)

query = "What are allergies?"
docs = docsearch.similarity_search(query=query, k=3)

In [22]:
# Display results
print("\nSearch Results for:", query)
print("-" * 50)
for i, doc in enumerate(docs, 1):
    print(f"\nResult {i}:")
    print(f"Content: {doc.page_content}")
    print(f"Source: {doc.metadata.get('source', 'Not specified')}")
    print("-" * 30)



Search Results for: What are allergies?
--------------------------------------------------

Result 1:
Content: reaction. Allergic rhinitis is characterized by an itchy,
runny nose, often with a scratchy or irritated throat due
to post-nasal drip. Inflammation of the thin membrane
covering the eye (allergic conjunctivitis) causes redness,
irritation, and increased tearing in the eyes. Asthma caus-
es wheezing, coughing, and shortness of breath. Symp-
toms of food allergies depend on the tissues most sensi-
tive to the allergen and whether the allergen spread sys-
Source: data\Medical_book.pdf
------------------------------

Result 2:
Content: reaction. Allergic rhinitis is characterized by an itchy,
runny nose, often with a scratchy or irritated throat due
to post-nasal drip. Inflammation of the thin membrane
covering the eye (allergic conjunctivitis) causes redness,
irritation, and increased tearing in the eyes. Asthma caus-
es wheezing, coughing, and shortness of breath. Symp-
toms o

In [23]:
# Custom Prompt Template
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Answer only health-related questions. If it is outside this, don't respond.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [24]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}

# Load Language Model
llm = CTransformers(
    model="llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={"max_new_tokens": 512, "temperature": 0.8}
)


In [25]:
# Create QA Chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [26]:
# Interactive Loop for Question-Answering
while True:
    user_input = input(f"Input Prompt:")
    result = qa({"query": user_input})
    print("Response:", result["result"])

  result = qa({"query": user_input})


Response: Malaria is a disease caused by a parasitic infection that can cause serious symptoms including fever, chills, headache, body aches, nausea, and fatigue. It is typically spread through the bite of an infected female mosquito. In treatment, when the risk of recurrence is less likely, antimalarial drugs are used to kill the parasite and cure the infection.


KeyboardInterrupt: Interrupted by user