In [4]:
import os
# Step up one directory so the relative "data" path used later resolves
# (presumably the project root — TODO confirm the repository layout).
# Guarded so that re-running this cell does not climb one more directory each
# time: once a "data" folder is visible from the current working directory,
# no further change of directory is made.
if not os.path.isdir("data"):
    os.chdir("../")

In [6]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
# Extract text from pdf
def load_pdf_files(file_path):
    """Load the PDF files found in a directory.

    Args:
        file_path: Directory passed to ``DirectoryLoader``; files matching
            the ``*.pdf`` glob are read with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    pdf_directory_loader = DirectoryLoader(file_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_directory_loader.load()

In [8]:
# Read every PDF under the relative "data" directory.
extracted_data = load_pdf_files(file_path="data")

In [11]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` whose metadata holds only the ``source`` key.

    Args:
        docs: Documents to reduce.

    Returns:
        A new list, in the same order, where each document keeps its
        ``page_content`` and a metadata dict of the form ``{"source": ...}``.
        The value is ``None`` when the original metadata has no ``source``.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [12]:
# Drop all metadata except the source path before chunking.
minimal_docs = filter_to_minimal_docs(docs=extracted_data)

In [19]:
# Split the document into smaller chunks

def text_split(minimal_docs, chunk_size = 500, chunk_overlap = 20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            500, the value this notebook previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 20,
            the value this notebook previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len
    )
    return text_splitter.split_documents(minimal_docs)

In [21]:
# Chunk the reduced documents and report how many pieces were produced.
text_chunk = text_split(minimal_docs=minimal_docs)
print(f"Number of chunks: {len(text_chunk)}")

Number of chunks: 8709


In [25]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Create the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Instantiate the embedding model once; later cells reuse this object.
embedding = download_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
# Embed a sample string and print the vector length; the recorded output is
# 384, the same value used as the index dimension further down.
vector = embedding.embed_query(text="hello world")
print(len(vector))

384


In [30]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [67]:
# Read both API keys from the process environment (load_dotenv() was called
# in the previous cell).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail early with a readable message. Without this check a missing key is
# None, and assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType". Empty values are treated as missing.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

# Re-export the keys into the environment, as the original cell did.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


In [68]:
from pinecone import Pinecone
# Keep a lowercase alias of the key, then build the Pinecone client with it.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [69]:
# Display the client object as a quick check that it was constructed.
pc

<pinecone.pinecone.Pinecone at 0x25ee4d25c60>

In [70]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Create the index only when it does not exist yet.
index_exists = pc.has_index(index_name)
if not index_exists:
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of embeddings
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, whether it was just created or already present.
index = pc.Index(index_name)

In [41]:
from langchain_pinecone import PineconeVectorStore

# Build a vector store from the chunks using the embedding model and the
# index named above.
# NOTE(review): re-running this cell presumably uploads the chunks again —
# confirm before re-executing against a populated index.
docsearch = PineconeVectorStore.from_documents(text_chunk, embedding, index_name=index_name)

In [71]:
# load existing index

from langchain_pinecone import PineconeVectorStore

# Attach to the existing index instead of rebuilding the vector store from documents.
docsearch = PineconeVectorStore.from_existing_index(embedding=embedding, index_name=index_name)

In [None]:
# Add more data to the existing Pinecone index (left disabled; uncomment to use).
# dswith = Document(
#     page_content = "",
#     metadata = {"source": ""}
# )

# docsearch.add_documents(documents=[dswith])

In [44]:
# Similarity-search retriever configured with k = 3.
retriever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [47]:
# Sanity-check retrieval on a sample question and display the matches.
retrieved_docs = retriever.invoke(input="What is acne?")
retrieved_docs

[Document(id='a7a5945a-8180-4c8c-b3b5-4830a4def713', metadata={'source': 'data\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='rare disorder characterized by an abnormal increase in\nthe number of mature red cells in the blood.\nGALE ENCYCLOPEDIA OF MEDICINE 2 2981\nSecondary polycythemia\nThis young boy is afflicted with seborrheic dermatitis.(Cus-\ntom Medical Stock Photo. Reproduced by permission.)\nKEY TERMS\nAcne—A chronic inflammation of the sebaceous\nglands that manifests as blackheads, whiteheads,\nand/or pustules on the face or trunk.\nPsoriasis—A skin disorder of chronic, itchy scaling'),
 Document(id='1a514816-eb3a-409b-8791-ddb2d155f3f6', metadata={'source': 'data\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='matologist will attempt to rule out a number of other dis-\neases that have similar symptoms. Acne vulgaris is per-\nhaps the disorder most commonly mistaken for rosacea,\nbut redness and spider-like veins are not observed in\npatie

In [72]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used as the answer generator in the RAG chain.
chatModel = ChatGoogleGenerativeAI(google_api_key=GOOGLE_API_KEY, model="models/gemini-2.0-flash")


In [49]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [50]:
# System instructions for the answering model. The pieces are adjacent string
# literals that Python joins with no separator, so every fragment that
# continues a sentence must end with an explicit trailing space. Without it
# the prompt reads "tasks.Use", "answerthe question" and "youdon't know".
# "{context}" is left as a template placeholder.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions, then the user's question.
chat_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [73]:
# Combine the chat model and prompt into a document-stuffing chain, then pair
# it with the retriever to form the retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm=chatModel, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [74]:
# Ask the chain a definition question and print only the generated answer.
response = rag_chain.invoke(input={"input": "what is Acne?"})
print(response["answer"])

Acne is a chronic inflammation of the sebaceous glands. It manifests as blackheads, whiteheads, and/or pustules on the face or trunk. Acne vulgaris is commonly mistaken for rosacea.


In [75]:
# Ask the chain a treatment question and print only the generated answer.
response = rag_chain.invoke(input={"input": "what is the Treatment of Acne?"})
print(response["answer"])

Acne is treated with antibiotics, antiandrogens, and other drugs such as retinoic acids (vitamin A compounds). Topical vitamin A derivatives may also have a role in the treatment of rosacea. Isotretinoin, a powerful vitamin A derivative, is also used in the treatment of acne.


In [76]:
# Ask the chain about a different condition and print only the generated answer.
response = rag_chain.invoke(input={"input": "what is  supranuclear palsy?"})
print(response["answer"])

Progressive supranuclear palsy is a disease that affects middle-aged individuals, typically starting in their 60s. It is characterized by the loss of nerve cells, leading to palsy or paralysis that worsens over time. This condition impacts eye movement, muscle relaxation, and balance control.
