In [1]:
import os
import re
from collections import defaultdict

# NOTE: the langchain_community / langchain_openai imports further down rebind
# PyPDFLoader and OpenAIEmbeddings, so they must stay after the legacy
# langchain.* imports of the same names.
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

In [2]:
# Project settings come from the local config module.
from config import (
    OPENAI_API_KEY,
    EMBEDDING_MODEL,
    LLM_MODEL,
    FAISS_DIR,
    PDF_DIR,
)

# Publish the key through the process environment
# (presumably where the OpenAI-backed clients created below look it up -- confirm).
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [3]:
# Canonical section titles; header matches are normalized to this spelling.
SECTION_HEADERS = [
    "Overview", "Symptoms", "Causes", "Diagnosis", "Treatment",
    "Prevention", "Complications"
]


def split_sections(text, disease_name):
    """Split extracted document text into one Document per section.

    A section starts at a header keyword from SECTION_HEADERS (matched
    case-insensitively) and runs up to the next header or the end of the text.
    Text that precedes the first header is not returned.

    NOTE: a later cell defines ``split_sections`` again; after a top-to-bottom
    run that later definition is the one in effect.

    Args:
        text: Full text of one document.
        disease_name: Name stored in each chunk's metadata; also used to
            build the ``source`` value ``"<disease_name>.pdf"``.

    Returns:
        A list of Document objects, in document order, whose metadata holds
        ``disease``, ``section`` (canonical header spelling) and ``source``.
        Empty when no header keyword occurs in ``text``.
    """
    alternatives = "|".join(re.escape(header) for header in SECTION_HEADERS)
    flags = re.IGNORECASE | re.MULTILINE

    # Prefer keywords that open a line: words such as "causes" or "treatment"
    # also occur inside ordinary sentences, and treating those as headers
    # fragments a section and files the remainder under the wrong title.
    matches = list(re.finditer(rf"^[ \t]*({alternatives})", text, flags))
    if not matches:
        # No header starts a line (e.g. the extraction flattened the layout):
        # fall back to matching the keywords anywhere so the document is
        # still split instead of being dropped.
        matches = list(re.finditer(rf"({alternatives})", text, flags))

    canonical_titles = {header.lower(): header for header in SECTION_HEADERS}
    # Each section ends where the next header starts; the last one runs to EOF.
    section_ends = [match.start() for match in matches[1:]] + [len(text)]

    sections = []
    for match, end in zip(matches, section_ends):
        section_text = text[match.start():end].strip()
        if not section_text:
            continue
        raw_title = match.group(1)
        sections.append(Document(
            page_content=section_text,
            metadata={
                "disease": disease_name,
                "section": canonical_titles.get(raw_title.lower(), raw_title),
                # Kept in line with the later definition of this function,
                # which also records the originating PDF name.
                "source": f"{disease_name}.pdf",
            },
        ))
    return sections

In [4]:
def deduplicate_chunks(docs):
    """Return the chunks with repeated text removed, keeping first occurrences.

    Two chunks count as duplicates when their ``page_content`` is equal after
    stripping surrounding whitespace; metadata is not compared. The relative
    order of the surviving chunks is unchanged.
    """
    first_by_text = {}
    for chunk in docs:
        # setdefault only stores the chunk the first time its text is seen;
        # dicts keep insertion order, so first-seen order is preserved.
        first_by_text.setdefault(chunk.page_content.strip(), chunk)
    return list(first_by_text.values())

In [5]:
def split_sections(text, disease_name):
    """Split extracted document text into one Document per section.

    A section starts at a header keyword from SECTION_HEADERS (matched
    case-insensitively) and runs up to the next header or the end of the text.
    Text that precedes the first header is not returned.

    Args:
        text: Full text of one document.
        disease_name: Name stored in each chunk's metadata; also used to
            build the ``source`` value ``"<disease_name>.pdf"``.

    Returns:
        A list of Document objects, in document order, whose metadata holds
        ``disease``, ``section`` (canonical header spelling) and ``source``.
        Empty when no header keyword occurs in ``text``.
    """
    alternatives = "|".join(re.escape(header) for header in SECTION_HEADERS)
    flags = re.IGNORECASE | re.MULTILINE

    # Prefer keywords that open a line: words such as "causes" or "treatment"
    # also occur inside ordinary sentences, and treating those as headers
    # fragments a section and files the remainder under the wrong title.
    matches = list(re.finditer(rf"^[ \t]*({alternatives})", text, flags))
    if not matches:
        # No header starts a line (e.g. the extraction flattened the layout):
        # fall back to matching the keywords anywhere so the document is
        # still split instead of being dropped.
        matches = list(re.finditer(rf"({alternatives})", text, flags))

    canonical_titles = {header.lower(): header for header in SECTION_HEADERS}
    # Each section ends where the next header starts; the last one runs to EOF.
    section_ends = [match.start() for match in matches[1:]] + [len(text)]

    sections = []
    for match, end in zip(matches, section_ends):
        section_text = text[match.start():end].strip()
        if not section_text:
            continue
        raw_title = match.group(1)
        sections.append(Document(
            page_content=section_text,
            metadata={
                "disease": disease_name,
                "section": canonical_titles.get(raw_title.lower(), raw_title),
                "source": f"{disease_name}.pdf",
            },
        ))
    return sections


In [6]:
def load_and_chunk_pdfs(pdf_folder="data"):
    """Load every PDF in a folder and return its de-duplicated section chunks.

    For each PDF the page texts are joined with newlines, passed to
    ``split_sections`` together with a disease name derived from the file
    name (extension removed, underscores turned into spaces), and the
    resulting chunks are collected. The combined list is finally passed
    through ``deduplicate_chunks``.

    Args:
        pdf_folder: Directory to scan (not recursive). Defaults to ``"data"``.

    Returns:
        The list returned by ``deduplicate_chunks`` for all collected chunks.

    Raises:
        OSError: If ``pdf_folder`` cannot be listed (raised by ``os.listdir``).
    """
    all_chunks = []
    # os.listdir returns entries in arbitrary order; sorting makes the chunk
    # order (and therefore which duplicate survives) reproducible.
    for file_name in sorted(os.listdir(pdf_folder)):
        stem, extension = os.path.splitext(file_name)
        pdf_path = os.path.join(pdf_folder, file_name)
        # Accept ".PDF" as well as ".pdf", and ignore directories that merely
        # carry a PDF-looking name.
        if extension.lower() != ".pdf" or not os.path.isfile(pdf_path):
            continue

        # Only the real extension is removed, so a ".pdf" appearing in the
        # middle of a file name stays part of the disease name.
        disease_name = stem.replace("_", " ")
        pages = PyPDFLoader(pdf_path).load()
        full_text = "\n".join(page.page_content for page in pages)
        all_chunks.extend(split_sections(full_text, disease_name))

    return deduplicate_chunks(all_chunks)

In [None]:
# Embedding client configured with the model name from config; the next cell
# passes it to FAISS both when building the index and when loading it.
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [9]:
# The FAISS index is cached on disk under FAISS_DIR: it is built from the PDFs
# only when no saved index exists, otherwise the saved one is reused.
index_path = os.path.join(FAISS_DIR, "index.faiss")

if not os.path.exists(index_path):
    print("Building FAISS index...")
    # Read the PDFs from the configured directory rather than the function's
    # hard-coded default folder.
    all_chunks = load_and_chunk_pdfs(PDF_DIR)
    if not all_chunks:
        # Fail with a clear message instead of handing FAISS an empty list.
        raise ValueError(
            f"No section chunks were extracted from the PDFs in {PDF_DIR!r}; "
            "nothing to index."
        )
    vectorstore = FAISS.from_documents(all_chunks, embedding_model)
    vectorstore.save_local(FAISS_DIR)
else:
    print("Loading existing FAISS index...")
    # SECURITY: allow_dangerous_deserialization=True opts in to loading
    # serialized (pickled) data. That is acceptable only because FAISS_DIR is
    # written by save_local in this same cell -- never point it at files from
    # an untrusted source.
    vectorstore = FAISS.load_local(
        FAISS_DIR,
        embedding_model,
        allow_dangerous_deserialization=True
    )

Loading existing FAISS index...


In [10]:
# Chat model used to answer questions; model name comes from config.
llm = ChatOpenAI(
    model=LLM_MODEL,
    temperature=0
)

# RetrievalQAWithSourcesChain is imported in the notebook's top import cell.
# The chain answers from chunks fetched by the vector store's retriever and
# returns both an "answer" and a "sources" entry.
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

query = "What causes acute pancreatis? only give cause and no discription include minor causes as well"
response = qa_chain.invoke({"question": query})
print(response["answer"])
print("Sources:", response["sources"])


Causes of acute pancreatitis include gallstones, alcohol consumption, high blood fat levels (hypertriglyceridaemia), accidental damage or injury to the pancreas, viruses like mumps or measles, high blood calcium levels (hypercalcaemia), autoimmune reactions, and certain medications. Minor causes also include family history of pancreatitis and smoking.


Sources: Acute pancreatitis.pdf
