In [1]:
import os
import re
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from collections import defaultdict

In [None]:
SECTION_HEADERS = [
    "Overview", "Symptoms", "Causes", "Diagnosis", "Treatment",
    "Prevention", "Complications"
]

def split_sections(text, disease_name):

    pattern = "|".join([re.escape(sec) for sec in SECTION_HEADERS])
    matches = list(re.finditer(pattern, text, flags=re.IGNORECASE))

    sections = []
    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i+1].start() if i+1 < len(matches) else len(text)
        raw_title = matches[i].group(0).strip()

        normalized_title = next(
            (s for s in SECTION_HEADERS if s.lower() == raw_title.lower()),
            raw_title
        )

        section_text = text[start:end].strip()

        if section_text:
            sections.append(Document(
                page_content=section_text,
                metadata={"disease": disease_name, "section": normalized_title}
            ))
    return sections


In [3]:
pdf_files = {
    # "Addison's_disease.pdf": "Addison's Disease",
    "Acute_pancreatitis.pdf": "Acute Pancreatitis"
}

In [None]:
all_chunks = []

for file_name, disease_name in pdf_files.items():
    loader = PyPDFLoader(os.path.join("data", file_name))
    docs = loader.load()

    full_text = "\n".join([doc.page_content for doc in docs])
    sectioned_docs = split_sections(full_text, disease_name)

    section_counts = defaultdict(int)
    for doc in sectioned_docs:
        key = (doc.metadata["disease"], doc.metadata["section"])
        section_counts[key] += 1

    for (disease, section), count in section_counts.items():
        print(f"{disease} -> {section}: {count} chunk(s)")

    all_chunks.extend(sectioned_docs)

print(f"Created {len(all_chunks)} smart chunks")

Acute Pancreatitis -> Overview: 1 chunk(s)
Acute Pancreatitis -> Complications: 8 chunk(s)
Acute Pancreatitis -> Symptoms: 7 chunk(s)
Acute Pancreatitis -> Causes: 8 chunk(s)
Acute Pancreatitis -> Treatment: 7 chunk(s)
Acute Pancreatitis -> Diagnosis: 2 chunk(s)
Acute Pancreatitis -> Prevention: 1 chunk(s)
Created 34 smart chunks


In [5]:
print(all_chunks)

[Document(metadata={'disease': 'Acute Pancreatitis', 'section': 'Overview'}, page_content='Overview  \nAcute  pancreatitis  is  a  condition  where  the  pancreas  becomes  inflamed  (swollen)  over  a  \nshort\n \nperiod\n \nof\n \ntime.\n  The  pancreas  is  a  small  organ,  located  behind  the  stomach,  that  helps  with  digestion.   Most  people  with  acute  pancreatitis  start  to  feel  better  within  about  a  week  and  have  no  \nfurther\n \nproblems.\n \nBut\n \nsome\n \npeople\n \nwith\n \nsevere\n \nacute\n \npancreatitis\n \ncan\n \ngo\n \non\n \nto\n \ndevelop\n \nserious'), Document(metadata={'disease': 'Acute Pancreatitis', 'section': 'Complications'}, page_content='complications.\n  Acute  pancreatitis  is  different  to  chronic  pancreatitis,  where  the  pancreas  has  become  \npermanently\n \ndamaged\n \nfrom\n \ninflammation\n \nover\n \nmany\n \nyears.'), Document(metadata={'disease': 'Acute Pancreatitis', 'section': 'Symptoms'}, page_content='Symptoms  o