In [None]:
import os
import re
from datetime import date
from pathlib import Path
from typing import Dict, List, Optional

import PyPDF2
import pypdfium2 as pdfium
import pytesseract



In [49]:
def perform_ocr_on_pdf(pdf_path: Path, dpi: int = 300, tesseract_cmd = None) -> str:
    """Rasterize every page of a PDF and run Tesseract OCR on it.

    Args:
        pdf_path: Path of the PDF file to OCR.
        dpi: Target rendering resolution in dots per inch.
        tesseract_cmd: Optional path to the Tesseract executable. When given,
            it is assigned to ``pytesseract.pytesseract.tesseract_cmd``, which
            is a module-level setting and therefore stays in effect for later
            calls as well.

    Returns:
        The OCR text of all pages, joined with a newline between pages.
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # A scale of 1 corresponds to 72 DPI. Keep the factor as a float:
    # truncating to int turned 300 DPI into 4x (288 DPI) and any DPI below 72
    # into a scale of 0.
    scale = dpi / 72

    pdf = pdfium.PdfDocument(str(pdf_path))
    try:
        text_pages = []
        for page in pdf:
            bitmap = page.render(scale=scale)
            img = bitmap.to_pil()
            # --psm 6: treat the page as a single uniform block of text.
            text_pages.append(pytesseract.image_to_string(img, config='--psm 6'))
    finally:
        # Release the document handle even if rendering or OCR fails.
        pdf.close()

    return '\n'.join(text_pages)

def read_pdf_text(pdf_path: Path, tesseract_cmd: Optional[str] = None) -> str:
    """Return the text of a PDF, falling back to OCR when it has no text layer.

    The embedded text is read with PyPDF2 first. If that yields nothing but
    whitespace, the pages are rendered and OCR'd via ``perform_ocr_on_pdf``.

    Args:
        pdf_path: Path of the PDF file to read.
        tesseract_cmd: Optional path to the Tesseract executable used for the
            OCR fallback. When omitted, the ``TESSERACT_CMD`` environment
            variable is consulted, then the original per-user install location
            if that file exists; otherwise pytesseract's own default is used.

    Returns:
        The extracted text, with a newline between pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` guards against a page yielding no text object at all.
        page_texts = [page.extract_text() or '' for page in reader.pages]

    # Separate pages with a newline (as the OCR path does) so the last line of
    # one page is not fused with the first line of the next.
    text = '\n'.join(page_texts)

    if not text.strip():
        if tesseract_cmd is None:
            legacy_cmd = r"C:\Users\moazzam\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
            tesseract_cmd = os.environ.get("TESSERACT_CMD") or (
                legacy_cmd if Path(legacy_cmd).is_file() else None
            )
        text = perform_ocr_on_pdf(pdf_path, tesseract_cmd=tesseract_cmd)

    return text

In [50]:
pdfs_text = []
# Forward slashes are accepted by pathlib on every OS; a backslash is only a
# path separator on Windows.
pdf_directory = Path("data/soap_notes")

# Sort the matches so pdfs_text[0] refers to the same file on every run
# (glob order depends on the file system).
for pdf in sorted(pdf_directory.glob("*.pdf")):
    pdf_text = read_pdf_text(pdf)
    pdfs_text.append({
        "file_name": pdf.name,
        "text": pdf_text
    })

In [51]:
# Spot-check the text extracted from the first loaded PDF.
# NOTE(review): the printed note contains patient identifiers (name, DOB) —
# clear this cell's output before sharing or committing the notebook.
print(pdfs_text[0]["text"])

2/3/26, 8:10 PM eCW (Mehmood, Mudasar )
GUTIB, Gion DOB: 04/19/2025 (9 mo M) Acc No. 23565 DOS: 1/26/2026
. ; GUTIB, Gion
: Wy children’s
ad medical group
01/26/2026 ProgressNotes: Humeraa Qamar, MD
Current Medications Reason for Appointment
Not-Taking ; 1. 9 month wellness, possible vaccines
* Mometasone Furoate 0.1% Ointment 1 2. Breastfeed every 4-5 hrs and eating puree and solids foods 2-3 times a
application Externally twice a day day
3. Well Child Examination - EPSDT - 9 months
Screening
History of Present Illness
Review of Systems Interval History:
General/Constitutional: Lives with: parents . Family support: yes, partner involved with care
Ophthulmolaeie . Concerns/Questions: none . Primary care giver: mother . Interim
“Discharge denies. Vision screen fixes and Illness: none . Sleep: sleeps through the night , () hours per night , ()
follows, parent reports no concern. hours nap time during the day , no problems reported . Sees/Hears: well -
ENT: as reported by parent , eyes st

In [None]:
# =========================
# Normalization
# =========================
def normalize_text(text: str) -> str:
    """Normalize typography and whitespace of extracted note text.

    - Unicode hyphens/dashes become ``-``; curly quotes become straight quotes
      (OCR emits e.g. U+2019, which the ASCII ``'`` in downstream patterns
      would otherwise not match).
    - Non-breaking spaces become regular spaces.
    - Runs of two or more spaces collapse to one space.
    - Runs of two or more newlines collapse to one newline.
    - Leading and trailing whitespace is stripped.

    Args:
        text: Raw text from PDF extraction or OCR.

    Returns:
        The normalized text.
    """
    typography = str.maketrans({
        "\u2010": "-",   # hyphen
        "\u2011": "-",   # non-breaking hyphen
        "\u2013": "-",   # en dash
        "\u2014": "-",   # em dash
        "\u2018": "'",   # left single quote
        "\u2019": "'",   # right single quote / apostrophe
        "\u201c": '"',   # left double quote
        "\u201d": '"',   # right double quote
        "\xa0": " ",     # non-breaking space
    })
    text = text.translate(typography)
    text = re.sub(r"[ ]{2,}", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()


# =========================
# Section Extractor
# =========================
def extract_section(text: str, start: str, stops: List[str]) -> Optional[str]:
    """Return the text between a start heading and the first stop heading.

    Matching is case-insensitive and spans line breaks.

    Args:
        text: Text to search.
        start: Heading that opens the section. It is inserted into the pattern
            as-is, i.e. it is interpreted as a regular-expression fragment.
        stops: Literal headings that end the section; the earliest one found
            after ``start`` wins. Empty strings are ignored. If no stop is
            found (or the list is empty) the section runs to the end of text.

    Returns:
        The text after ``start`` up to (not including) the stop, or ``None``
        if ``start`` does not occur or nothing follows it.
    """
    # Skip empty stops: an empty alternative matches immediately and would
    # truncate the section to a single character.
    terminators = [re.escape(stop) for stop in stops if stop]
    terminators.append("$")
    stop_pattern = "|".join(terminators)
    pattern = rf"{start}(.+?)(?:{stop_pattern})"
    m = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    return m.group(1) if m else None


# =========================
# Generic helper
# =========================
def extract_single(pattern: str, text: str) -> Optional[str]:
    """Return the stripped first capture group of the first match, else None.

    The search is case-insensitive, and ``^``/``$`` anchor at line boundaries.
    """
    found = re.search(pattern, text, flags=re.IGNORECASE | re.MULTILINE)
    if found is None:
        return None
    captured = found.group(1)
    return captured.strip()


# =========================
# Extractors
# =========================
def extract_patient_name(text: str) -> Optional[str]:
    """Return the patient name from a header line, or None if not found.

    Looks for ``LAST, First`` (optionally followed by a single-letter initial)
    at the start of a line and directly before ``DOB``. The lookup goes
    through ``extract_single``.
    """
    name_before_dob = r"^([A-Z][A-Z\-']+,\s*[A-Z][A-Za-z\-']+(?:\s+[A-Z])?)(?=\s+DOB)"
    return extract_single(name_before_dob, text)


def extract_dob(text: str) -> Optional[str]:
    """Return the date of birth following a ``DOB:`` label, or None.

    The pattern tolerates a stray ``@`` between the label and the date. The
    date must be a real calendar date (``MM/DD/YYYY``) with a year of 1900 or
    later; otherwise ``None`` is returned.

    Args:
        text: Note text to search.

    Returns:
        The date string exactly as it appears in the text, or ``None``.
    """
    dob = extract_single(
        r"DOB:\s*[@]?\s*(\d{1,2}/\d{1,2}/\d{4})",
        text
    )
    if not dob:
        return None

    # The regex guarantees three all-digit fields, so int() cannot fail here.
    month, day, year = map(int, dob.split("/"))
    if year < 1900:
        return None

    try:
        # Rejects impossible dates such as 02/31 or 13/01, which a plain
        # range check on the day and month would let through.
        date(year, month, day)
    except ValueError:
        return None

    return dob


def extract_age(text: str) -> Optional[str]:
    """Return the first age found in the text together with its unit.

    Three spellings are recognised (case-insensitively):
      * parenthesised, e.g. ``(9 mo`` -> ``'9 mo'`` (unit kept as written)
      * ``<n> Y old``               -> ``'<n> yo'``
      * ``<n> days old`` / ``d old`` -> ``'<n> <unit>'`` (unit kept as written)

    Returns None when no age is present.
    """
    age_pattern = (
        r"\((\d+)\s*(yo|yrs?|mo|wo|days?|d)\b"
        r"|(\d+)\s*Y\s*old"
        r"|(\d+)\s*(days?|d)\s*old"
    )
    hit = re.search(age_pattern, text, re.IGNORECASE)
    if hit is None:
        return None

    paren_value, paren_unit, years, day_value, day_unit = hit.groups()

    if paren_value:
        return f"{paren_value} {paren_unit}"
    if years:
        return f"{years} yo"
    if day_value:
        return f"{day_value} {day_unit}"
    return None


def extract_dos(text: str) -> Optional[str]:
    """Return the date following a ``DOS:`` label (``M/D/YYYY``), or None."""
    dos_pattern = r"DOS:\s*(\d{1,2}/\d{1,2}/\d{4})"
    return extract_single(dos_pattern, text)


# =========================
# ICDs from Assessments only
# =========================
def extract_icd_codes(text: str) -> List[str]:
    """Return the sorted, de-duplicated ICD-10 codes in the Assessments section.

    Only the text between the ``Assessments`` heading and the first of the
    stop headings below is searched, so codes mentioned elsewhere in the note
    are ignored.

    Args:
        text: Note text to search.

    Returns:
        Sorted list of unique codes; empty if the section is missing or holds
        no codes.
    """
    assessments = extract_section(
        text,
        start="Assessments",
        stops=[
            "Treatment",
            "Procedure Codes",
            "Follow Up",
            "Care Plan",
            "Electronically signed"
        ]
    )

    if not assessments:
        return []

    # Category = letter + digit + (digit | A | B), optionally followed by a
    # dot and 1-4 further characters. The leading letter includes U (e.g.
    # U07.1) and the third character may be A/B (e.g. Z3A, C7A), both of
    # which exist in ICD-10-CM.
    icds = re.findall(
        r"\b[A-Z][0-9][0-9AB](?:\.[0-9A-TV-Z]{1,4})?\b",
        assessments
    )
    return sorted(set(icds))


# =========================
# CPTs from Procedure Codes only
# =========================
def extract_cpt_codes(text: str) -> List[str]:
    """Return the sorted, unique procedure codes from the Procedure Codes section.

    A code is either five digits or one uppercase letter followed by four
    digits. Only the text between the ``Procedure Codes`` heading and the
    first stop heading is searched.
    """
    stop_headings = [
        "Follow Up",
        "Care Plan",
        "Electronically signed",
        "Generated for Printing"
    ]
    section = extract_section(text, start="Procedure Codes", stops=stop_headings)

    if not section:
        return []

    unique_codes = set(re.findall(r"\b(?:\d{5}|[A-Z]\d{4})\b", section))
    return sorted(unique_codes)


# =========================
# Consolidated
# =========================
def extract_demographics(text: str) -> Dict:
    """Normalize the note text and run every field extractor on it.

    Returns a dict with the keys ``patient_name``, ``dob``, ``age``, ``dos``,
    ``icd_codes`` and ``cpt_codes`` (in that order).
    """
    cleaned = normalize_text(text)
    field_extractors = (
        ("patient_name", extract_patient_name),
        ("dob", extract_dob),
        ("age", extract_age),
        ("dos", extract_dos),
        ("icd_codes", extract_icd_codes),
        ("cpt_codes", extract_cpt_codes),
    )
    return {field: extractor(cleaned) for field, extractor in field_extractors}


In [53]:
all_data = []

# Sort the matches so the records come out in the same order on every run
# (glob order depends on the file system).
for pdf in sorted(Path("data/soap_notes").rglob("*.pdf")):
    raw_text = read_pdf_text(pdf)
    normalized_text = normalize_text(raw_text)
    # NOTE(review): this prints the full note, including patient identifiers —
    # clear the cell output before sharing the notebook.
    print(normalized_text)
    # extract_demographics() normalizes its input itself, so hand it the raw
    # text instead of normalizing twice.
    demographic = extract_demographics(raw_text)
    all_data.append(demographic)

2/3/26, 8:10 PM eCW (Mehmood, Mudasar )
GUTIB, Gion DOB: 04/19/2025 (9 mo M) Acc No. 23565 DOS: 1/26/2026
. ; GUTIB, Gion
: Wy children’s
ad medical group
01/26/2026 ProgressNotes: Humeraa Qamar, MD
Current Medications Reason for Appointment
Not-Taking ; 1. 9 month wellness, possible vaccines
* Mometasone Furoate 0.1% Ointment 1 2. Breastfeed every 4-5 hrs and eating puree and solids foods 2-3 times a
application Externally twice a day day
3. Well Child Examination - EPSDT - 9 months
Screening
History of Present Illness
Review of Systems Interval History:
General/Constitutional: Lives with: parents . Family support: yes, partner involved with care
Ophthulmolaeie . Concerns/Questions: none . Primary care giver: mother . Interim
“Discharge denies. Vision screen fixes and Illness: none . Sleep: sleeps through the night , () hours per night , ()
follows, parent reports no concern. hours nap time during the day , no problems reported . Sees/Hears: well -
ENT: as reported by parent , eyes st

In [54]:
# Display the extracted records (bare last expression, so Jupyter renders it).
all_data

[{'patient_name': 'GUTIB, Gion',
  'dob': '04/19/2025',
  'age': '9 mo',
  'dos': '1/26/2026',
  'icd_codes': [],
  'cpt_codes': []},
 {'patient_name': 'GRIESMEYER, Gunner',
  'dob': '09/30/2025',
  'age': '16 wo',
  'dos': '01/26/2026',
  'icd_codes': ['K59.00'],
  'cpt_codes': []},
 {'patient_name': 'GIBBS, Kayleigh',
  'dob': None,
  'age': '18 yo',
  'dos': '01/26/2026',
  'icd_codes': ['Z68.53', 'Z71.3', 'Z71.82'],
  'cpt_codes': ['85018', '96127', '97802', 'G0447', 'G8510']},
 {'patient_name': 'FLORES, Melanie S',
  'dob': '03/18/2008',
  'age': '17 yo',
  'dos': '01/26/2026',
  'icd_codes': ['Z71.3'],
  'cpt_codes': ['96372']}]