### Importing Required Libraries

In [None]:
import os
import re
from datetime import datetime

import pandas as pd
from langchain_community.document_loaders import PyPDFLoader


In [20]:
# Path to one sample SOAP note. Built with os.path.join so the literal holds
# no backslash (the original "\A" is an invalid escape sequence) and the
# separator matches the host OS.
soap_note = os.path.join("SOAP notes", "Andrew Mallette.pdf")

In [None]:
def load_pdf(file_path):
    """Load a PDF through PyPDFLoader and return its text as one string.

    The ``page_content`` of every item returned by ``loader.load()`` is
    concatenated, in the order returned, with a newline between items.
    """
    pages = PyPDFLoader(file_path).load()
    return "\n".join(page.page_content for page in pages)

In [None]:
def extract_icd10_from_assessment(text: str) -> list:
    """
    Return the ICD-10 codes found in the Assessment section of a note.

    The Assessment section starts after a line containing only the word
    "Assessment" (case-insensitive) and runs up to the next line that begins
    with one of the known following headings (Plan, Orders, Medications, ...).
    If no following heading is found, the section runs to the end of the
    text; if there is no Assessment heading at all, the whole text is scanned.

    Only codes written inside ``[ICD-10: ...]`` brackets are considered.
    Codes are upper-cased and de-duplicated, keeping first-seen order.

    Args:
        text: Full plain text of the note.

    Returns:
        List of code strings, e.g. ``["F29", "F41.1"]``; empty if none found.
    """
    scope = text
    start_match = re.search(r'(?mi)^\s*Assessment\s*$', text)
    if start_match:
        # Narrow to everything after the heading, then cut at the next heading.
        scope = text[start_match.end():]
        end_match = re.search(
            r'(?mi)^\s*(Plan|Orders|Medications(?:\s+attached.*)?|Screenings/.*|Observations|Quality of care|Care plan)\b.*$',
            scope
        )
        if end_match:
            scope = scope[:end_match.start()]

    bracket_chunks = re.findall(r'\[ICD-10:\s*([^\]]+)\]', scope, flags=re.IGNORECASE)

    # ICD-10-CM shape: letter, digit, then an alphanumeric third character
    # (e.g. M1A, C4A, Z3A), optionally followed by "." and 1-4 alphanumerics.
    # An optional fourth character before the dot is still tolerated, as in
    # the previous pattern.  "U" is accepted as a first letter because U07.1
    # and U09.9 are assigned codes.  The pattern is only applied to the
    # contents of [ICD-10: ...] brackets, so the looser shape does not pick up
    # ordinary prose.
    code_pattern = re.compile(
        r'\b[A-Z][0-9][A-Z0-9]{1,2}(?:\.[A-Z0-9]{1,4})?\b',
        re.IGNORECASE,
    )

    codes = []
    seen = set()
    for chunk in bracket_chunks:
        for code in code_pattern.findall(chunk):
            code = code.upper()
            if code not in seen:
                seen.add(code)
                codes.append(code)

    return codes


In [None]:
def extract_patient_info(text: str) -> dict:
    """
    Pull patient/insurance fields out of the plain text of a SOAP note.

    Each field is best-effort: a key is only added when its pattern matches
    (and, for the date of service, when the matched date can be parsed).
    "ICD Codes" is always present and is a comma-separated string, possibly
    empty.

    Args:
        text: Full plain text of the note.

    Returns:
        dict with any of the keys "Patient Name", "DOB", "DOS", "Member ID",
        "Insurance", plus "ICD Codes".
    """
    data = {}

    # --- Extract Name ---
    # Preferred form: "Patient: First Last" followed by DOB/PRN or end of text.
    name_match = re.search(
        r"Patient[:\s]+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)+)(?=\s+DOB|\s+PRN|\s*$)",
        text
    )
    if name_match:
        data["Patient Name"] = name_match.group(1)
    else:
        # Fallback: separate FIRST NAME / LAST NAME fields; both must be present.
        first = re.search(r"FIRST NAME\s+([A-Za-z]+)", text)
        last = re.search(r"LAST NAME\s+([A-Za-z]+)", text)
        if first and last:
            data["Patient Name"] = f"{first.group(1)} {last.group(1)}"

    # --- Extract DOB ---
    # Stored exactly as written in the note (no reformatting).
    dob_match = re.search(r"(?:DOB|DATE OF BIRTH)[:\s]+(\d{2}/\d{2}/\d{4})", text)
    if dob_match:
        data["DOB"] = dob_match.group(1)

    # --- Extract DOS (Date of Service) ---
    dos_match = re.search(r"Date of service[:\s]+(\d{2}/\d{2}/\d{2,4})", text, re.IGNORECASE)
    if dos_match:
        raw_dos = dos_match.group(1)
        # Normalize to DD-MM-YY. Try the 2-digit-year form first (08/26/25),
        # then the 4-digit-year form (08/26/2025). A value that parses with
        # neither (3-digit year, impossible month/day) is skipped, like any
        # other field that cannot be extracted, instead of raising and
        # aborting the whole run.
        for fmt in ("%m/%d/%y", "%m/%d/%Y"):
            try:
                dt = datetime.strptime(raw_dos, fmt)
            except ValueError:
                continue
            data["DOS"] = dt.strftime("%d-%m-%y")
            break

    # --- Extract Member ID ---
    member_id_match = re.search(r"INSURED ID NUMBER\s+([A-Z0-9]+)", text)
    if member_id_match:
        data["Member ID"] = member_id_match.group(1)

    # --- Extract Insurance Provider ---
    payer_match = re.search(r"PAYER\s+([A-Za-z0-9& ]+)", text)
    if payer_match:
        data["Insurance"] = payer_match.group(1).strip()

    # --- Extract ICD-10 codes (Assessment section) ---
    data["ICD Codes"] = ", ".join(extract_icd10_from_assessment(text))
    return data


In [64]:
# Build one record per PDF in the SOAP-notes folder and tabulate the results.
# The folder variable is named soap_dir so the builtin dir() is not shadowed.
soap_dir = "SOAP notes"
phi_dict = []  # despite the name, a list of per-note dicts
# os.listdir returns entries in arbitrary order; sort so the row order of the
# resulting table is reproducible between runs and machines.
for file_name in sorted(os.listdir(soap_dir)):
    if file_name.endswith(".pdf"):
        soap_note = os.path.join(soap_dir, file_name)
        text_content = load_pdf(soap_note)
        phi = extract_patient_info(text_content)
        phi_dict.append(phi)

phi_df = pd.DataFrame(phi_dict)
# Same facility for every row; placed as the first column.
phi_df.insert(0, "Facility Name", "Cognitive Works")
phi_df

Unnamed: 0,Facility Name,Patient Name,DOB,DOS,Member ID,Insurance,ICD Codes
0,Cognitive Works,Andrew Mallette,05/19/1980,26-08-25,908250976,United Healthcare,"F29, F41.1"
1,Cognitive Works,Anu Kaul,12/12/1961,26-08-25,W226343414,Aetna,"F31.81, N18.9, M10.9"
2,Cognitive Works,Brandon Beller,11/23/1993,26-08-25,W287990420,Aetna,"F42.9, F41.1"
3,Cognitive Works,Cheyenne Rae Tretter,09/14/1996,26-08-25,W261712616,Aetna,"F31.81, F10.21, F12.21"
4,Cognitive Works,Jelena Mihailovic,06/07/1994,26-08-25,W287373691,Aetna,"F33.9, F41.1"
5,Cognitive Works,Mikaela Wolfkamp,12/12/1982,26-08-25,931282160,United Healthcare,F33.9
