In [95]:
import PyPDF2
from pathlib import Path
import re

In [3]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Location of the PDF; passed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages joined back to back, with no
        separator inserted between pages.
    """
    reader = PyPDF2.PdfReader(file_path)
    # ``or ""`` lets a page whose extraction yields None (or nothing) be
    # skipped instead of breaking the concatenation; ``join`` replaces the
    # repeated ``+=`` on a growing string.
    return "".join(page.extract_text() or "" for page in reader.pages)

In [70]:
# Collect the extracted text of every PDF found anywhere under ``data/``.
# The paths are sorted so that the order of ``text`` (and of everything built
# from it) is the same on every run; ``rglob`` on its own yields files in
# whatever order the filesystem returns them.
text = [
    extract_text_from_pdf(pdf_file)
    for pdf_file in sorted(Path("data").rglob("*.pdf"))
]

In [126]:
def extract_dos(text):
    """Find the first date that follows the phrase "Date and".

    Presumably "dos" stands for date of service -- confirm with the author.

    Args:
        text: Text to search.

    Returns:
        The first date written as 1-2 digit month / 1-2 digit day / 4-digit
        year that comes after "Date and" with only non-digit characters in
        between, or ``None`` when there is no such date.
    """
    found = re.search(r"Date and[^\d]*(\d{1,2}/\d{1,2}/\d{4})", text)
    if found is None:
        return None
    return found.group(1)


def extract_clinician(text):
    """Extract the clinician's name from a "Clinician:" / "Clinician -" field.

    Args:
        text: Text to search.

    Returns:
        The value of the first such field, cut at the first line break and
        at the first comma (so trailing comma-separated parts are dropped)
        and stripped of surrounding whitespace; ``None`` if the field is
        not present.
    """
    pattern = r"Clinician\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    match = re.search(pattern, text)
    if match is None:
        return None
    # ``\s`` inside the character class also matches newlines, so the capture
    # can run on into the label of the following line. Keep the first line
    # only, the same way extract_location and extract_participants do.
    first_line = match.group(1).split("\n")[0]
    return first_line.split(",")[0].strip()


def extract_supervisor(text):
    """Extract the supervisor's name from a "Supervisor:" / "Supervisor -" field.

    Args:
        text: Text to search.

    Returns:
        The value of the first such field, cut at the first line break and
        at the first comma (so trailing comma-separated parts are dropped)
        and stripped of surrounding whitespace; ``None`` if the field is
        not present.
    """
    pattern = r"Supervisor\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    match = re.search(pattern, text)
    if match is None:
        return None
    # ``\s`` inside the character class also matches newlines, so the capture
    # can run on into the label of the following line. Keep the first line
    # only, the same way extract_location and extract_participants do.
    first_line = match.group(1).split("\n")[0]
    return first_line.split(",")[0].strip()


def extract_patient(text):
    """Extract the patient's name from a "Patient:" / "Patient -" field.

    Args:
        text: Text to search.

    Returns:
        The first line of the first such field's value, stripped of
        surrounding whitespace, or ``None`` if the field is not present.
    """
    pattern = r"Patient\s*[:\-]\s*([A-Za-z\s]+)"
    match = re.search(pattern, text)
    if match is None:
        return None
    # ``\s`` inside the character class also matches newlines, so the capture
    # can run on into the label of the following line. Keep the first line
    # only, the same way extract_location and extract_participants do.
    return match.group(1).split("\n")[0].strip()


def extract_dob(text):
    """Extract the date that follows a "DOB" label.

    Args:
        text: Text to search.

    Returns:
        The first slash-separated date (1-2 digit month and day, 2-4 digit
        year) found after "DOB", where the ":" or "-" separator is optional;
        ``None`` if no such date is found.
    """
    found = re.search(r"DOB\s*[:\-]?\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})", text)
    if not found:
        return None
    return found.group(1).strip()


def extract_duration(text):
    """Extract the length stated in a "Duration:" / "Duration -" field.

    Args:
        text: Text to search.

    Returns:
        The matched digits together with the word "minutes" (an optional
        single space between them is kept as written), or ``None`` when the
        field is missing or not expressed in minutes.
    """
    found = re.search(r"Duration\s*[:\-]\s*([0-9]+ ?minutes)", text)
    if not found:
        return None
    return found.group(1).strip()


def extract_service_code(text):
    """Extract the five-digit code from a "Service Code" field.

    Args:
        text: Text to search.

    Returns:
        The first five digits that follow "Service Code" and a ":" or "-"
        separator, as a string, or ``None`` if the field is not found.
    """
    found = re.search(r"Service Code\s*[:\-]\s*(\d{5})", text)
    if not found:
        return None
    return found.group(1).strip()


def extract_location(text):
    """Extract the value of a "Location:" / "Location -" field.

    Args:
        text: Text to search.

    Returns:
        The first line of the captured value (letters, digits, whitespace
        and hyphens), stripped of surrounding whitespace, or ``None`` if the
        field is not found.
    """
    found = re.search(r"Location\s*[:\-]\s*([A-Za-z0-9\s\-]+)", text)
    if not found:
        return None
    # The character class includes whitespace, so the capture may span
    # several lines; only the first one is the location.
    first_line = found.group(1).split("\n")[0]
    return first_line.strip()


def extract_participants(text):
    """Extract the value of a "Participants:" / "Participants -" field.

    Args:
        text: Text to search.

    Returns:
        The first line of the captured value (letters, whitespace and
        semicolons), stripped of surrounding whitespace, or ``None`` if the
        field is not found.
    """
    found = re.search(r"Participants\s*[:\-]\s*([A-Za-z\s;]+)", text)
    if not found:
        return None
    # The character class includes whitespace, so the capture may span
    # several lines; only the first one belongs to this field.
    first_line = found.group(1).split("\n")[0]
    return first_line.strip()


def extract_diagnosis(text):
    """Extract the diagnosis code, and any description after it, from a "Diagnosis" field.

    The label must be followed by a colon or a line break. The captured
    value is one capital letter and two digits, optionally a dot and more
    digits, optionally followed by a space and a run of letters, spaces and
    commas.

    Args:
        text: Text to search.

    Returns:
        The captured value stripped of surrounding whitespace, or ``None``
        if nothing matches.
    """
    diagnosis_re = re.compile(
        r"Diagnosis\s*[:\n]\s*([A-Z]\d{2}(?:\.\d+)?(?: [A-Za-z ,]+)?)",
        re.MULTILINE,
    )
    found = diagnosis_re.search(text)
    if not found:
        return None
    return found.group(1).strip()


def extract_icds(text):
    """List the distinct diagnosis-style codes that appear in ``text``.

    A code is one capital letter followed by two digits, optionally followed
    by a dot and more digits (for example ``F43.22``).

    Args:
        text: Text to scan. ``None`` or an empty string is accepted, so the
            result of a failed ``extract_diagnosis`` call can be passed
            straight in.

    Returns:
        The codes in order of first appearance, without repeats, joined with
        ", ". An empty string when ``text`` is empty/``None`` or contains no
        code.
    """
    if not text:
        # re.findall raises TypeError on None; a missing diagnosis simply
        # has no codes.
        return ""
    codes = re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)
    # dict.fromkeys drops repeats while keeping first-seen order.
    return ", ".join(dict.fromkeys(codes))


def apply_pos(location):
    """Choose the POS and modifier values for a session location.

    Args:
        location: Location text, or ``None`` / empty when the location could
            not be extracted.

    Returns:
        ``{"POS": 10, "MODIFIER": 95}`` when ``location`` contains
        "telehealth" (case-insensitive); otherwise both values are empty
        strings.
    """
    # NOTE(review): 10 and 95 are presumably the billing place-of-service code
    # and modifier used for telehealth sessions -- confirm with billing.
    # The truthiness check keeps a missing location (None) from raising
    # AttributeError on ``.lower()``.
    if location and "telehealth" in location.lower():
        return {"POS": 10, "MODIFIER": 95}
    return {"POS": "", "MODIFIER": ""}


def extract_session_info(text):
    """Build one record (dict) from the text of a single document.

    Args:
        text: Extracted text of one document.

    Returns:
        A dict with the keys "Date", "Patient", "DOB", "Service Code",
        "Diagnosis", "Clinician", "Coding", "Status", "POS", "Modifier" and
        "Comments". Extracted fields that are not found are ``None``, except
        "Diagnosis", which is an empty string when no code is found.
        "Status" is always "On Hold" and "Comments" is always empty.
    """
    # Both extractors return None when their field is missing. Fall back to
    # "" so one document without a Location or Diagnosis line does not crash
    # the whole run inside apply_pos / extract_icds.
    location = extract_location(text) or ""
    diagnosis = extract_diagnosis(text) or ""

    pos_info = apply_pos(location)
    pos, modifier = pos_info["POS"], pos_info["MODIFIER"]

    service_code = extract_service_code(text)
    # Only the codes are kept; any description after the code is dropped.
    icds = extract_icds(diagnosis)
    return {
        "Date": extract_dos(text),
        "Patient": extract_patient(text),
        "DOB": extract_dob(text),
        "Service Code": service_code,
        "Diagnosis": icds,
        "Clinician": extract_clinician(text),
        "Coding": f"{service_code}--{modifier}--{icds}",
        "Status": "On Hold",
        # "Supervisor": extract_supervisor(text),
        # "Duration": extract_duration(text),
        # "Location": location,
        # "Participants": extract_participants(text),
        "POS": pos,
        "Modifier": modifier,
        "Comments": "",
    }


In [127]:
# Parse every extracted document text into one record (dict) per document.
info = [extract_session_info(doc_text) for doc_text in text]

# Print one record per line for a quick visual check.
# NOTE(review): these records include patient names and dates of birth --
# clear this cell's output before sharing or committing the notebook.
for record in info:
    print(record)

{'Date': '11/17/2025', 'Patient': 'Benjamin Levi Boothe', 'DOB': '8/5/1987', 'Service Code': '90791', 'Diagnosis': 'F43.22', 'Clinician': 'Noelle W ade', 'Coding': '90791----F43.22', 'Status': 'On Hold', 'POS': '', 'Modifier': '', 'Comments': ''}
{'Date': '11/26/2025', 'Patient': 'Isaac Mejia', 'DOB': None, 'Service Code': '90846', 'Diagnosis': 'F43.23', 'Clinician': 'Jeffrey Hagle', 'Coding': '90846--95--F43.23', 'Status': 'On Hold', 'POS': 10, 'Modifier': 95, 'Comments': ''}
{'Date': '11/29/2025', 'Patient': 'Oliver Zawicki', 'DOB': '5/17/2014', 'Service Code': '90837', 'Diagnosis': 'F43.24', 'Clinician': 'Rebecca Gehlke Nolan', 'Coding': '90837--95--F43.24', 'Status': 'On Hold', 'POS': 10, 'Modifier': 95, 'Comments': ''}
{'Date': '12/1/2025', 'Patient': 'Shelly Renee Marshall', 'DOB': '10/21/1990', 'Service Code': '90837', 'Diagnosis': 'F32.0', 'Clinician': 'Noelle W ade', 'Coding': '90837----F32.0', 'Status': 'On Hold', 'POS': '', 'Modifier': '', 'Comments': ''}
{'Date': '11/7/2025

In [130]:
import pandas as pd

# Tabulate the per-document records: one row per record, one column per key.
df = pd.DataFrame(data=info)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Date,Patient,DOB,Service Code,Diagnosis,Clinician,Coding,Status,POS,Modifier,Comments
0,11/17/2025,Benjamin Levi Boothe,8/5/1987,90791,F43.22,Noelle W ade,90791----F43.22,On Hold,,,
1,11/26/2025,Isaac Mejia,,90846,F43.23,Jeffrey Hagle,90846--95--F43.23,On Hold,10.0,95.0,
2,11/29/2025,Oliver Zawicki,5/17/2014,90837,F43.24,Rebecca Gehlke Nolan,90837--95--F43.24,On Hold,10.0,95.0,
3,12/1/2025,Shelly Renee Marshall,10/21/1990,90837,F32.0,Noelle W ade,90837----F32.0,On Hold,,,
4,11/7/2025,Simon Ma,2/16/1977,90847,F43.23,Jeffrey Hagle,90847--95--F43.23,On Hold,10.0,95.0,
