In [95]:
import PyPDF2
from pathlib import Path
import re

In [3]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path (or anything ``PyPDF2.PdfReader`` accepts) of the PDF.

    Returns:
        One string holding each page's extracted text back to back; no
        separator is inserted between pages.
    """
    pdf = PyPDF2.PdfReader(file_path)
    page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)

In [70]:
# Extract the text of every PDF found anywhere under data/.
# The paths are sorted because rglob does not promise a particular order;
# without this, positional lookups such as text[2] could point at a different
# document on another machine or after files are added.
text = [
    extract_text_from_pdf(pdf_file)
    for pdf_file in sorted(Path("data").rglob("*.pdf"))
]

In [13]:
# Display the extracted text of the third entry in `text` (index 2).
text[2]

'Diagnosis\nF43.24 Adjustment Disorder , With disturbance of conduct\nOliver ’s emotional and behavioral responses,including irritability , frustration, and outbursts, appear to be maladaptive reactions to  \nstressors, resulting in significant impairment in social and academic functioning. His conduct disturbances (e.g., emotional outbursts,  \ndifficulty following expectations, and oppositional behaviors) are not better explained by another mental disorder and have emerged in  \nthe context of adjustment to new environmental demands. Further assessment is needed to determine if the symptoms persist beyond  \nthe expected period of adjustment or if another primary diagnosis is more appropriate.\nCurrent Mental Status\nOrientation :X3: Oriented to Person, Place, and\nTime\nGeneral Appearance :Appropriate\nDress :Appropriate\nMotor Activity :Unremarkable\nInterview Behavior :Appropriate\nSpeech :Normal\nMood :Irritable\nAffect:AppropriateInsight :Good\nJudgment/Impulse Control :Good\nMe

In [96]:
def extract_dos(text, phrase="Date and"):
    """Return the first date that follows ``phrase`` in ``text``.

    The date must look like ``M/D/YYYY`` (1-2 digit month and day, 4-digit
    year). Any run of non-digit characters, newlines included, may sit
    between the phrase and the date.

    Args:
        text: Text to search.
        phrase: Literal label that precedes the date. It is escaped before
            being placed in the regex, so metacharacters in it are safe.

    Returns:
        The date as a string, or None when no date follows the phrase.
    """
    pattern = rf"{re.escape(phrase)}[^\d]*(\d{{1,2}}/\d{{1,2}}/\d{{4}})"
    # Only the first hit is wanted, so stop there rather than scanning the
    # whole document for every occurrence.
    match = re.search(pattern, text)
    return match.group(1) if match else None


def extract_session_info(text):
    """Pull the session header fields out of one document's extracted text.

    Each field is located with its own regex (label, then ``:`` or ``-``,
    then the value). The date of service is delegated to ``extract_dos``.

    Args:
        text: Extracted text of a single document.

    Returns:
        A dict with the keys "Clinician", "Supervisor", "Patient", "DOB",
        "Date", "Duration", "Service Code", "Location", "Participants" and
        "Diagnosis" (in that order). A field that is not found is None.
    """
    field_patterns = {
        "Clinician": r"Clinician\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)",
        "Supervisor": r"Supervisor\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)",
        "Patient": r"Patient\s*[:\-]\s*([A-Za-z\s]+)",
        "DOB": r"DOB\s*[:\-]?\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})",
        "Duration": r"Duration\s*[:\-]\s*([0-9]+ ?minutes)",
        "Service Code": r"Service Code\s*[:\-]\s*(\d{5})",
        "Location": r"Location\s*[:\-]\s*([A-Za-z0-9\s\-]+)",
        "Participants": r"Participants\s*[:\-]\s*([A-Za-z\s;]+)",
        "Diagnosis": r"Diagnosis\s*[:\n]\s*([A-Z]\d{2}(?:\.\d+)?(?: [A-Za-z ,]+)?)",
    }
    # These values may be a comma-separated list; only the first item is kept.
    first_item_only = ("Clinician", "Supervisor")

    def capture(label):
        """Return the cleaned value for ``label``, or None if it is absent."""
        found = re.search(field_patterns[label], text)
        if found is None:
            return None
        value = found.group(1)
        if label in first_item_only:
            value = value.split(",")[0]
        # `\s` inside the character classes also matches newlines, so a
        # capture can run on into the next line's label (e.g. "Name\nDOB").
        # Keeping only the first line prevents that for every field.
        return value.split("\n")[0].strip()

    return {
        "Clinician": capture("Clinician"),
        "Supervisor": capture("Supervisor"),
        "Patient": capture("Patient"),
        "DOB": capture("DOB"),
        "Date": extract_dos(text) or None,
        "Duration": capture("Duration"),
        "Service Code": capture("Service Code"),
        "Location": capture("Location"),
        "Participants": capture("Participants"),
        "Diagnosis": capture("Diagnosis"),
    }


In [100]:
# One dict of extracted session fields per document, in the same order as `text`.
info = [extract_session_info(t) for t in text]

In [102]:
# Display the fields extracted from the first entry in `text`.
info[0]

{'Clinician': 'Noelle W ade',
 'Supervisor': 'Rebecca Gehlke Nolan',
 'Patient': 'Benjamin Levi Boothe',
 'DOB': '8/5/1987',
 'Date': '11/17/2025',
 'Duration': '60 minutes',
 'Service Code': '90791',
 'Location': 'In person',
 'Participants': 'Client only',
 'Diagnosis': 'F43.22 Adjustment Disorder, With anxiety'}

In [99]:
import re


def extract_dos(text):
    """Return the first M/D/YYYY date that follows "Date and", or None.

    Any run of non-digit characters may separate the phrase from the date.
    """
    found = re.search(r"Date and[^\d]*(\d{1,2}/\d{1,2}/\d{4})", text)
    if found is None:
        return None
    return found.group(1)


def extract_clinician(text):
    """Return the clinician's name from ``text``, or None if there is no label.

    Looks for "Clinician" followed by ``:`` or ``-``. When the value is a
    comma-separated list, only the first item is returned.
    """
    pattern = r"Clinician\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    match = re.search(pattern, text)
    if match is None:
        return None
    # `\s` in the character class also matches newlines, so the capture can
    # run on into the next line's label; keep only the first line.
    first_line = match.group(1).split("\n")[0]
    return first_line.split(",")[0].strip()


def extract_supervisor(text):
    """Return the supervisor's name from ``text``, or None if there is no label.

    Looks for "Supervisor" followed by ``:`` or ``-``. When the value is a
    comma-separated list, only the first item is returned.
    """
    pattern = r"Supervisor\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    match = re.search(pattern, text)
    if match is None:
        return None
    # `\s` in the character class also matches newlines, so the capture can
    # run on into the next line's label; keep only the first line.
    first_line = match.group(1).split("\n")[0]
    return first_line.split(",")[0].strip()


def extract_patient(text):
    """Return the patient's name from ``text``, or None if there is no label.

    Looks for "Patient" followed by ``:`` or ``-`` and returns the letters
    and spaces that follow on the same line.
    """
    pattern = r"Patient\s*[:\-]\s*([A-Za-z\s]+)"
    match = re.search(pattern, text)
    if match is None:
        return None
    # `\s` in the character class also matches newlines, so the capture can
    # run on into the next line's label (e.g. "Name\nDOB"); keep only the
    # first line.
    return match.group(1).split("\n")[0].strip()


def extract_dob(text):
    """Return the date that follows "DOB" in ``text``, or None.

    The ``:`` / ``-`` separator after the label is optional; the date is
    M/D/Y with a 2-4 digit year.
    """
    found = re.search(r"DOB\s*[:\-]?\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})", text)
    if not found:
        return None
    return found.group(1).strip()


def extract_duration(text):
    """Return the "<N> minutes" value that follows "Duration", or None."""
    found = re.search(r"Duration\s*[:\-]\s*([0-9]+ ?minutes)", text)
    if not found:
        return None
    return found.group(1).strip()


def extract_service_code(text):
    """Return the five digits that follow "Service Code", or None."""
    found = re.search(r"Service Code\s*[:\-]\s*(\d{5})", text)
    if not found:
        return None
    return found.group(1).strip()


def extract_location(text):
    """Return the first line of the value that follows "Location", or None.

    The value may contain letters, digits, whitespace and hyphens.
    """
    found = re.search(r"Location\s*[:\-]\s*([A-Za-z0-9\s\-]+)", text)
    if found is None:
        return None
    # The capture may span several lines; only the first one is the value.
    first_line, _, _ = found.group(1).partition("\n")
    return first_line.strip()


def extract_participants(text):
    """Return the first line of the value that follows "Participants", or None.

    The value may contain letters, whitespace and semicolons.
    """
    found = re.search(r"Participants\s*[:\-]\s*([A-Za-z\s;]+)", text)
    if found is None:
        return None
    # The capture may span several lines; only the first one is the value.
    first_line, _, _ = found.group(1).partition("\n")
    return first_line.strip()


def extract_diagnosis(text):
    """Return the diagnosis code and description after "Diagnosis", or None.

    The label must be followed by ``:`` or a newline. The value is a code of
    one capital letter and two digits with an optional ``.digits`` suffix,
    optionally followed by a description of letters, spaces and commas.
    """
    found = re.search(
        r"Diagnosis\s*[:\n]\s*([A-Z]\d{2}(?:\.\d+)?(?: [A-Za-z ,]+)?)", text
    )
    if not found:
        return None
    return found.group(1).strip()


def extract_session_info(text):
    """Run every field extractor over ``text`` and collect the results.

    Returns:
        A dict mapping each field label to whatever its extractor returned,
        in the order the labels are listed below.
    """
    extractors = (
        ("Clinician", extract_clinician),
        ("Supervisor", extract_supervisor),
        ("Patient", extract_patient),
        ("DOB", extract_dob),
        ("Date", extract_dos),
        ("Duration", extract_duration),
        ("Service Code", extract_service_code),
        ("Location", extract_location),
        ("Participants", extract_participants),
        ("Diagnosis", extract_diagnosis),
    )
    return {label: extractor(text) for label, extractor in extractors}
