In [4]:
import PyPDF2
from pathlib import Path

In [3]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    The file is opened with ``PyPDF2.PdfReader`` and each page's
    ``extract_text()`` result is appended in page order. Nothing is inserted
    between pages.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The concatenated page texts; an empty string when there are no pages.
    """
    pdf = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() for page in pdf.pages)

In [70]:
# Collect the PDFs in sorted order: rglob yields files in whatever order the
# filesystem returns them, so without sorting the positional look-ups used
# further down (text[2], info[3], ...) could point at a different document on
# another machine or after files are added.
pdf_paths = sorted(Path("data").rglob("*.pdf"))

# One extracted string per PDF; text[i] is the content of pdf_paths[i].
text = [extract_text_from_pdf(pdf_path) for pdf_path in pdf_paths]

In [13]:
# Display the raw text extracted from the third PDF.
# NOTE(review): the rendered output contains clinical note content; consider
# clearing cell outputs before sharing this notebook.
text[2]

'Diagnosis\nF43.24 Adjustment Disorder , With disturbance of conduct\nOliver ’s emotional and behavioral responses,including irritability , frustration, and outbursts, appear to be maladaptive reactions to  \nstressors, resulting in significant impairment in social and academic functioning. His conduct disturbances (e.g., emotional outbursts,  \ndifficulty following expectations, and oppositional behaviors) are not better explained by another mental disorder and have emerged in  \nthe context of adjustment to new environmental demands. Further assessment is needed to determine if the symptoms persist beyond  \nthe expected period of adjustment or if another primary diagnosis is more appropriate.\nCurrent Mental Status\nOrientation :X3: Oriented to Person, Place, and\nTime\nGeneral Appearance :Appropriate\nDress :Appropriate\nMotor Activity :Unremarkable\nInterview Behavior :Appropriate\nSpeech :Normal\nMood :Irritable\nAffect:AppropriateInsight :Good\nJudgment/Impulse Control :Good\nMe

In [None]:
import re


def extract_session_info(text):
    """Pull the session header fields out of one document's extracted text.

    Every field has its own regular expression and is looked up
    independently, so a label that is missing from ``text`` only leaves that
    one field as ``None``.

    Args:
        text: Text of a single document, e.g. as returned by
            ``extract_text_from_pdf``.

    Returns:
        dict with the keys ``Clinician``, ``Supervisor``, ``Patient``,
        ``DOB``, ``Date``, ``Duration``, ``Service Code``, ``Location``,
        ``Participants`` and ``Diagnosis``. Each value is a stripped string,
        or ``None`` when the field was not found.
    """
    # NOTE(review): a later cell defines extract_session_info again; once that
    # cell has run, its definition replaces this one.

    # Patterns
    clinician_pattern = r"Clinician\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    supervisor_pattern = r"Supervisor\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    patient_pattern = r"Patient\s*[:\-]\s*([A-Za-z\s]+)"
    # The separator after "DOB" is optional so both "DOB 1/2/1990" and
    # "DOB: 1/2/1990" are accepted.
    dob_pattern = r"DOB\s*[:\-]?\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})"
    # "T\s*ime" accepts "Time" as well as "T ime"; presumably the PDF
    # extraction can insert a space after the capital letter (TODO confirm
    # against the real documents).
    date_pattern = r"Date and T\s*ime\s*[:\-]\s*([0-9/]+ [0-9:apm\s\-–]+)"
    duration_pattern = r"Duration\s*[:\-]\s*([0-9]+ ?minutes)"
    service_code_pattern = (
        r"Service Code\s*[:\-]\s*(\d{5})"  # CPT codes are 5-digit numeric
    )
    location_pattern = r"Location\s*[:\-]\s*([A-Za-z0-9\s\-]+)"
    participants_pattern = r"Participants\s*[:\-]\s*([A-Za-z\s;]+)"
    diagnosis_pattern = r"Diagnosis\s*[:\n]\s*([A-Z]\d{2}(?:\.\d+)?(?: [A-Za-z ,]+)?)"

    def first_capture(pattern, stop_at=""):
        """Return group 1 of the first match of ``pattern``, or None.

        The capture is cut at the first occurrence of each character in
        ``stop_at`` and then stripped.
        """
        match = re.search(pattern, text)
        if match is None:
            return None
        value = match.group(1)
        for separator in stop_at:
            value = value.split(separator)[0]
        return value.strip()

    # Several character classes above contain \s, which also matches "\n", so
    # a capture can run on into the next line (e.g. "Jane Doe\nSupervisor").
    # Cutting at "\n" keeps only the value itself; cutting at "," additionally
    # drops whatever follows the first name in Clinician/Supervisor.
    return {
        "Clinician": first_capture(clinician_pattern, stop_at=",\n"),
        "Supervisor": first_capture(supervisor_pattern, stop_at=",\n"),
        "Patient": first_capture(patient_pattern, stop_at="\n"),
        "DOB": first_capture(dob_pattern),
        "Date": first_capture(date_pattern),
        "Duration": first_capture(duration_pattern),
        "Service Code": first_capture(service_code_pattern),
        "Location": first_capture(location_pattern, stop_at="\n"),
        "Participants": first_capture(participants_pattern, stop_at="\n"),
        "Diagnosis": first_capture(diagnosis_pattern),
    }


In [92]:
import re


def extract_dos(text):
    """Return the first date that follows the phrase "Date and" in ``text``.

    Any run of non-digit characters may sit between the phrase and the date.
    The date must look like ``M/D/YYYY`` (one or two digits for month and
    day, four for the year).

    Args:
        text: Text to search.

    Returns:
        The date string of the earliest match, or ``None`` when there is none.
    """
    phrase = "Date and"
    pattern = rf"{re.escape(phrase)}[^\d]*(\d{{1,2}}/\d{{1,2}}/\d{{4}})"
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(1)


def extract_session_info(text):
    """Pull the session header fields out of one document's extracted text.

    Every field except the date has its own regular expression and is looked
    up independently, so a label that is missing from ``text`` only leaves
    that one field as ``None``. The session date comes from ``extract_dos``.

    Args:
        text: Text of a single document, e.g. as returned by
            ``extract_text_from_pdf``.

    Returns:
        dict with the keys ``Clinician``, ``Supervisor``, ``Patient``,
        ``DOB``, ``Date``, ``Duration``, ``Service Code``, ``Location``,
        ``Participants`` and ``Diagnosis``. Each value is a stripped string,
        or ``None`` when the field was not found.
    """
    # Patterns
    clinician_pattern = r"Clinician\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    supervisor_pattern = r"Supervisor\s*[:\-]\s*([A-Za-z\s]+(?:, [A-Za-z\s]+)*)"
    patient_pattern = r"Patient\s*[:\-]\s*([A-Za-z\s]+)"
    dob_pattern = r"DOB\s*[:\-]?\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})"
    duration_pattern = r"Duration\s*[:\-]\s*([0-9]+ ?minutes)"
    service_code_pattern = r"Service Code\s*[:\-]\s*(\d{5})"
    location_pattern = r"Location\s*[:\-]\s*([A-Za-z0-9\s\-]+)"
    participants_pattern = r"Participants\s*[:\-]\s*([A-Za-z\s;]+)"
    diagnosis_pattern = r"Diagnosis\s*[:\n]\s*([A-Z]\d{2}(?:\.\d+)?(?: [A-Za-z ,]+)?)"

    def first_capture(pattern, stop_at=""):
        """Return group 1 of the first match of ``pattern``, or None.

        The capture is cut at the first occurrence of each character in
        ``stop_at`` and then stripped.
        """
        match = re.search(pattern, text)
        if match is None:
            return None
        value = match.group(1)
        for separator in stop_at:
            value = value.split(separator)[0]
        return value.strip()

    # Several character classes above contain \s, which also matches "\n", so
    # a capture can run on into the next line (e.g. "Sam Poe\nDOB"). Cutting
    # at "\n" keeps only the value itself; cutting at "," additionally drops
    # whatever follows the first name in Clinician/Supervisor.
    return {
        "Clinician": first_capture(clinician_pattern, stop_at=",\n"),
        "Supervisor": first_capture(supervisor_pattern, stop_at=",\n"),
        "Patient": first_capture(patient_pattern, stop_at="\n"),
        "DOB": first_capture(dob_pattern),
        # extract_dos already returns either the date string or None.
        "Date": extract_dos(text),
        "Duration": first_capture(duration_pattern),
        "Service Code": first_capture(service_code_pattern),
        "Location": first_capture(location_pattern, stop_at="\n"),
        "Participants": first_capture(participants_pattern, stop_at="\n"),
        # No re.MULTILINE here: the pattern has no ^ or $ anchors, so the flag
        # would not change what it matches.
        "Diagnosis": first_capture(diagnosis_pattern),
    }


In [93]:
# Run the field extraction on every document; info[i] holds the fields
# parsed from text[i].
info = [extract_session_info(document) for document in text]

In [94]:
# Display the fields extracted from the fourth document.
info[3]

{'Clinician': 'Noelle W ade',
 'Supervisor': 'Rebecca Gehlke Nolan',
 'Patient': 'Shelly Renee Marshall',
 'DOB': '10/21/1990',
 'Date': '12/1/2025',
 'Duration': '80 minutes',
 'Service Code': '90837',
 'Location': 'In person',
 'Participants': 'Client Only',
 'Diagnosis': 'F32.0 Major Depressive Disorder , Recurrent episode, Mild'}

In [59]:
# Display the raw text extracted from the second PDF.
# NOTE(review): the rendered output contains clinical note content; consider
# clearing cell outputs before sharing this notebook.
text[1]

"Diagnosis\nF43.23 Adjustment Disorder , With mixed anxiety and depressed mood\nThe client is experiencing significant emotional and behavioral distress related to the transition into marriage, which occurred about  \none year ago. He reports increased arguments, breakdowns in communication, and trust issues with his partner . These symptoms  \nhave intensified since the wedding and reflect dif ficulty adjusting to new relational dynamics, impacting his emotional well-being and  \nrelationship functioning\nMedications\nNone disclosed by partner.\nSubjective Report and Symptom Description\nThe client's partner attended the session alone to discuss her own experiences and concerns. She reported that the past week has been\nrough due to a back injury sustained while working out, resulting in ongoing pain and frustration about being unable to exercise. This has\nled to increased negat ive self-talk and stress about her weight and body image. She described a longstanding history of body ima