In [29]:
from langchain_community.document_loaders import PyPDFLoader
import re
from sentence_transformers import SentenceTransformer
from typing import List
import numpy as np
import json

In [9]:
DIR = "SOAP_notes"

In [13]:
def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    
    full_text = "\n".join([doc.page_content for doc in documents])
    return full_text


text_content = load_pdf(f"{DIR}/68750.pdf")
print(text_content)

PULLEN, Caedyn DOB: 07/05/2007 (17 yo M) Acc No. 28536 DOS: 06/30/2025
 
Patient: PULLEN, Caedyn
Account Number: 28536 Provider: Rahman Uddin, MD
DOB: 07/05/2007   Age: 17 Y   Sex: Male Date: 06/30/2025
Phone: 832-893-3970
Address: 3825 YOUPON DR, LA PORTE, TX-77571
Subjective:
Chief Complaints:
   1. Well & Sick: HURT SHOULDER left side, 3 weeks ago while boxing. 2. Well Child Examination - EPSDT - 15 to
17 years (Male):NM. 3. Immunization follow-up.
HPI:
   Interval History: 
       Lives with: parents . 
       Family support: yes, partner involved with care . 
       Primary care giver: mother . 
       Interim Illness: none . 
       Accidents: none . 
       Sleep: sleeps through the night , ( ) hours per night , ( ) hours nap time during the day , no problems
reported . 
       Sees/Hears: well - as reported by parent , eyes straight always . 
       Early childhood intervention programs: no . 
       Vaccine reactions: none . 
       Emergency room visits: none . 
       Home r

In [21]:
def deidentify_text(text):
    removed = {}

    # Patterns for PHI
    patterns = {
        "patient_name": r"(Patient:|^)[ \t]*([A-Z]+, [A-Za-z]+)",
        "dob": r"DOB: ?(\d{2}/\d{2}/\d{4})",
        "account_number": r"Acc(?:ount)? No\.?[: ]*(\d+)",
        "provider_name": r"Provider: ([A-Za-z .,'-]+, MD)",
        "phone": r"Phone: ?([\d-]{10,})",
        "address": r"Address: ([\w\d ,.-]+TX-\d{5})",
        "fax": r"Fax: ?([\d-]{10,})",
        "signed_by": r"Electronically signed by ([A-Za-z .,'-]+) on",
        "signature_date": r"on (\d{2}/\d{2}/\d{4}) at",
        "dos": r"DOS: ?(\d{2}/\d{2}/\d{4})",
    }

    # Remove and collect PHI
    for key, pattern in patterns.items():
        matches = re.findall(pattern, text, re.MULTILINE)
        if matches:
            # If the match is a tuple, get the last group
            if isinstance(matches[0], tuple):
                matches = [m[-1] for m in matches]
            removed[key] = matches
            text = re.sub(pattern, lambda m: m.group(0).replace(m.group(1), ""), text)

    # Remove any remaining names (doctor or patient) in the format: LAST, First
    name_pattern = r"\b([A-Z]+, [A-Za-z]+)\b"
    names = re.findall(name_pattern, text)
    if names:
        removed.setdefault("names", []).extend(names)
        text = re.sub(name_pattern, "", text)

    # Remove any remaining dates in MM/DD/YYYY format
    date_pattern = r"\b(\d{2}/\d{2}/\d{4})\b"
    dates = re.findall(date_pattern, text)
    if dates:
        removed.setdefault("dates", []).extend(dates)
        text = re.sub(date_pattern, "", text)

    # Remove any remaining phone numbers
    phone_pattern = r"\b\d{3}[-.]\d{3}[-.]\d{4}\b"
    phones = re.findall(phone_pattern, text)
    if phones:
        removed.setdefault("phones", []).extend(phones)
        text = re.sub(phone_pattern, "", text)


    return text, removed


In [22]:
deidentified_text, removed_dict = deidentify_text(text_content)
print("De-identified text:\n", deidentified_text)
print("\nRemoved PHI:\n", removed_dict)

De-identified text:
  DOB:  (17 yo M) Acc No.  DOS: 
 
 
Account Number: 28536 Provider: 
DOB:    Age: 17 Y   Sex: Male Date: 
Phone: 
Address: 
Subjective:
Chief Complaints:
   1. Well & Sick: HURT SHOULDER left side, 3 weeks ago while boxing. 2. Well Child Examination - EPSDT - 15 to
17 years (Male):NM. 3. Immunization follow-up.
HPI:
   Interval History: 
       Lives with: parents . 
       Family support: yes, partner involved with care . 
       Primary care giver: mother . 
       Interim Illness: none . 
       Accidents: none . 
       Sleep: sleeps through the night , ( ) hours per night , ( ) hours nap time during the day , no problems
reported . 
       Sees/Hears: well - as reported by parent , eyes straight always . 
       Early childhood intervention programs: no . 
       Vaccine reactions: none . 
       Emergency room visits: none . 
       Home remedies: none . 
       Review previous/interim laboratory studies: all laboratory results within normal limits , normal l