In [1]:
import re
from typing import List

from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field

load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
DIR = "SOAP_notes"

In [3]:
def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    
    full_text = "\n".join([doc.page_content for doc in documents])
    return full_text


text_content = load_pdf(f"{DIR}/68750.pdf")
print(text_content)

PULLEN, Caedyn DOB: 07/05/2007 (17 yo M) Acc No. 28536 DOS: 06/30/2025
 
Patient: PULLEN, Caedyn
Account Number: 28536 Provider: Rahman Uddin, MD
DOB: 07/05/2007   Age: 17 Y   Sex: Male Date: 06/30/2025
Phone: 832-893-3970
Address: 3825 YOUPON DR, LA PORTE, TX-77571
Subjective:
Chief Complaints:
   1. Well & Sick: HURT SHOULDER left side, 3 weeks ago while boxing. 2. Well Child Examination - EPSDT - 15 to
17 years (Male):NM. 3. Immunization follow-up.
HPI:
   Interval History: 
       Lives with: parents . 
       Family support: yes, partner involved with care . 
       Primary care giver: mother . 
       Interim Illness: none . 
       Accidents: none . 
       Sleep: sleeps through the night , ( ) hours per night , ( ) hours nap time during the day , no problems
reported . 
       Sees/Hears: well - as reported by parent , eyes straight always . 
       Early childhood intervention programs: no . 
       Vaccine reactions: none . 
       Emergency room visits: none . 
       Home r

In [4]:
def deidentify_text(text):
    removed = {}

    # Patterns for PHI
    patterns = {
        "patient_name": r"(Patient:|^)[ \t]*([A-Z]+, [A-Za-z]+)",
        "dob": r"DOB: ?(\d{2}/\d{2}/\d{4})",
        "account_number": r"Acc(?:ount)? No\.?[: ]*(\d+)",
        "provider_name": r"Provider: ([A-Za-z .,'-]+, MD)",
        "phone": r"Phone: ?([\d-]{10,})",
        "address": r"Address: ([\w\d ,.-]+TX-\d{5})",
        "fax": r"Fax: ?([\d-]{10,})",
        "signed_by": r"Electronically signed by ([A-Za-z .,'-]+) on",
        "signature_date": r"on (\d{2}/\d{2}/\d{4}) at",
        "dos": r"DOS: ?(\d{2}/\d{2}/\d{4})",
    }

    # Remove and collect PHI
    for key, pattern in patterns.items():
        matches = re.findall(pattern, text, re.MULTILINE)
        if matches:
            # If the match is a tuple, get the last group
            if isinstance(matches[0], tuple):
                matches = [m[-1] for m in matches]
            removed[key] = matches
            text = re.sub(pattern, lambda m: m.group(0).replace(m.group(1), ""), text)

    # Remove any remaining names (doctor or patient) in the format: LAST, First
    name_pattern = r"\b([A-Z]+, [A-Za-z]+)\b"
    names = re.findall(name_pattern, text)
    if names:
        removed.setdefault("names", []).extend(names)
        text = re.sub(name_pattern, "", text)

    # Remove any remaining dates in MM/DD/YYYY format
    date_pattern = r"\b(\d{2}/\d{2}/\d{4})\b"
    dates = re.findall(date_pattern, text)
    if dates:
        removed.setdefault("dates", []).extend(dates)
        text = re.sub(date_pattern, "", text)

    # Remove any remaining phone numbers
    phone_pattern = r"\b\d{3}[-.]\d{3}[-.]\d{4}\b"
    phones = re.findall(phone_pattern, text)
    if phones:
        removed.setdefault("phones", []).extend(phones)
        text = re.sub(phone_pattern, "", text)


    return text, removed


In [5]:
deidentified_text, removed_dict = deidentify_text(text_content)
print("De-identified text:\n", deidentified_text)
print("\nRemoved PHI:\n", removed_dict)

De-identified text:
  DOB:  (17 yo M) Acc No.  DOS: 
 
 
Account Number: 28536 Provider: 
DOB:    Age: 17 Y   Sex: Male Date: 
Phone: 
Address: 
Subjective:
Chief Complaints:
   1. Well & Sick: HURT SHOULDER left side, 3 weeks ago while boxing. 2. Well Child Examination - EPSDT - 15 to
17 years (Male):NM. 3. Immunization follow-up.
HPI:
   Interval History: 
       Lives with: parents . 
       Family support: yes, partner involved with care . 
       Primary care giver: mother . 
       Interim Illness: none . 
       Accidents: none . 
       Sleep: sleeps through the night , ( ) hours per night , ( ) hours nap time during the day , no problems
reported . 
       Sees/Hears: well - as reported by parent , eyes straight always . 
       Early childhood intervention programs: no . 
       Vaccine reactions: none . 
       Emergency room visits: none . 
       Home remedies: none . 
       Review previous/interim laboratory studies: all laboratory results within normal limits , normal l

In [6]:
def extract_icd10_codes(text):
    """
    Extracts ICD10 codes and their descriptions from a text section.
    Returns a dict: {ICD10_code: description}
    """
    icd_pattern = r"([A-Z][0-9][0-9A-Z]\.[0-9A-Z]+|[A-Z][0-9][0-9A-Z]+)"  # e.g., Z00.129, Z68.52, Z23
    result = {}
    for line in text.splitlines():
        if line.strip().lower().startswith("plan"):
            break
        match = re.search(icd_pattern, line)
        if match:
            code = match.group(0)
            # Description is everything before the code
            desc = line.split(code)[0].strip(" .-:")
            result[code] = desc
    return result


icd_dict = extract_icd10_codes(deidentified_text)
print(icd_dict)

{'Z00.129': '1. Encounter for well child visit at 17 years of age', 'Z68.52': '2. BMI,pediatric 5% - <85%', 'Z71.82': '3. Exercise counseling', 'Z71.3': '4. Dietary counseling and surveillance', 'Z23': '5. Encounter for immunization', 'M25.512': '6. Left shoulder pain, unspecified chronicity'}


In [7]:
def extract_procedure_codes(note_text: str) -> dict:
    """
    Extracts CPT procedure codes and their descriptions
    from an encounter note section starting with 'Procedure Codes:'.
    Returns a dict {CPT: Description}.
    """
    results = {}

    # Find the "Procedure Codes:" section
    match = re.search(r"Procedure Codes:\s*(.+)", note_text, re.IGNORECASE | re.DOTALL)
    if not match:
        return results  # No procedure codes found

    # Get everything after "Procedure Codes:" until next header (like "Units:" or newline block)
    proc_section = match.group(1).split("Units:")[0].strip()

    # Split on commas
    codes = [c.strip() for c in proc_section.split(",") if c.strip()]

    for code_entry in codes:
        # Match CPT code at start (numbers or alphanumeric like G8431, J0696 etc.)
        m = re.match(r"([A-Z]?\d{4,5})\s+(.+)", code_entry)
        if m:
            code = m.group(1).strip()
            desc = m.group(2).strip()
            results[code] = desc

    return results



print(extract_procedure_codes(deidentified_text))


{'G8431': 'CLIN DEPRESSION SCREEN DOC', 'G9902': 'Pt scrn tbco and id as user', '96160': 'PT-', '90619': 'MENACWY-TT VACCINE IM', '90460': 'IMADM ANY ROUTE 1ST VAC/TOX'}


In [8]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")


In [58]:
candidate_prompt = PromptTemplate(
    input_variables= ["note"],
    template= """
    You are an expert medical coding assistant
    Your job is to carefully read the following clinical note and identify all the sevices that might require CPT coding.input_types=
    
    Clinical Note: 
    {note}
    
    Instructions:
    1. List each service in plain English(not CPT codes yet).
    2. Be concise but complete.
    3. Ignore demographic information, insurance details, or administrative text.
    4. Output format must be a list of strings.
    Example output:
    [
    "Preventive well child exam (age 12)",
    "Immunization administration with counseling",
    "Meningococcal vaccine given",
    "Depression screening (PHQ-9)",
    "Problem-oriented visit for shoulder pain"
    ]
    """
)

In [None]:
class candidate_output(BaseModel):
    candidate_sentences: List[str] = Field(..., description="List of candidate sentences")

In [63]:
structred_llm = llm.with_structured_output(candidate_output)

In [64]:
chain = candidate_prompt | structred_llm

In [None]:
services = chain.invoke({"note": deidentified_text})

In [85]:
candidate_sentences = services.candidate_sentences
candidate_sentences

['Preventive well child exam (age 17)',
 'Immunization administration (MenQuadFI)',
 'Counseling on immunization risks and benefits',
 'Depression screening (PHQ-9)',
 'Problem-focused visit for left shoulder pain',
 'Prescription of Naproxen for shoulder pain',
 'Exercise counseling',
 'Dietary counseling',
 'Referral to orthopedics']

In [26]:
import json
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS

with open("cpt_reference.json", "r") as f:
    cpt_data = json.load(f)
    
def flatten_cpt(data, parent_keys=[]):
    records = []
    for key, value in data.items():
        if isinstance(value, dict):
            records.extend(flatten_cpt(value, parent_keys + [key]))
        else:
            records.append({
                "cpt_code": key,
                "description": value, 
                "category": " > ".join(parent_keys)
            })
    return records


In [74]:
cpt_records = flatten_cpt(cpt_data)

docs = [
    Document(
        page_content=f"{rec['cpt_code']}: {rec['description']}", 
        metadata = {"code": rec["cpt_code"], "category":rec["category"]}
    )
    for rec in cpt_records
]


In [76]:
embedder = SentenceTransformer("abhinand/MedEmbed-large-v0.1")

def embed_func(texts):
    if isinstance(texts, str):
        texts = [texts]
    return embedder.encode(texts, convert_to_numpy=True).tolist()

In [81]:
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer

# Custom wrapper
class HFEmbeddings(Embeddings):
    def __init__(self, model_name="abhinand/MedEmbed-large-v0.1"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        return self.model.encode([text], convert_to_numpy=True)[0].tolist()

# Use it
hf_embeddings = HFEmbeddings()


In [83]:
vectorstore = FAISS.from_documents(docs, embedding=hf_embeddings)
vectorstore.save_local("cpt_faiss_index")

In [86]:
vectorstore = FAISS.load_local(
    "cpt_faiss_index",
    embeddings=hf_embeddings,
    allow_dangerous_deserialization=True
)

In [87]:
results_dict = {}

for candidate in candidate_sentences:
    hits = vectorstore.similarity_search(candidate, k=5)
    results_dict[candidate] = [
        {
            "cpt_code": hit.metadata.get("code"),
            "description": hit.page_content,
        }
        for hit in hits
    ]
    

print(json.dumps(results_dict, indent=2))

{
  "Preventive well child exam (age 17)": [
    {
      "cpt_code": "99384",
      "description": "99384: Preventive Care New Pt. Age 12-17"
    },
    {
      "cpt_code": "99394",
      "description": "99394: Preventive Care Est. Pt. Age 12-17"
    },
    {
      "cpt_code": "99382",
      "description": "99382: Preventive Care New Pt. Age 1-4"
    },
    {
      "cpt_code": "99383",
      "description": "99383: Preventive Care New Pt. Age 5-11"
    },
    {
      "cpt_code": "99393",
      "description": "99393: Preventive Care Est. Pt. Age 5-11"
    }
  ],
  "Immunization administration (MenQuadFI)": [
    {
      "cpt_code": "90472",
      "description": "90472: Immunization admin, each additional"
    },
    {
      "cpt_code": "90461",
      "description": "90461: Admin, each additional vaccine/toxoid, with counseling"
    },
    {
      "cpt_code": "90471",
      "description": "90471: Immunization admin, 1st vaccine"
    },
    {
      "cpt_code": "90460",
      "description":

In [89]:
llm_prompt = """
You are a certified medical coder.
You will be given a patient’s clinical note, a list of candidate services, and possible CPT codes (retrieved by similarity search).

Task:
- Select the **final set of CPT codes** that should be billed for this encounter.
- Only pick codes from the retrieved options.
- Remove duplicates.
- If a service does not require a CPT code, ignore it.
- Return the result as list of CPT codes
[
CPT codes
]

Patient Note:
---
{note}
---

Candidates with Retrieved CPTs:
{retrieved}
"""

retrieved_str = ""
for cand, hits in results_dict.items():
    retrieved_str += f"\nCandidate: {cand}\nOptions:\n"
    for h in hits:
        retrieved_str += f"- {h['cpt_code']}: {h['description']}\n"


final_prompt = llm_prompt.format(note=deidentified_text, retrieved=retrieved_str)
response = llm.invoke(final_prompt)

response.content


"[\n'99394',\n'90460',\n'G8431',\n'99202',\n'S9470'\n]"

In [90]:
print(response.content)

[
'99394',
'90460',
'G8431',
'99202',
'S9470'
]


In [9]:
deidentified_text

' DOB:  (17 yo M) Acc No.  DOS: \n \n \nAccount Number: 28536 Provider: \nDOB:    Age: 17 Y   Sex: Male Date: \nPhone: \nAddress: \nSubjective:\nChief Complaints:\n   1. Well & Sick: HURT SHOULDER left side, 3 weeks ago while boxing. 2. Well Child Examination - EPSDT - 15 to\n17 years (Male):NM. 3. Immunization follow-up.\nHPI:\n   Interval History: \n       Lives with: parents . \n       Family support: yes, partner involved with care . \n       Primary care giver: mother . \n       Interim Illness: none . \n       Accidents: none . \n       Sleep: sleeps through the night , ( ) hours per night , ( ) hours nap time during the day , no problems\nreported . \n       Sees/Hears: well - as reported by parent , eyes straight always . \n       Early childhood intervention programs: no . \n       Vaccine reactions: none . \n       Emergency room visits: none . \n       Home remedies: none . \n       Review previous/interim laboratory studies: all laboratory results within normal limits , nor

In [23]:
class cpt_output(BaseModel):
    cpt_list : List[str] = Field(..., description="List of CPT codes supported by encounter note")

In [29]:
cpt_structured_llm = llm.with_structured_output(cpt_output)

prompt = PromptTemplate(
    input_variables=["note"], 
    template= """
You are a certified medical coder. Given this encounter note:
- Identify preventive visit codes
- Identify office visit codes
- Identify problem-oriented visit codes
- Identify immunizations (vaccine + admin codes)
- Identify screenings/assessments
- Any other CPT which is directly supported by the documentation
- Only output CPT/HCPCS codes that are fully supported by documentation

Encounter Note:
{note}

ALL CPT codes:
{cpt_reference}
"""
)
chain = prompt | cpt_structured_llm

response = chain.invoke({"note":deidentified_text, "cpt_reference":cpt_data})

print(response)

cpt_list=['99394', '90619', '90460', '96160', 'G8431', 'G9902']
