In [1]:
import re
import os
import json
import pandas as pd
from typing import List

from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
DIR = "SOAP_notes"

In [3]:
def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    full_text = "\n".join([doc.page_content for doc in documents])
    return full_text

In [4]:
text_content = load_pdf(f"{DIR}/68750.pdf")

In [5]:
def deidentify_text(text):
    removed = {}

    patterns = {
        "patient_name": r"(Patient:|^)[ \t]*([A-Z]+, [A-Za-z]+)",
        "dob": r"DOB: ?(\d{2}/\d{2}/\d{4})",
        "account_number": r"Acc(?:ount)? No\.?[: ]*(\d+)",
        "provider_name": r"Provider: ([A-Za-z .,'-]+, MD)",
        "phone": r"Phone: ?([\d-]{10,})",
        "address": r"Address: ([\w\d ,.-]+TX-\d{5})",
        "fax": r"Fax: ?([\d-]{10,})",
        "signed_by": r"Electronically signed by ([A-Za-z .,'-]+) on",
        "signature_date": r"on (\d{2}/\d{2}/\d{4}) at",
        "dos": r"DOS: ?(\d{2}/\d{2}/\d{4})",
    }

    for key, pattern in patterns.items():
        matches = re.findall(pattern, text, re.MULTILINE)
        if matches:
            if isinstance(matches[0], tuple):
                matches = [m[-1] for m in matches]
            removed[key] = matches
            text = re.sub(pattern, lambda m: m.group(0).replace(m.group(1), ""), text)

    name_pattern = r"\b([A-Z]+, [A-Za-z]+)\b"
    names = re.findall(name_pattern, text)
    if names:
        removed.setdefault("names", []).extend(names)
        text = re.sub(name_pattern, "", text)

    date_pattern = r"\b(\d{2}/\d{2}/\d{4})\b"
    dates = re.findall(date_pattern, text)
    if dates:
        removed.setdefault("dates", []).extend(dates)
        text = re.sub(date_pattern, "", text)

    # Remove any remaining phone numbers
    phone_pattern = r"\b\d{3}[-.]\d{3}[-.]\d{4}\b"
    phones = re.findall(phone_pattern, text)
    if phones:
        removed.setdefault("phones", []).extend(phones)
        text = re.sub(phone_pattern, "", text)

    return text, removed


In [6]:
deidentified_text, removed_dict = deidentify_text(text_content)

In [7]:
print("De-identified text:\n", deidentified_text)
print("\nRemoved PHI:\n", removed_dict)

De-identified text:
  DOB:  (17 yo M) Acc No.  DOS: 
 
 
Account Number: 28536 Provider: 
DOB:    Age: 17 Y   Sex: Male Date: 
Phone: 
Address: 
Subjective:
Chief Complaints:
   1. Well & Sick: HURT SHOULDER left side, 3 weeks ago while boxing. 2. Well Child Examination - EPSDT - 15 to
17 years (Male):NM. 3. Immunization follow-up.
HPI:
   Interval History: 
       Lives with: parents . 
       Family support: yes, partner involved with care . 
       Primary care giver: mother . 
       Interim Illness: none . 
       Accidents: none . 
       Sleep: sleeps through the night , ( ) hours per night , ( ) hours nap time during the day , no problems
reported . 
       Sees/Hears: well - as reported by parent , eyes straight always . 
       Early childhood intervention programs: no . 
       Vaccine reactions: none . 
       Emergency room visits: none . 
       Home remedies: none . 
       Review previous/interim laboratory studies: all laboratory results within normal limits , normal l

In [8]:
def extract_icd10_codes(text):
    icd_pattern = r"([A-Z][0-9][0-9A-Z]\.[0-9A-Z]+|[A-Z][0-9][0-9A-Z]+)"  # e.g., Z00.129, Z68.52, Z23
    result = {}
    for line in text.splitlines():
        if line.strip().lower().startswith("plan"):
            break
        match = re.search(icd_pattern, line)
        if match:
            code = match.group(0)
            desc = line.split(code)[0].strip(" .-:")
            result[code] = desc
    return result

In [9]:
icd_dict = extract_icd10_codes(deidentified_text)
print(icd_dict)

{'Z00.129': '1. Encounter for well child visit at 17 years of age', 'Z68.52': '2. BMI,pediatric 5% - <85%', 'Z71.82': '3. Exercise counseling', 'Z71.3': '4. Dietary counseling and surveillance', 'Z23': '5. Encounter for immunization', 'M25.512': '6. Left shoulder pain, unspecified chronicity'}


In [10]:
def extract_procedure_codes(note_text: str) -> dict:
    results = {}
    
    match = re.search(r"Procedure Codes:\s*(.+)", note_text, re.IGNORECASE | re.DOTALL)
    if not match:
        return results  

    proc_section = match.group(1).split("Units:")[0].strip()

    codes = [c.strip() for c in proc_section.split(",") if c.strip()]

    for code_entry in codes:
        m = re.match(r"([A-Z]?\d{4,5})\s+(.+)", code_entry)
        if m:
            code = m.group(1).strip()
            desc = m.group(2).strip()
            results[code] = desc

    return results

In [11]:
print(extract_procedure_codes(deidentified_text))

{'G8431': 'CLIN DEPRESSION SCREEN DOC', 'G9902': 'Pt scrn tbco and id as user', '96160': 'PT-', '90619': 'MENACWY-TT VACCINE IM', '90460': 'IMADM ANY ROUTE 1ST VAC/TOX'}


In [12]:
llm = ChatOpenAI(model="gpt-4o", temperature=0)

In [13]:
with open("cpt_reference.json", "r") as f:
    cpt_data = json.load(f)

In [14]:
def flatten_cpt(data, parent_keys=[]):
    records = []
    for key, value in data.items():
        if isinstance(value, dict):
            records.extend(flatten_cpt(value, parent_keys + [key]))
        else:
            records.append(
                {
                    "cpt_code": key,
                    "description": value,
                    "category": " > ".join(parent_keys),
                }
            )
    return records

In [15]:
cpt_records = flatten_cpt(cpt_data)

valid_cpts = set()
for record in cpt_records:
    valid_cpts.add(record["cpt_code"])

In [71]:
em_prompt = PromptTemplate(
    input_variables=["note"],
    template="""
You are a certified professional medical coder specializing in Evaluation & Management (E/M) coding. 
Apply the official 2024 AMA/CMS guidelines for office/outpatient visits.

=====================
OUTPUT RULES:
- Output EXACTLY ONE 5-digit CPT E/M code inside double curly braces. Example: {{99213}}
- Do NOT output explanations or extra text.
- If documentation is incomplete or ambiguous, ALWAYS choose the conservative fallback {{99213}}.

=====================
PATIENT TYPE:
- If explicitly states "new patient" → choose from 99202–99205.
- Otherwise assume established patient (99211–99215).

=====================
OPTION 1: TIME-BASED CODING (if duration is clearly documented)
Established patients:
- 10–19 min → {{99212}}
- 20–29 min → {{99213}}
- 30–39 min → {{99214}}
- 40–54 min → {{99215}}

New patients:
- 15–29 min → {{99202}}
- 30–44 min → {{99203}}
- 45–59 min → {{99204}}
- 60–74 min → {{99205}}

=====================
OPTION 2: MDM-BASED CODING (if time not documented)
Use the “2 of 3” rule (Problems, Data, Risk). 
If any element is uncertain or not well supported, downgrade to the lower level.

Established patients:
- {{99211}}: Nurse-only or minimal service, no physician involvement.
- {{99212}}: Straightforward — 1 self-limited problem, no Rx, no workup.
- {{99213}}: Low MDM — 1 stable chronic illness OR 1 acute uncomplicated illness; minimal workup; med refill.
- {{99214}}: Moderate MDM — 2+ stable chronic illnesses, OR 1 chronic with exacerbation, OR acute illness with systemic symptoms; prescription drug management; multiple labs/imaging/referrals documented and clearly related.
- {{99215}}: High MDM — ONLY if documentation shows BOTH:
   (a) severe acute illness or unstable chronic illness with immediate threat to life/organ function
   AND
   (b) intensive monitoring, high-risk Rx/procedure, or urgent hospitalization decision.
   - If these criteria are not explicitly and clearly documented → DOWNGRADE to {{99214}} or {{99213}} based on the conditions mentioned.

New patients (99202–99205): follow same MDM mapping.

=====================
EDGE-CASE RULES:
- If provider ordered tests but results are missing/denied, do NOT escalate; code as if tests were not done.
- If multiple problems are mentioned but only one is evaluated/managed, code based on what was actually managed.
- If documentation is unclear or inconsistent, always downgrade to the next lower supported level.
- Referrals alone do NOT justify 99214. If the provider only referred without new Rx, management, or monitoring → code {{99213}}.
- Negative or “ruled-out” conditions do NOT count as problems managed.
- Do NOT assume complexity from vague phrases (“follow-up needed”) unless specific management is documented.

=====================
EXAMPLES (for shaping only):
- "Stable hypertension, med refill" → {{99213}}
- "2 new issues + new Rx + labs" → {{99214}}
- "Vitals only, no physician eval" → {{99211}}
- "Chest pain, unstable, ER referral" → {{99215}}
- "Test ordered but no results documented" → fallback {{99213}}
- "Referral only, no Rx, no systemic illness" → {{99213}}
- "Stable chronic illness, follow-up in 3 months" → {{99213}}

=====================
Encounter Note:
{note}

NOW OUTPUT:
ONLY the single E/M code in double curly braces. Example: {{99214}}
"""
)


In [72]:
class em_output(BaseModel):
    cpt_code : int = Field(..., description="The selected CPT code for the E/M encounter")

def em_coding_llm(note: str):
    chain = em_prompt | llm.with_structured_output(em_output)
    result = chain.invoke({"note" : note})

    return result.cpt_code

In [73]:
result_2 = []

for note in os.listdir(DIR):
    filename = note.split(".")[0]

    if note.lower().endswith(".pdf"):
        print(f"Processing {note}...")
        text, removed = deidentify_text(load_pdf(f"{DIR}/{note}"))
        response = em_coding_llm(text)
        result_2.append({"claim_no": filename, "em_cpt": response})



Processing 68241.pdf...
Processing 68398.pdf...
Processing 68493.pdf...
Processing 68595.pdf...
Processing 68708.pdf...
Processing 68750.pdf...
Processing 68799.pdf...
Processing 68903.pdf...
Processing 68947.pdf...
Processing 69131.pdf...
Processing 69133.pdf...
Processing 69253.pdf...
Processing 69262.pdf...
Processing 69265.pdf...
Processing 69401.pdf...
Processing 69486.pdf...
Processing 69661.pdf...
Processing 69776.pdf...


In [55]:
cpt_prompt = PromptTemplate(
    input_variables=["note", "cpt_reference"],
    template="""
You are a certified professional medical coder. Review the encounter note and assign only those CPT/HCPCS codes that are fully and explicitly supported by documentation.

=====================
CODING RULES:
- Only assign a code if the service is CLEARLY and EXPLICITLY documented.
- Reject codes that are only implied or could be inferred.
- If documentation is ambiguous, do NOT assign the code.
- Use the CPT/HCPCS reference list as the ONLY possible pool of codes.

=====================
SPECIAL SCREENING RULES:
- Depression codes (e.g., G8431) → only if depression screening was performed AND documented as positive with a documented follow-up plan.
- Tobacco use codes (e.g., G9903) → only if patient is explicitly documented as a tobacco user.
- Preventive medicine codes (e.g., 99381–99395) → only if the note clearly documents a preventive or well-child exam, not a problem-focused visit.

=====================
PROCEDURE/ADMIN RULES:
- Nutrition counseling codes (e.g., S9470, 97802–97804) → only if counseling/dietitian services are explicitly documented.
- Administrative/misc codes (e.g., 99051, S9981, S9982) → only if the note clearly documents the corresponding service.
- Do not include ear wax removal (69209, 69210) unless explicitly stated.

=====================
CONFLICT RULES:
- If multiple overlapping codes apply, choose the one most generally accepted across payers.
- When in doubt between a preventive vs. problem-oriented visit, code the problem-oriented visit.

=====================
Encounter Note:
{note}

CPT/HCPCS Reference List:
{cpt_reference}

OUTPUT:
- Return ONLY codes that are explicitly supported.
- Do not add explanations, only the codes.
"""
)


#### Changing the cpt code prompt

In [74]:
cpt_prompt = PromptTemplate(
    input_variables=["note", "cpt_reference"],
    template="""
You are a certified professional medical coder. Review the encounter note and assign only those CPT/HCPCS codes that are fully and explicitly supported by documentation.

=====================
GENERAL CODING RULES:
- Only assign a code if the service is CLEARLY and EXPLICITLY documented.
- Reject codes that are only implied or inferred.
- If documentation is ambiguous, do NOT assign the code.
- Use the CPT/HCPCS reference list as the ONLY possible pool of codes.

=====================
SPECIAL SCREENING RULES:
- Depression codes (e.g., G8431) → only if screening was performed AND documented as positive with a follow-up plan. If negative → do NOT code.
- Tobacco use codes (e.g., G9903) → if the Procedure/Assessment section documents tobacco user, INCLUDE it even if history/social states “non-smoker.” Procedure documentation takes precedence.
- Preventive medicine codes (99381–99395) → assign if a well-child exam or preventive visit is documented.
- If both a preventive exam AND a problem-oriented service are documented, code BOTH.

=====================
PROCEDURE/ADMIN RULES:
- Nutrition counseling (S9470, 97802–97804) → only if counseling/dietitian service explicitly documented.
- Administrative/misc (99051, S9981, S9982) → only if explicitly documented.
- Ear wax removal (69209, 69210) → only if explicitly documented.

=====================
CONFLICT & EDGE RULES:
- If multiple overlapping codes apply, choose the one most generally accepted across payers.
- If preventive vs problem-oriented visit overlap, code BOTH if both are supported.
- If provider only referred (e.g., ER/ED referral) with no Rx or management → do NOT escalate problem-oriented code.
- If tests ordered but results missing/denied → code as if not performed.
- Negative or “ruled-out” conditions do NOT count as problems managed.

=====================
Encounter Note:
{note}

CPT/HCPCS Reference List:
{cpt_reference}

OUTPUT:
- Return ONLY the codes, separated by commas.
- Do not add explanations.
"""
)


In [75]:
class CptOutput(BaseModel):
    cpt_list: List[str] = Field(
        ..., description="List of CPT codes supported by encounter note"
    )

def cpt_coding_llm(note: str, cpt_reference: List[str]):
    chain = cpt_prompt | llm.with_structured_output(CptOutput)
    result = chain.invoke({"note": note, "cpt_reference": cpt_reference})

    return result.cpt_list

In [76]:
def pipeline(filepath: str, valid_cpts=valid_cpts):
    text_content = load_pdf(filepath)
    deidentified_text, removed = deidentify_text(text_content)
    
    response = cpt_coding_llm(deidentified_text, cpt_data)
    filtered = [c for c in response if c in valid_cpts]

    em_code = em_coding_llm(text_content)
    filtered.append(em_code)
    
    return filtered

In [77]:
results = []

for note in os.listdir(DIR):
    filename = note.split(".")[0]
    
    if note.lower().endswith(".pdf"):
        result = pipeline(
            os.path.join(DIR, note),
        )
        results.append({"file": filename, "CPT list": result})


In [None]:
results

In [91]:
df = pd.DataFrame(results)

actual_cpts = pd.read_excel("Required Solution.xlsx", usecols=["Claim No", "CPT Code"])
actual_grouped = actual_cpts.groupby("Claim No")["CPT Code"].apply(list).reset_index()
actual_grouped.rename(columns={"CPT Code": "Actual CPT list"}, inplace=True)

df["file"] = df["file"].astype(int)
merged = pd.merge(df, actual_grouped, left_on="file", right_on="Claim No", how="inner")


def compare_cpts(row):
    predicted = set(map(str, row["CPT list"]))
    actual = set(map(str, row["Actual CPT list"]))
    missing = list(actual - predicted)
    same = list(actual & predicted)
    additional = list(predicted - actual)
    return pd.Series(
        [same, missing, additional], index=["Same", "Missing", "Additional"]
    )


merged[["Same", "Missing", "Additional"]] = merged.apply(compare_cpts, axis=1)


merged[["Claim No", "Same", "Missing", "Additional"]]

final_df = pd.DataFrame()
final_df = pd.concat([final_df, merged], axis=0, ignore_index=True)

In [92]:
final_df

Unnamed: 0,file,CPT list,Claim No,Actual CPT list,Same,Missing,Additional
0,68241,[99214],68241,[99214],[99214],[],[]
1,68398,"[99393, 90619, 90460, 90715, 90460, 90461, 99213]",68398,"[90460, 90460, 90461, 90619, 90715, 99393]","[90715, 90619, 99393, 90461, 90460]",[],[99213]
2,68493,"[99394, 96160, 99213]",68493,"[96160, G9903, 99394]","[96160, 99394]",[G9903],[99213]
3,68595,"[99051, 99214]",68595,"[99051, 99213]",[99051],[99213],[99214]
4,68708,"[87807, 99213]",68708,"[87807, 99213]","[99213, 87807]",[],[]
5,68750,"[99394, 90619, 90460, 96160, 99213]",68750,"[90460, 90619, 96160, 99213, 99394]","[96160, 99213, 90619, 99394, 90460]",[],[]
6,68799,"[90619, 90460, 90715, 90460, 90461, 90620, 904...",68799,"[90460, 90460, 90461, 90619, 90620, 90651, 907...","[90620, 90715, 90619, 90461, 90651, 99211, 90460]",[],[99394]
7,68903,"[99394, 96160, 99213]",68903,"[96160, G9903, 99213, 99384]","[96160, 99213]","[99384, G9903]",[99394]
8,68947,"[99381, 90460, 90461, 90700, 90713, 90648, 906...",68947,"[90460, 90460, 90461, 90648, 90671, 90680, 907...","[90680, 99213, 90671, 90461, 90713, 90744, 904...","[Cards, 99391]",[99381]
9,69131,"[81002, 99214]",69131,"[81002, 99214]","[81002, 99214]",[],[]
