In [5]:
import re
import pandas as pd
import numpy as np
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import SimpleSequentialChain

from pydantic import BaseModel, Field
from typing import List, Annotated, Dict, Any


from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [6]:
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

In [7]:
GOOGLE_API_KEY

'AIzaSyA4jFk1n_6Y36RqytWAMCdJFRSjMCQu-Qo'

In [8]:
llm = ChatGoogleGenerativeAI(
    model = "gemini-1.5-flash",
    google_api_key= GOOGLE_API_KEY
)

In [9]:
def load_pdf(file_path):
  loader = PyPDFLoader(file_path)
  documents = loader.load()

  text_content = "\n".join([doc.page_content for doc in documents])

  return text_content

def deidentify_and_strip(text: str) -> str:
    cleaned_lines = []
    for line in text.splitlines():
        line_stripped = line.strip()

        # Skip empty lines
        if not line_stripped:
            continue

        # Rules to DROP whole lines if they contain PHI
        if re.search(r"(Patient|Clinician|Participants|Supervisor?):", line_stripped, re.IGNORECASE):
            continue
        if re.search(r"DOB\s*[:\-]?\s*\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}", line_stripped):
            continue
        if re.search(r"\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}\b", line_stripped):  # Dates
            continue
        if re.search(r"\b\d{1,2}:\d{2}\s?(?:AM|PM|am|pm|ᴘᴍ|ᴀᴍ)\b", line_stripped):  # Times
            continue
        if re.search(r"Date and Time:", line_stripped, re.IGNORECASE):
            continue
        if re.search(r"(Location|Clinic|Hospital|Center|LLC|LLP|PC)\b", line_stripped):
            continue
        if re.search(r"License\s*[:\-]?\s*[A-Z]*\s*\d+", line_stripped, re.IGNORECASE):
            continue
        if re.search(r"http\S+|www\.\S+", line_stripped):
            continue
        if re.search(r"Page\s+\d+\s+of\s+\d+", line_stripped, re.IGNORECASE):
            continue  # remove "Page 1 of 2" style lines

        # Keep everything else
        cleaned_lines.append(line_stripped)

    return "\n".join(cleaned_lines)


In [10]:
soap_text_1 = load_pdf("soap_note_1.pdf")
deidentify_note_1 = deidentify_and_strip(soap_text_1)

deidentify_note_1

"Diagnosis\nF43.21Adjustment Disorder, With depressed mood\nCurrent Mental Status\nOrientation: X3: Oriented to Person, Place, and Time\nGeneral Appearance: Appropriate\nDress: Appropriate\nMotor Activity: Unremarkable\nInterview Behavior: Appropriate\nSpeech: Normal\nMood: Euthymic\nAffect: Congruent\nInsight: Excellent\nJudgment/Impulse Control: Excellent\nMemory: Intact\nAttention/Concentration: Good\nThought Process: Unremarkable\nThought Content: Appropriate\nPerception: Unremarkable\nFunctional Status: Intact\nRisk Assessment\nPatient denies all areas of risk. No contrary clinical indications present.\nSubjective Report and Symptom Description\nClient noted recent anxiety and tearfulness.\nObjective Content\ndummylink\nProgress Note\nDuration: 60 minutes\nService Code: 90837\nThe sessiont took place face to face in the Minot office. The client's spouse was present upon request.\nInterventions Used\nThe following interventions were used: Cognitive Reframing, Exploration of Emotion

In [11]:
cpt_prediction_prompt = """You are a medical coding assistant.
Assign the correct CPT code(s) from the allowed list below, based on the clinical note.
Do not guess codes that are not in the list.

Allowed CPTs:
- 90791: Psychiatric diagnostic evaluation
- 90832: Psychotherapy, 30 minutes with patient
- 90837: Psychotherapy, 60 minutes with patient
- H0004: Behavioral health counseling and therapy, per 15 minutes
- 96130: Psychological testing evaluation services, first hour
- 96131: Psychological testing evaluation services, each additional hour

Examples:
Note: "Patient presented for initial psychiatric diagnostic interview..." → CPT: 90791
Note: "Session lasted 60 minutes, focused on psychotherapy..." → CPT: 90837
Note: "Behavioral therapy session lasted 15 minutes..." → CPT: H0004

Now classify the following clinical note and return the result in the specified JSON format:
{soap_note}

Return the result in this JSON format:
{{
  "CPT": [ "code1", "code2" ]
}}
"""

In [12]:
class CPT_Output(BaseModel):
    CPT: List[Annotated[str, Field(min_length=5, max_length=5, description="CPT code descrbing the chart note")]]

structured_llm = llm.with_structured_output(CPT_Output)

In [13]:
prompt = PromptTemplate(
    input_variables=["soap_note"],
    template=cpt_prediction_prompt
)

In [14]:
deidentify_note_2 = deidentify_and_strip(load_pdf("soap_note_2.pdf"))
deidentify_note_2

'Presenting Problem\nTaylor presented to the session at the scheduled time in the therapy office. She reported struggling with anxiety in many aspects of\nfunctioning including relationships with daughter, mother, and husband, health, and safety of loved ones. She noted that she\nexperiences "OCD tendencies" like obsessing over made up scenarios that she convinces herself are real, including her mom getting\nmurdered. Also included in experiences with "OCD tendencies" is anxiety about healthy ingredients in products - Taylor noted "using\nthe Yuka app to scan every product to see what was healthy" and it taking her hours in the store. Previous therapy experience was\nreported with focus on anxiety. Struggles include setting boundaries and using health coping tools.\nCurrent Mental Status\nOrientation: X3: Oriented to Person, Place, and TimeGeneral Appearance: AppropriateDress: AppropriateMotor Activity: UnremarkableInterview Behavior: AppropriateSpeech: NormalMood: EuthymicAffect: Cong

In [15]:
soap_note_prompt = prompt.format(soap_note=deidentify_note_2)

response = structured_llm.invoke(soap_note_prompt)

print(response)

CPT=['90791', '90837']


In [16]:
def predict_cpt_code(soap_note: str):
    prompt = PromptTemplate(
        input_variables=["soap_note"],
        template=cpt_prediction_prompt
    )

    soap_note_prompt = prompt.format(soap_note=soap_note)

    response = structured_llm.invoke(soap_note_prompt)

    return response.CPT


In [20]:
cpt_icd_mapping_df = pd.read_excel("Expanded_CPT_to_ICD_mapping.xlsx")
cpt_icd_mapping_df

Unnamed: 0,CPT,CPT Description,ICD-10 Code,ICD-10 Description
0,90832,"Psychotherapy, 30 minutes",F32.0,"Major depressive disorder, single episode, mild"
1,90832,"Psychotherapy, 30 minutes",F32.1,"Major depressive disorder, single episode, mod..."
2,90832,"Psychotherapy, 30 minutes",F32.2,"Major depressive disorder, single episode, sev..."
3,90832,"Psychotherapy, 30 minutes",F32.3,"Major depressive disorder, single episode, sev..."
4,90832,"Psychotherapy, 30 minutes",F32.4,"Major depressive disorder, single episode, in ..."
...,...,...,...,...
237,96131,"Psychological testing, evaluation (additional ...",F81.9,"Developmental disorder of scholastic skills, u..."
238,96131,"Psychological testing, evaluation (additional ...",F84.0,Autistic disorder
239,96131,"Psychological testing, evaluation (additional ...",F84.9,"Pervasive developmental disorder, unspecified"
240,96131,"Psychological testing, evaluation (additional ...",F02.0,Dementia in Alzheimer's disease with early onset


In [21]:
def get_cpt_mapping(df):
  cpt_mapping = {}

  for _, row in cpt_icd_mapping_df.iterrows():
    cpt = str(row["CPT"]).strip()
    cpt_desc = str(row["CPT Description"]).strip()
    icd = str(row["ICD-10 Code"]).strip()
    icd_desc = str(row["ICD-10 Description"]).strip()

    if cpt not in cpt_mapping:
      cpt_mapping[cpt] = {
          "description": cpt_desc,
          "applicable_icds": []
      }

    cpt_mapping[cpt]["applicable_icds"].append({
        icd: icd_desc,
    })

  return cpt_mapping

cpt_mapping = get_cpt_mapping(cpt_icd_mapping_df)


In [22]:
def get_icd_candidates(predicted_cpt_codes: list[str]) -> list[dict]:
    """Return ICD candidates as list of dicts with consistent keys."""
    icd_candidates = []
    for cpt in predicted_cpt_codes:
        if cpt in cpt_mapping:
            for icd_entry in cpt_mapping[cpt]["applicable_icds"]:
                for icd, desc in icd_entry.items():
                    icd_candidates.append({
                        "icd": icd,
                        "description": desc
                    })
    return icd_candidates



icd_candidadtes = get_icd_candidates(["90837", "90832"])

In [23]:
icd_candidadtes

[{'icd': 'F32.0',
  'description': 'Major depressive disorder, single episode, mild'},
 {'icd': 'F32.1',
  'description': 'Major depressive disorder, single episode, moderate'},
 {'icd': 'F32.2',
  'description': 'Major depressive disorder, single episode, severe without psychotic features'},
 {'icd': 'F32.3',
  'description': 'Major depressive disorder, single episode, severe with psychotic features'},
 {'icd': 'F32.4',
  'description': 'Major depressive disorder, single episode, in partial remission'},
 {'icd': 'F32.5',
  'description': 'Major depressive disorder, single episode, in full remission'},
 {'icd': 'F32.9',
  'description': 'Major depressive disorder, single episode, unspecified'},
 {'icd': 'F33.0', 'description': 'Major depressive disorder, recurrent, mild'},
 {'icd': 'F33.1',
  'description': 'Major depressive disorder, recurrent, moderate'},
 {'icd': 'F33.2',
  'description': 'Major depressive disorder, recurrent severe without psychotic features'},
 {'icd': 'F33.3',
  

In [42]:
embedder = SentenceTransformer("abhinand/MedEmbed-large-v0.1")

def embed_texts(texts: List[str]) -> np.ndarray:
    """Returns L2-normalized embeddings (np.ndarray) for stable cosine sim."""
    embs = embedder.encode(
        texts,
        convert_to_numpy=True,
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=False
    )
    return embs


In [26]:
def rerank_icd_candidates(note_text: str,
                          icd_candidates: List[Dict[str, str]],
                          top_k: int = 15) -> List[Dict[str, Any]]:
    """
    Re-rank ICD candidates based on semantic similarity to note_text.
    Deduplicates ICD codes by keeping the highest score.
    """
    if not icd_candidates:
        return []

    # Normalize ICD candidates
    normalized_icds = []
    for c in icd_candidates:
        if "icd" in c and "description" in c:
            normalized_icds.append({"icd": c["icd"], "description": c["description"]})
        else:
            icd, desc = list(c.items())[0]
            normalized_icds.append({"icd": icd, "description": desc})

    # Embed note and candidate ICDs
    note_emb = embed_texts([note_text])[0]
    icd_texts = [f'{c["icd"]}: {c["description"]}' for c in normalized_icds]
    icd_embs = embed_texts(icd_texts)

    # Cosine similarity
    sims = note_emb @ icd_embs.T
    idxs = np.argsort(-sims)[: min(top_k, len(normalized_icds))]

    # Collect ranked results
    ranked = [
        {
            "icd": normalized_icds[i]["icd"],
            "description": normalized_icds[i]["description"],
            "score": float(sims[i]),
        }
        for i in idxs
    ]

    # Deduplicate (keep highest score per ICD)
    unique_ranked = {}
    for r in ranked:
        icd = r["icd"]
        if icd not in unique_ranked or r["score"] > unique_ranked[icd]["score"]:
            unique_ranked[icd] = r

    # Return sorted by score again
    return sorted(unique_ranked.values(), key=lambda x: -x["score"])


In [27]:
ranked = rerank_icd_candidates(deidentify_note_1, icd_candidadtes)

ranked

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.77s/it]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.16s/it]


[{'icd': 'F43.23',
  'description': 'Adjustment disorder with mixed anxiety and depressed mood',
  'score': 0.8104954361915588},
 {'icd': 'F43.21',
  'description': 'Adjustment disorder with depressed mood',
  'score': 0.8060019016265869},
 {'icd': 'F43.22',
  'description': 'Adjustment disorder with anxiety',
  'score': 0.7990717887878418},
 {'icd': 'F43.20',
  'description': 'Adjustment disorder, unspecified',
  'score': 0.7915458679199219},
 {'icd': 'F32.1',
  'description': 'Major depressive disorder, single episode, moderate',
  'score': 0.7650936841964722},
 {'icd': 'F32.4',
  'description': 'Major depressive disorder, single episode, in partial remission',
  'score': 0.7634437084197998},
 {'icd': 'F32.9',
  'description': 'Major depressive disorder, single episode, unspecified',
  'score': 0.7605228424072266},
 {'icd': 'F32.5',
  'description': 'Major depressive disorder, single episode, in full remission',
  'score': 0.756719708442688}]

In [28]:
ranked_2 = rerank_icd_candidates(deidentify_note_2, get_icd_candidates(["90791"]))

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.52s/it]
Batches: 100%|██████████| 2/2 [00:03<00:00,  1.67s/it]


In [29]:
ranked_2

[{'icd': 'F41.9',
  'description': 'Anxiety disorder, unspecified',
  'score': 0.7645387053489685},
 {'icd': 'F41.1',
  'description': 'Generalized anxiety disorder',
  'score': 0.7428685426712036},
 {'icd': 'F43.22',
  'description': 'Adjustment disorder with anxiety',
  'score': 0.7383846640586853},
 {'icd': 'F43.20',
  'description': 'Adjustment disorder, unspecified',
  'score': 0.7294806241989136},
 {'icd': 'F43.10',
  'description': 'Post-traumatic stress disorder, unspecified',
  'score': 0.7287815809249878},
 {'icd': 'F43.23',
  'description': 'Adjustment disorder with mixed anxiety and depressed mood',
  'score': 0.7260287404060364},
 {'icd': 'F50.9',
  'description': 'Eating disorder, unspecified',
  'score': 0.7231976985931396},
 {'icd': 'F32.9',
  'description': 'Major depressive disorder, single episode, unspecified',
  'score': 0.7231354117393494},
 {'icd': 'F32.1',
  'description': 'Major depressive disorder, single episode, moderate',
  'score': 0.7104206085205078},
 {'

In [30]:
class ICD_Output(BaseModel):
    ICD10: List[str] = Field(
        default_factory=list,
        description="Final ICD-10 codes selected from the allowed list; return 1-4 most relevant."
    )


In [31]:
# Wrap your existing llm for structured output
icd_structured_llm = llm.with_structured_output(ICD_Output)

icd_selection_prompt = PromptTemplate(
    input_variables=["note", "cpts", "allowed_icds"],
    template=(
        "You are a medical coding assistant. Choose the most appropriate ICD-10 codes "
        "ONLY from the allowed list below, based on the clinical note and CPT context. "
        "Prefer diagnoses that are explicitly supported by the note. Do not guess.\n\n"
        "Clinical note:\n{note}\n\n"
        "CPT context (predicted): {cpts}\n\n"
        "Allowed ICD-10 candidates (code — description):\n{allowed_icds}\n\n"
        "Return JSON matching this schema:\n"
        "{{\"ICD10\": [\"code1\", \"code2\"]}}\n"
        "Constraints:\n"
        "- Pick the fewest codes that fully represent the encounter (typically 1-4).\n"
        "- Do not include screening codes unless the note is purely screening.\n"
        "- Do not include historical/resolved problems unless clearly treated/assessed today.\n"
    )
)


In [32]:
def select_icds_for_note(note_text: str,
                         predicted_cpts: List[str],
                         icd_candidates: List[Dict[str, str]],
                         top_k: int = 15) -> Dict[str, Any]:
    """
    Reranks ICD candidates with embeddings, then asks the LLM to pick final ICDs.
    Returns dict with ranked list and final selection.
    """
    # Step 1: re-rank by semantic similarity
    ranked = rerank_icd_candidates(note_text, icd_candidates, top_k=top_k)

    if not ranked:
        return {"ranked": [], "final": []}

    # Prepare allowed list text for the prompt (top-K only)
    allowed_lines = [f"- {r['icd']} — {r['description']} (score: {r['score']:.3f})" for r in ranked]
    allowed_icds_block = "\n".join(allowed_lines)

    # Step 2: LLM final selection (structured)
    prompt_str = icd_selection_prompt.format(
        note=note_text,
        cpts=", ".join(predicted_cpts),
        allowed_icds=allowed_icds_block
    )
    final = icd_structured_llm.invoke(prompt_str)

    return {
        "ranked": ranked,            # list of dicts with score
        "final": final.ICD10         # list of codes selected by the LLM
    }


In [125]:
final_selection = select_icds_for_note(deidentify_note_1, ["90837"], icd_candidadtes)

In [126]:
final_selection["final"]

['F43.21', 'F43.23']

In [34]:
def generate_coding_from_note(file_path: str,
                  cpt_icd_mapping_df: pd.DataFrame,
                  top_k: int = 15) -> Dict[str, Any]:
    """
    Full CPT -> ICD pipeline.
    Input: PDF clinical note path
    Output: dict with CPTs, ranked ICD candidates, and final ICDs
    """
    # Step 1: Load and clean note
    text_content = load_pdf(file_path)
    deidentified_text = deidentify_and_strip(text_content)

    # Step 2: Predict CPT(s)
    predicted_cpts = predict_cpt_code(deidentified_text)

    # Step 3: Build CPT→ICD mapping (from Excel DataFrame)
    cpt_mapping = get_cpt_mapping(cpt_icd_mapping_df)

    # Step 4: Collect ICD candidates
    icd_candidates = get_icd_candidates(predicted_cpts)

    # Normalize ICD candidates for reranker
    # normalized_icds = normalize_icd_candidates(icd_candidates)

    # Step 5: Re-rank candidates and finalize with LLM
    final_selection = select_icds_for_note(
        note_text=deidentified_text,
        predicted_cpts=predicted_cpts,
        icd_candidates=icd_candidadtes,
        top_k=top_k
    )

    return {
        "cpts": predicted_cpts,
        "ranked_icds": final_selection["ranked"],
        "final_icds": final_selection["final"]
    }


In [35]:
files = ["soap_note_1.pdf", "soap_note_2.pdf", "soap_note_3.pdf"]

for file in files:

  result = generate_coding_from_note(file, cpt_icd_mapping_df)

  print(f"\nPredicted CPTs for {file}:")
  for cpt in result["cpts"]:
      print(f" - {cpt}")

  print(f"\nFinal ICDs for {file}:")
  for icd in result["final_icds"]:
      print(f" - {icd}")


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
Batches:  33%|███▎      | 1/3 [00:03<00:06,  3.21s/it]


KeyboardInterrupt: 

In [36]:
import zipfile
import os
from tqdm import tqdm

def unzip_folder(zip_path, extract_to):
  os.makedirs(extract_to, exist_ok=True)
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
      zip_ref.extractall(extract_to)

unzip_folder("soap_notes_dir.zip", "/content")

In [None]:
predictions = []

for file in tqdm(os.listdir("/content/soap_notes_dir"), desc="Processing SOAP Notes"):
  if file.endswith(".pdf"):
    result = generate_coding_from_note(f"/content/soap_notes_dir/{file}", cpt_icd_mapping_df)
    predictions.append({
        "file": file,
        "cpts": result["cpts"],
        "final_icds": result["final_icds"]
    })


In [None]:
predictions_df = pd.DataFrame(predictions)
predictions_df

In [37]:
# required headings
required_sections = [
    "Interventions Used",
    "Risk Assessment",
    "Current Mental Status"
]

def check_note(note_text: str, filename: str):
    missing = [section for section in required_sections if section not in note_text]

    if missing:
        # Flag as red + record details
        return {
            "filename": filename,
            "status": "RED",
            "missing_sections": missing
        }
    else:
        return {
            "filename": filename,
            "status": "OK",
            "missing_sections": []
        }

In [38]:
flagged_notes = []


for file in tqdm(os.listdir("flag_red_notes_dir")):
    if file.endswith(".pdf"):
        with open(f"flag_red_notes_dir/{file}", "r") as f:
            text_content = load_pdf(f"flag_red_notes_dir/{file}")
            # cleaned_text = deidentify_and_strip(text_content)
            
            result = check_note(text_content, file)
            if result["status"] == "RED":
                flagged_notes.append(result)

100%|██████████| 23/23 [00:02<00:00, 11.37it/s]


In [39]:
flagged_notes

[{'filename': 'TN_Note-for-EM-8-15-2025_created-8-27-2025_58397458.pdf',
  'status': 'RED',
  'missing_sections': ['Interventions Used']},
 {'filename': 'TN_Note-for-GR-8-26-2025_created-8-27-2025_58397747.pdf',
  'status': 'RED',
  'missing_sections': ['Risk Assessment']},
 {'filename': 'TN_Note-for-RZ-8-19-2025_created-8-27-2025_58397468.pdf',
  'status': 'RED',
  'missing_sections': ['Interventions Used', 'Risk Assessment']}]

In [40]:
flagged_files = [n["filename"] for n in flagged_notes]
flagged_files

['TN_Note-for-EM-8-15-2025_created-8-27-2025_58397458.pdf',
 'TN_Note-for-GR-8-26-2025_created-8-27-2025_58397747.pdf',
 'TN_Note-for-RZ-8-19-2025_created-8-27-2025_58397468.pdf']

In [41]:
predictions = []

folder_name = "flag_red_notes_dir"

for file in tqdm(os.listdir(folder_name), desc="Processing SOAP Notes"):
  if file.endswith(".pdf") and file not in flagged_files:
    result = generate_coding_from_note(f"{folder_name}/{file}", cpt_icd_mapping_df)
    predictions.append({
        "file": file,
        "cpts": result["cpts"],
        "final_icds": result["final_icds"]
    })


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it] ?it/s]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.19s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.31s/it]:59, 32.70s/it]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.32s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]:16, 23.65s/it]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.17s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it]:02, 18.10s/it]
Batches: 100%|██████████| 3/3 [00:07<00:00,  2.41s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.82s/it]:12, 16.45s/it]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.23s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]:00, 16.69s/it]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.13s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]:07, 18.07s/it]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.02s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]:03, 15.22s/it]
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.24s/it]
Batches:

In [43]:
predictions_df = pd.DataFrame(predictions)
predictions_df

Unnamed: 0,file,cpts,final_icds
0,TN_Note-for-AB-8-22-2025_created-8-27-2025_583...,[90837],"[F41.9, F43.10]"
1,TN_Note-for-AM-8-20-2025_created-8-27-2025_583...,[90837],[F43.12]
2,TN_Note-for-AN-8-21-2025_created-8-27-2025_583...,[90837],[F43.12]
3,TN_Note-for-AR-8-22-2025_created-8-27-2025_583...,[90832],"[F43.24, Z63.5]"
4,TN_Note-for-AU-8-21-2025_created-8-27-2025_583...,[90837],[F43.12]
5,TN_Note-for-CLD-8-26-2025_created-8-27-2025_58...,[90837],[F43.23]
6,TN_Note-for-CM-8-22-2025_created-8-27-2025_583...,[90837],[F90.2]
7,TN_Note-for-EB-8-22-2025_created-8-27-2025_583...,[90837],[F41.9]
8,TN_Note-for-HM-8-22-2025_created-8-27-2025_583...,[90837],[F32.1]
9,TN_Note-for-IH-8-21-2025_created-8-27-2025_583...,[90837],"[F41.1, F33.0, F90.2]"


In [46]:
text = load_pdf(r"flag_red_notes_dir\TN_Note-for-AM-8-20-2025_created-8-27-2025_58397489.pdf")
clean = deidentify_and_strip(text)

print(clean)

Diagnosis
F43.12Post-Traumatic Stress Disorder, Chronic
Alena experiences irritability, difficulty falling and staying asleep, difficulty concentrating, hypervigilance, high startle reflex, emotional
dysregulation, negative thoughts about herself, and somatic symptoms including headaches and stomach aches. She also becomes
very upset when she perceives people might be fighting, even if the fighting is play. Symptoms are a result of the domestic violence
exposure have cause significant impairment in her functioning at home and at school. Writer finds that Alena meets diagnostic criteria
for post-traumatic stress disorder.
Current Mental Status
Orientation: X3: Oriented to Person, Place, and Time
General Appearance: Appropriate
Dress: Appropriate
Motor Activity: Unremarkable
Interview Behavior: Appropriate
Speech: Normal
Mood: Euthymic
Affect: Congruent
Insight: Good
Judgment/Impulse Control: Good
Memory: Intact
Attention/Concentration: Distractible
Thought Process: Unremarkable
Thought 

In [45]:
cpt = predict_cpt_code(clean)
cpt

['90837']