In [2]:
!pip install pandas numpy spacy rapidfuzz python-Levenshtein --quiet


In [3]:

import importlib, sys
try:
    import spacy
    _ = spacy.load("en_core_web_sm")
except Exception:
    !python -m spacy download en_core_web_sm

# Imports
import os, re, json
import pandas as pd
import numpy as np
import spacy
from rapidfuzz import process, fuzz
from pathlib import Path
from datetime import datetime


In [4]:
# Input locations - edit these when the files live somewhere else
PATIENT_CSV = "patient.csv"        # patient metadata table
META_DF_CSV = "meta_df.csv"        # produced in Module 1 (image paths + Stroke_Type)
EHR_TEXT_COL = "EHR_Text"          # column of patient.csv holding free-text clinical notes, when present

# Patient metadata is mandatory: fail early with an actionable message
if Path(PATIENT_CSV).exists() is False:
    raise FileNotFoundError(f"{PATIENT_CSV} not found. Put it in the notebook folder or change PATIENT_CSV path.")

# Every column is read as a string, so empty cells come back as NaN
patients = pd.read_csv(PATIENT_CSV, dtype=str)
print("Loaded patients:", patients.shape)

# Image metadata is optional: fall back to an empty frame with the expected columns
if not Path(META_DF_CSV).exists():
    meta_df = pd.DataFrame(columns=["image_path","Stroke_Type"])
    print("meta_df not found - continuing without image linking (you can create meta_df.csv first).")
else:
    meta_df = pd.read_csv(META_DF_CSV, dtype=str)
    print("Loaded meta_df:", meta_df.shape)


Loaded patients: (300, 8)
Loaded meta_df: (297, 2)


In [5]:
# spaCy pipeline used for named-entity extraction
nlp = spacy.load("en_core_web_sm")

# Stroke-related symptom vocabulary; extend as needed.
# The order matters: extracted symptoms are reported in this order.
SYMPTOM_KEYWORDS = [
    "headache",
    "nausea",
    "vomiting",
    "vomit",
    "weakness",
    "hemiparesis",
    "hemiplegia",
    "dizziness",
    "dizzy",
    "numbness",
    "numb",
    "slurred speech",
    "speech difficulty",
    "aphasia",
    "confusion",
    "loss of consciousness",
    "seizure",
    "vision loss",
    "blurry vision",
    "droop",
    "facial droop",
    "imbalance",
    "ataxia",
]

# helper: normalize text
def clean_text(s):
    """Return `s` as a trimmed string with every whitespace run collapsed to one space.

    Missing values (anything `pd.isna` reports as missing) become the empty string;
    any other value is converted with `str()` first.
    """
    if pd.isna(s):
        return ""
    trimmed = str(s).strip()
    return re.sub(r'\s+', ' ', trimmed)


In [6]:
def extract_symptoms_and_entities(text):
    """
    Extract symptom keywords and spaCy named entities from free text.

    Args:
        text: raw note text; it is normalized with clean_text, so missing
              values are treated as an empty string.

    Returns:
        dict with two keys:
          - "symptoms": keywords from SYMPTOM_KEYWORDS found in the text, in the
            order they appear in SYMPTOM_KEYWORDS, without duplicates
          - "entities": list of (text, label) tuples for the entities produced by
            the spaCy pipeline `nlp`
    """
    text = clean_text(text)
    doc = nlp(text)
    ents = [(ent.text, ent.label_) for ent in doc.ents]
    lowered = text.lower()
    symptoms_found = []
    for kw in SYMPTOM_KEYWORDS:
        # Match whole words only, so that e.g. "numb" is not reported for
        # "a number of episodes". A few common inflections are tolerated so
        # "headaches", "seizures", "vomited" or "drooping" are still found.
        pattern = r"(?<![a-z0-9])" + re.escape(kw) + r"(?:s|es|ed|ing)?(?![a-z0-9])"
        if re.search(pattern, lowered):
            symptoms_found.append(kw)
    # dict.fromkeys drops duplicates while keeping first-seen order
    return {"symptoms": list(dict.fromkeys(symptoms_found)), "entities": ents}

# quick test
# print(extract_symptoms_and_entities("Patient with sudden left-side weakness and slurred speech, onset 2 hours ago."))


In [7]:
# Optional local ICD-10 master file.
# Minimal expected layout: columns ['code','description']
ICD10_CSV = "icd10_codes.csv"   # used for mapping when the file is present

if not Path(ICD10_CSV).exists():
    # No master file: use a small built-in table of stroke-related codes (extend as required)
    fallback = [
        {"code":"I63.9", "description":"Cerebral infarction, unspecified"},
        {"code":"I61.9", "description":"Intracerebral hemorrhage, unspecified"},
        {"code":"G45.9", "description":"Transient cerebral ischemic attack, unspecified"},
        {"code":"I64",   "description":"Stroke, not specified as haemorrhage or infarction"},
        {"code":"R41.0", "description":"Disorientation, unspecified"},
        {"code":"R47.01","description":"Aphasia"},
    ]
    icd_df = pd.DataFrame(fallback)
    # lower-cased copy of the descriptions, used as the fuzzy-matching choices
    icd_df["desc_lower"] = icd_df["description"].str.lower()
    print("Using fallback ICD-10 mapping:", icd_df.shape)
else:
    # Read everything as text and blank out missing cells
    icd_df = pd.read_csv(ICD10_CSV, dtype=str).fillna("")
    # lower-cased copy of the descriptions, used as the fuzzy-matching choices
    icd_df["desc_lower"] = icd_df["description"].str.lower()
    print("Loaded ICD-10 reference:", icd_df.shape)


Using fallback ICD-10 mapping: (6, 3)


In [8]:
def map_text_to_icd(text, top_k=3, score_threshold=65):
    """
    Map free text to ICD10 codes using:
      1) direct keyword rules (fast, fixed scores)
      2) fuzzy-match against icd_df descriptions (rapidfuzz)

    Args:
        text: free text to map; normalized with clean_text and lower-cased.
        top_k: maximum number of fuzzy matches requested from rapidfuzz
               (rule-based hits are not limited by it).
        score_threshold: minimum fuzzy score for a match to be kept.

    Returns:
        list of (code, description, score) tuples, one per code (the highest
        score wins), sorted by score in descending order. Empty text gives [].
    """
    text = clean_text(text).lower()
    if text == "":
        return []

    def _mentions(fragments):
        # plain substring test; the fragments are stems such as "infarct"
        return any(fragment in text for fragment in fragments)

    candidates = []

    # Rule-based quick checks. Both British and American spellings are covered
    # ("haemorrhag"/"hemorrhag", "ischaemi"/"ischemi", "haematoma"/"hematoma").
    if _mentions(["haemorrhag", "hemorrhag", "bleed", "haematoma", "hematoma"]):
        candidates.append(("I61.9", "Intracerebral hemorrhage, unspecified", 100))
    # NOTE(review): "transient ischemic attack" also contains "ischemi", so a TIA
    # mention yields I63.9 as well - confirm this is intended.
    if _mentions(["ischaemi", "ischemi", "infarct"]):
        candidates.append(("I63.9", "Cerebral infarction, unspecified", 100))
    # "tia" must be a whole word: as a substring it also occurs inside
    # "initial", "dementia", "potential", ... and produced false TIA suggestions.
    if re.search(r"\btias?\b", text) or _mentions(["transient ischemic", "transient ischaemic"]):
        candidates.append(("G45.9","Transient cerebral ischemic attack, unspecified", 95))
    if _mentions(["aphasia", "speech"]):
        candidates.append(("R47.01","Aphasia", 90))
    if _mentions(["confusion", "disorientation"]):
        candidates.append(("R41.0","Disorientation, unspecified", 80))

    # Fuzzy-match against icd_df descriptions.
    # For a list of choices, process.extract yields (choice, score, index) tuples;
    # the index is the position in `choices`, i.e. the row position in icd_df.
    choices = icd_df["desc_lower"].tolist()
    matches = process.extract(text, choices, scorer=fuzz.token_sort_ratio, limit=top_k)
    for match_text, score, idx in matches:
        if score >= score_threshold:
            row = icd_df.iloc[idx]
            candidates.append((row["code"], row["description"], int(score)))

    # de-duplicate by code, keeping the entry with the highest score
    uniq = {}
    for code, desc, score in candidates:
        if code not in uniq or score > uniq[code][1]:
            uniq[code] = (desc, score)
    out = [(code, uniq[code][0], uniq[code][1]) for code in uniq]
    # sort by score desc
    out = sorted(out, key=lambda x: x[2], reverse=True)
    return out

# quick test
# print(map_text_to_icd("Patient with sudden left hemiparesis, suspected ischemic infarct"))


In [9]:
def generate_clinical_note(patient_row, findings_text="", ehr_text=""):
    """
    Build a templated clinical note for one patient.

    Args:
        patient_row: a pandas Series from the patients dataframe (anything with a
            dict-style .get works); the fields Patient_ID, Age, Gender,
            Stroke_Type, Ward_ID and Date_of_Scan are read from it.
        findings_text: e.g., image findings or enhancement summary
        ehr_text: raw EHR / doctor notes text

    Returns:
        dict with keys
          - "note": the note as a single newline-joined string
          - "symptoms": symptom keywords found in ehr_text + findings_text
          - "entities": spaCy entities found in ehr_text + findings_text
    """
    def _as_text(value, default=""):
        # Rows loaded from CSV carry NaN for empty cells. NaN is truthy and
        # str(NaN) is "nan", so missing values are mapped to `default` explicitly.
        if value is None or pd.isna(value):
            return default
        return str(value)

    pid = _as_text(patient_row.get("Patient_ID"), "Unknown")
    age = _as_text(patient_row.get("Age"))
    gender = _as_text(patient_row.get("Gender"))
    stroke_type = _as_text(patient_row.get("Stroke_Type"))
    ward = _as_text(patient_row.get("Ward_ID"))
    date_of_scan = _as_text(patient_row.get("Date_of_Scan"))
    now = datetime.now().strftime("%Y-%m-%d %H:%M")

    findings_text = _as_text(findings_text)
    ehr_text = _as_text(ehr_text)

    # Extract symptoms & entities from ehr_text + findings
    combined = " ".join([ehr_text, findings_text])
    ext = extract_symptoms_and_entities(combined)
    symptoms = ext["symptoms"]
    entities = ext["entities"]

    # Basic templated note
    note_lines = [
        f"Patient ID: {pid}",
        f"Age/Gender: {age} / {gender}",
        f"Ward: {ward}",
        f"Date of Scan: {date_of_scan}",
        f"Assessment time: {now}",
        "",
        "Clinical presentation:",
    ]
    if symptoms:
        note_lines.append(" - Reported symptoms: " + ", ".join(symptoms))
    else:
        note_lines.append(" - No symptoms described in EHR notes.")
    note_lines.append("")
    note_lines.append("Imaging Impression:")
    if stroke_type:
        note_lines.append(f" - Stroke classification (metadata): {stroke_type}")
    if findings_text:
        note_lines.append(" - Image findings: " + findings_text)
    else:
        note_lines.append(" - No image findings text available.")
    note_lines.append("")
    # The ICD suggestions themselves are not part of the note text; the caller
    # stores them separately.
    note_lines.append("Provisional ICD suggestions: (see mapping below)")
    note = "\n".join(note_lines)
    return {"note": note, "symptoms": symptoms, "entities": entities}

# Optional: If you have Azure/OpenAI credentials, you can call a GenAI model to refine note:
# def call_azure_openai_refine(prompt, azure_endpoint, azure_key, deployment="gpt-4o-mini"):
#     # This is a placeholder. Use Azure/OpenAI SDK or REST API per docs.
#     pass


In [10]:
# Build one record per patient: linked image findings, templated note, ICD suggestions.
results = []
for idx, prow in patients.iterrows():
    pid = prow.get("Patient_ID", f"PID_{idx}")
    if pd.isna(pid):
        # an empty Patient_ID cell is read as NaN; fall back to the positional ID
        pid = f"PID_{idx}"

    # locate associated images (if meta_df exists)
    if not meta_df.empty:
        # Escape the ID so it is matched literally, and forbid adjacent digits so
        # that e.g. "Patient_1" does not also match "Patient_10".
        pid_pattern = rf"(?<!\d){re.escape(str(pid))}(?!\d)"
        imgs = meta_df[meta_df["image_path"].str.contains(pid_pattern, na=False, case=False, regex=True)]
    else:
        imgs = pd.DataFrame()

    # clean_text maps a missing Stroke_Type (NaN) to "" instead of the text "nan"
    stroke_type_text = clean_text(prow.get("Stroke_Type", ""))

    # create simple findings text from image labels (if available)
    findings_text = ""
    if not imgs.empty:
        # summarize stroke types from mapped images
        top_types = imgs["Stroke_Type"].value_counts().to_dict()
        findings_text = "Images mapped: " + ", ".join([f"{k}({v})" for k,v in top_types.items()])
    elif stroke_type_text:
        findings_text = f"Metadata stroke_type = {stroke_type_text}"

    # EHR text if exists; an empty cell (NaN) is treated as no text
    ehr_text = prow.get(EHR_TEXT_COL, "") if EHR_TEXT_COL in patients.columns else ""
    if pd.isna(ehr_text):
        ehr_text = ""

    generated = generate_clinical_note(prow, findings_text=findings_text, ehr_text=ehr_text)
    # Compose text to map ICDs - use combined fields
    map_text = " ".join([clean_text(ehr_text), clean_text(findings_text), stroke_type_text])

    icd_suggestions = map_text_to_icd(map_text, top_k=5, score_threshold=60)

    results.append({
        "Patient_ID": pid,
        "Age": prow.get("Age",""),
        "Gender": prow.get("Gender",""),
        "Stroke_Type": prow.get("Stroke_Type",""),
        "EHR_Text": ehr_text,
        "Image_Findings": findings_text,
        "Extracted_Symptoms": "|".join(generated["symptoms"]),
        # entities and suggestions are stored as JSON strings so they survive the CSV round trip
        "Extracted_Entities": json.dumps(generated["entities"]),
        "Clinical_Note": generated["note"],
        "ICD_Suggestions": json.dumps(icd_suggestions)
    })

notes_df = pd.DataFrame(results)
print("Generated clinical notes for:", notes_df.shape[0])
notes_df.head(3)


Generated clinical notes for: 300


Unnamed: 0,Patient_ID,Age,Gender,Stroke_Type,EHR_Text,Image_Findings,Extracted_Symptoms,Extracted_Entities,Clinical_Note,ICD_Suggestions
0,Patient_001,65,F,Normal,,Images mapped: Normal(1),,[],Patient ID: Patient_001\nAge/Gender: 65 / F\nW...,[]
1,Patient_002,41,F,Normal,,Images mapped: Normal(1),,[],Patient ID: Patient_002\nAge/Gender: 41 / F\nW...,[]
2,Patient_003,64,M,Haemorrhagic,,Images mapped: Haemorrhagic(1),,[],Patient ID: Patient_003\nAge/Gender: 64 / M\nW...,"[[""I61.9"", ""Intracerebral hemorrhage, unspecif..."


In [11]:
# Output files written to the notebook's working directory
OUT_NOTES = "clinical_notes.csv"
OUT_ICD = "icd_mapped.csv"

notes_df.to_csv(OUT_NOTES, index=False)
# Expand ICD suggestions into normalized table for review:
# one row per (patient, suggested code), or one blank row when nothing was suggested
rows = []
for _, r in notes_df.iterrows():
    pid = r["Patient_ID"]
    sug = r["ICD_Suggestions"]
    try:
        arr = json.loads(sug)
    except (TypeError, ValueError):
        # TypeError: the cell is not a string (e.g. NaN);
        # ValueError: malformed JSON (json.JSONDecodeError is a ValueError subclass).
        # Anything else, including KeyboardInterrupt, is allowed to propagate.
        arr = []
    if not arr:
        rows.append({"Patient_ID": pid, "icd_code": "", "icd_desc": "", "score": ""})
    else:
        for code, desc, score in arr:
            rows.append({"Patient_ID": pid, "icd_code": code, "icd_desc": desc, "score": score})

icd_table = pd.DataFrame(rows)
icd_table.to_csv(OUT_ICD, index=False)

print("Saved clinical notes ->", OUT_NOTES)
print("Saved ICD suggestions ->", OUT_ICD)


Saved clinical notes -> clinical_notes.csv
Saved ICD suggestions -> icd_mapped.csv


In [12]:
# Sanity check: both output files should now exist on disk
print("Notes saved:", Path(OUT_NOTES).exists())
print("ICD saved:", Path(OUT_ICD).exists())

# Show the first generated note and its ICD suggestions for a manual read-through
if notes_df.shape[0] > 0:
    print("\n=== SAMPLE NOTE (Patient 0) ===\n")
    print(notes_df["Clinical_Note"].iloc[0])
    print("\nICD suggestions:", notes_df["ICD_Suggestions"].iloc[0])


Notes saved: True
ICD saved: True

=== SAMPLE NOTE (Patient 0) ===

Patient ID: Patient_001
Age/Gender: 65 / F
Ward: 2
Date of Scan: 07-08-2025
Assessment time: 2025-11-14 23:30

Clinical presentation:
 - No symptoms described in EHR notes.

Imaging Impression:
 - Stroke classification (metadata): Normal
 - Image findings: Images mapped: Normal(1)

Provisional ICD suggestions: (see mapping below)

ICD suggestions: []
