In [3]:
# In a notebook cell you can prefix with ! but running in a terminal is preferred
!pip install pandas numpy spacy scispacy rapidfuzz icd10 openai transformers sentencepiece python-levenshtein
# If scispaCy fails, fall back to spaCy model:
!python -m spacy download en_core_web_sm
# (Optional) small scispaCy medical model if you want better extraction:
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz


Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Using cached spacy-3.8.7-cp39-cp39-win_amd64.whl.metadata (28 kB)
Collecting scispacy
  Using cached scispacy-0.6.2-py3-none-any.whl.metadata (20 kB)


ERROR: Could not find a version that satisfies the requirement icd10 (from versions: none)
ERROR: No matching distribution found for icd10
c:\Program Files\Python39\python.exe: No module named spacy


In [5]:
!python -c "import spacy; print('spaCy OK ✅', spacy.__version__)"


spaCy OK ✅ 3.7.5


In [6]:
!python -m spacy download en_core_web_sm


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------

In [8]:
!python -m pip install "numpy>=2.0.0,<2.3.0" --upgrade
!python -m pip install --upgrade opencv-python


Defaulting to user installation because normal site-packages is not writeable
Collecting numpy<2.3.0,>=2.0.0
  Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-2.0.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.2.5 requires numpy<2.0.0,>=1.19.0; python_version >= "3.9", but you have numpy 2.0.2 which is incompatible.


Defaulting to user installation because normal site-packages is not writeable


In [11]:
!python -m pip install scispacy icd10-cm rapidfuzz transformers sentencepiece


Defaulting to user installation because normal site-packages is not writeable
Collecting scispacy
  Using cached scispacy-0.6.2-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp39-cp39-win_amd64.whl.metadata (10 kB)
Collecting conllu (from scispacy)
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Collecting numpy<2.0 (from scispacy)
  Using cached numpy-1.26.4-cp39-cp39-win_amd64.whl.metadata (61 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.2.tar.gz (197 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml)

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.


In [12]:
# Cell 1
import os
import json
from typing import Dict, List, Tuple
import pandas as pd
import numpy as np
import re
from rapidfuzz import process, fuzz
import icd10  # helpful utilities for ICD-10 formatting
# spaCy import
import spacy

# Load spaCy (try clinical model first if installed)
try:
    nlp = spacy.load("en_core_sci_sm")   # scispaCy if available
except Exception:
    nlp = spacy.load("en_core_web_sm")


In [13]:
# Cell 2 - sample EHR / encounter dict
sample_encounter = {
    "patient_id": "P001",
    "name": "John Doe",
    "age": 67,
    "sex": "M",
    "encounter_date": "2025-10-28",
    "presenting_complaint": "Shortness of breath and productive cough x5 days",
    "vitals": {"bp": "140/85", "hr": 102, "rr": 22, "temp_c": 38.1, "spo2": 90},
    "past_medical_history": ["hypertension", "type 2 diabetes mellitus"],
    "medications": ["metformin 500 mg BID", "amlodipine 5 mg daily"],
    "exam": "Crackles at right lower lung; reduced air entry bilaterally",
    "investigations": {"CXR": "right lower zone consolidation", "WBC": "15.2 x10^9/L"},
    "doctor_notes": "Suspect community acquired pneumonia. Start IV antibiotics; give oxygen to maintain sats >92%."
}


In [14]:
# Cell 3— Deterministic clinical note generator (baseline SOAP)
def generate_template_note(data: Dict) -> str:
    subj = f"Subjective: {data.get('presenting_complaint','')}\n"
    obj_lines = []
    vitals = data.get("vitals", {})
    vitals_str = ", ".join([f"{k}: {v}" for k,v in vitals.items()])
    obj_lines.append(f"Vitals: {vitals_str}")
    if data.get("exam"):
        obj_lines.append(f"Exam: {data['exam']}")
    for k,v in data.get("investigations", {}).items():
        obj_lines.append(f"{k}: {v}")
    obj = "Objective:\n" + "\n".join(obj_lines) + "\n"
    assess = "Assessment: " + (data.get("doctor_notes","") or "Assessment not provided.") + "\n"
    plan = "Plan:\n- Documented interventions and investigations.\n"
    plan += "- Follow-up and review as indicated.\n"
    meta = f"Encounter Date: {data.get('encounter_date')} | Patient: {data.get('name')} (ID: {data.get('patient_id')})\n"
    return meta + "\n" + subj + obj + assess + plan

# quick test
print(generate_template_note(sample_encounter))


Encounter Date: 2025-10-28 | Patient: John Doe (ID: P001)

Subjective: Shortness of breath and productive cough x5 days
Objective:
Vitals: bp: 140/85, hr: 102, rr: 22, temp_c: 38.1, spo2: 90
Exam: Crackles at right lower lung; reduced air entry bilaterally
CXR: right lower zone consolidation
WBC: 15.2 x10^9/L
Assessment: Suspect community acquired pneumonia. Start IV antibiotics; give oxygen to maintain sats >92%.
Plan:
- Documented interventions and investigations.
- Follow-up and review as indicated.



In [15]:
# Cell 5
def extract_concepts(text: str) -> List[str]:
    doc = nlp(text)
    ents = [ent.text.lower().strip() for ent in doc.ents]
    # Add simple rule-based finds for common clinical keywords
    keywords = re.findall(r"\b(pneumonia|cough|shortness of breath|dyspnea|hypertension|diabetes|fever|crackles|consolidation)\b", text.lower())
    concepts = list(dict.fromkeys(ents + keywords))  # dedupe preserving order
    return concepts

# quick extraction
note = generate_template_note(sample_encounter)
concepts = extract_concepts(note + "\n" + sample_encounter["doctor_notes"])
print("Concepts:", concepts)


Concepts: ['2025-10-28', 'john doe', 'x5 days', '140/85', '102', '22', '38.1', 'spo2', '90', 'cxr', 'wbc', '15.2', '9/l\nassessment', 'iv', '92%', 'suspect', 'shortness of breath', 'cough', 'crackles', 'consolidation', 'pneumonia']


In [16]:
# Cell 6
# Tiny example ICD dictionary (extend or load full WHO ICD-10 CSV in prod)
icd_demo = {
    "pneumonia": "J18.9|Pneumonia, unspecified organism",
    "essential (primary) hypertension": "I10|Essential (primary) hypertension",
    "type 2 diabetes mellitus": "E11.9|Type 2 diabetes mellitus without complications",
    "shortness of breath": "R06.02|Shortness of breath",
    "acute bronchitis": "J20.9|Acute bronchitis, unspecified"
}

# prepare lookup lists
icd_descs = list(icd_demo.keys())

def map_concepts_to_icd(concepts: List[str], score_threshold: int=60) -> List[Tuple[str,float,str]]:
    """
    Returns list of (ICD_code, confidence (0-1), matched_description)
    """
    results = []
    for c in concepts:
        match = process.extractOne(c, icd_descs, scorer=fuzz.WRatio)
        if match:
            matched_text, score, _ = match
            if score >= score_threshold:
                code_and_desc = icd_demo[matched_text]
                code = code_and_desc.split("|")[0]
                results.append((code, score/100.0, matched_text))
    # dedupe keep highest score
    dedup = {}
    for code,score,desc in results:
        if code not in dedup or dedup[code][0] < score:
            dedup[code] = (score, desc)
    return [(code, dedup[code][0], dedup[code][1]) for code in dedup]

# quick test
icd_suggestions = map_concepts_to_icd(concepts)
print("ICD suggestions:", icd_suggestions)


ICD suggestions: [('R06.02', 1.0, 'shortness of breath'), ('J18.9', 1.0, 'pneumonia')]


In [17]:
# Cell 7— Full pipeline function (run for every encounter)
def process_encounter(enc: Dict, run_llm: bool=False) -> Dict:
    # 1) generate note (template or LLM if enabled)
    note = generate_template_note(enc)
    # if run_llm:
    #     try:
    #         note = generate_note_with_openai(enc)
    #     except Exception as e:
    #         print("LLM call failed, falling back to template:", e)
    # 2) concept extraction
    concepts = extract_concepts(note + "\n" + enc.get("doctor_notes",""))
    # 3) map to ICD
    icd_suggestions = map_concepts_to_icd(concepts)
    # 4) create evidence spans for each suggested code
    evidence = {}
    for code, conf, desc in icd_suggestions:
        # find snippet containing the description or concept
        pat = re.compile(re.escape(desc), re.IGNORECASE)
        m = pat.search(note + " " + enc.get("doctor_notes",""))
        span = (m.start(), m.end()) if m else None
        evidence[code] = {"confidence": conf, "matched_term": desc, "text_span": span}
    return {
        "patient_id": enc.get("patient_id"),
        "note": note,
        "concepts": concepts,
        "icd_suggestions": icd_suggestions,
        "evidence": evidence
    }

# run pipeline on sample
result = process_encounter(sample_encounter)
print(json.dumps(result, indent=2))


{
  "patient_id": "P001",
  "note": "Encounter Date: 2025-10-28 | Patient: John Doe (ID: P001)\n\nSubjective: Shortness of breath and productive cough x5 days\nObjective:\nVitals: bp: 140/85, hr: 102, rr: 22, temp_c: 38.1, spo2: 90\nExam: Crackles at right lower lung; reduced air entry bilaterally\nCXR: right lower zone consolidation\nWBC: 15.2 x10^9/L\nAssessment: Suspect community acquired pneumonia. Start IV antibiotics; give oxygen to maintain sats >92%.\nPlan:\n- Documented interventions and investigations.\n- Follow-up and review as indicated.\n",
  "concepts": [
    "2025-10-28",
    "john doe",
    "x5 days",
    "140/85",
    "102",
    "22",
    "38.1",
    "spo2",
    "90",
    "cxr",
    "wbc",
    "15.2",
    "9/l\nassessment",
    "iv",
    "92%",
    "suspect",
    "shortness of breath",
    "cough",
    "crackles",
    "consolidation",
    "pneumonia"
  ],
  "icd_suggestions": [
    [
      "R06.02",
      1.0,
      "shortness of breath"
    ],
    [
      "J18.9",
   

In [1]:
#-----------------------------------------------------
# Step 1: Import Libraries
#-----------------------------------------------------
import pytesseract
from PIL import Image
import cv2
import re

# If needed, set Tesseract path (update according to your install)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

#-----------------------------------------------------
# Step 2: Load and Preprocess Enhanced Image
#-----------------------------------------------------
img_path = r"D:\ehr\prescription\29.jpg"
img = cv2.imread(img_path)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
text = pytesseract.image_to_string(gray)

print("🩺 Extracted Text from Prescription:\n")
print(text)

#-----------------------------------------------------
# Step 3: Extract Structured Information
#-----------------------------------------------------
# Extract patient name
name = re.search(r"MR\.?\s*(\w+\s*\w*)", text, re.IGNORECASE)
name = name.group(1) if name else "Unknown"

# Extract age
age = re.search(r"(\d{1,2})/?M", text, re.IGNORECASE)
age = age.group(1) if age else "Unknown"

# Extract diagnosis (e.g., COVID)
diagnosis = re.search(r"CC[:\s]*([A-Za-z\s\+\-]+)", text, re.IGNORECASE)
diagnosis = diagnosis.group(1).strip() if diagnosis else "Unknown"

# Extract medicines
meds = re.findall(r"(Inj\.?\s*[A-Za-z]+\s*\d*\s*mg[^.\n]*)", text)
meds = meds if meds else ["No medications found"]

#-----------------------------------------------------
# Step 4: ICD-10 Mapping (Example Dictionary)
#-----------------------------------------------------
icd10_map = {
    "COVID": "U07.1",
    "Diabetes": "E11.9",
    "Hypertension": "I10",
    "Asthma": "J45.9"
}

icd_code = "Not Found"
for disease, code in icd10_map.items():
    if disease.lower() in diagnosis.lower():
        icd_code = code
        break

#-----------------------------------------------------
# Step 5: Generate Clinical Note
#-----------------------------------------------------
clinical_note = f"""
----------------- 🏥 Clinical Note -----------------
Patient Name   : {name}
Age / Gender   : {age} / M
Diagnosis      : {diagnosis}
ICD-10 Code    : {icd_code}

Prescribed Medications:
- {chr(10).join(meds)}

Doctor         : Dr. Ajay Sahdev
----------------------------------------------------
"""

print(clinical_note)


🩺 Extracted Text from Prescription:

DD fonm 1289
1 NOV 71
DOD PRESCRIPTION
FOR (Full name, address, & phone number) (If under 12, give age)
John R Doe, HM3, VSN

US.S. Neverforgotten (0D 178) _

MEDICAL FACILITY DATE
US.S. Weverforgotten (00 178) 23 Nan

B. Guperscription) gm or ml.
(nscription)
Th Blbadewra
Anphege goed
(Subscription)
Wt fl Probation”
(Signe)

Se Sm@ tid ac

LOT NO: 39K /06
fockR. Frost
LODE. WD, USWA
SIGNATURE RANK AND DEGREE

EDITION OF 1 JAN 60 MAY BE USED FOR
S/N 0102-LF.012-6201



----------------- 🏥 Clinical Note -----------------
Patient Name   : Unknown
Age / Gender   : Unknown / M
Diagnosis      : Unknown
ICD-10 Code    : Not Found

Prescribed Medications:
- No medications found

Doctor         : Dr. Ajay Sahdev
----------------------------------------------------

