In [1]:
# Create working directory (only runs once)
!mkdir -p clinical_note_gen
%cd clinical_note_gen

# Verify dataset location
!ls -R /content/Dataset.zip | head -n 20


/content/clinical_note_gen
/content/Dataset.zip


In [2]:
!pip install easyocr transformers torch tqdm pandas -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.9 MB[0m [31m15.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.9/2.9 MB[0m [31m49.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m963.8/963.8 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.6/300.6 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import easyocr

DATA_PATH = '/content/Dataset'
OUTPUT_PATH = './results_M3'
os.makedirs(OUTPUT_PATH, exist_ok=True)

print("✅ Current working directory:", os.getcwd())
print("✅ Data path:", DATA_PATH)


✅ Current working directory: /content/clinical_note_gen
✅ Data path: /content/Dataset


In [4]:
!pip install easyocr transformers torch pandas tqdm

import zipfile, os

zip_path = "/content/Dataset.zip"
extract_path = "/content/Dataset"
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(extract_path)

print("✅ Dataset extracted to:", extract_path)
!ls -R /content/Dataset | head -n 50


✅ Dataset extracted to: /content/Dataset
/content/Dataset:
Dataset

/content/Dataset/Dataset:
data
images report
merged.csv

/content/Dataset/Dataset/data:
100.jpg
101.jpg
102.jpg
103.jpg
104.jpg
105.jpg
106.jpg
107.jpg
108.jpg
109.jpg
10.jpg
110.jpg
111.jpg
112.jpg
113.jpg
114.jpg
115.jpg
116.jpg
117.jpg
118.jpg
119.jpg
11.jpg
120.jpg
121.jpg
122.jpg
123.jpg
124.jpg
125.jpg
126.jpg
127.jpg
128.jpg
129.jpg
12.jpg
13.jpg
14.jpg
15.jpg
16.jpg
17.jpg
18.jpg
19.jpg
1.jpg


In [5]:
import os

DATA_PATH = "/content/Dataset"
image_paths = []

for root, _, files in os.walk(DATA_PATH):
    for f in files:
        if f.lower().endswith(('.jpg', '.jpeg', '.png')):
            image_paths.append(os.path.join(root, f))

print(f"✅ Found {len(image_paths)} images in Dataset.")
print("🧠 Example paths:", image_paths[:5])


✅ Found 382 images in Dataset.
🧠 Example paths: ['/content/Dataset/Dataset/data/44.jpg', '/content/Dataset/Dataset/data/53.jpg', '/content/Dataset/Dataset/data/113.jpg', '/content/Dataset/Dataset/data/43.jpg', '/content/Dataset/Dataset/data/12.jpg']


In [7]:
import easyocr
from tqdm import tqdm
import pandas as pd

reader = easyocr.Reader(['en'])
data = []

for img_path in tqdm(image_paths, desc="🔍 Extracting text from images"):
    result = reader.readtext(img_path, detail=0)
    text = " ".join(result)
    data.append({"image_path": img_path, "extracted_text": text})

df = pd.DataFrame(data)
df.to_csv("/content/ocr_results.csv", index=False)
print("✅ OCR complete → /content/ocr_results.csv")
df.head()


🔍 Extracting text from images: 100%|██████████| 382/382 [57:49<00:00,  9.08s/it]

✅ OCR complete → /content/ocr_results.csv





Unnamed: 0,image_path,extracted_text
0,/content/Dataset/Dataset/data/44.jpg,2 hs Z_e/I*n+k* f-#fZ4 [Lzze Ileela 0x0_ T+z1 ...
1,/content/Dataset/Dataset/data/53.jpg,Fortis OPD CONSULTATION Foitis dentr Fit AocRa...
2,/content/Dataset/Dataset/data/113.jpg,Name: Lla Address: 439 Sex; Date: Rx tol # B0 ...
3,/content/Dataset/Dataset/data/43.jpg,t[ Mnnn anuanr Feirlia titaolrntl Dincensina W...
4,/content/Dataset/Dataset/data/12.jpg,5n 2J Et€ Eosp 8u [eevu Wse ~oozz 1C8401 7' t+...


In [8]:
from transformers import pipeline
from tqdm import tqdm

# Pretrained text generation model from Hugging Face
note_generator = pipeline("text2text-generation", model="google/flan-t5-base")

generated_notes = []
for text in tqdm(df['extracted_text'], desc="🩺 Generating clinical notes"):
    if not text.strip():
        generated_notes.append("No text detected in image.")
    else:
        prompt = f"Generate a concise clinical note summarizing this patient information: {text}"
        result = note_generator(prompt, max_length=80, do_sample=True)
        generated_notes.append(result[0]['generated_text'])

df['generated_note'] = generated_notes
df.to_csv("/content/generated_clinical_notes.csv", index=False)
print("✅ Clinical notes generated → /content/generated_clinical_notes.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
🩺 Generating clinical notes:   0%|          | 0/382 [00:00<?, ?it/s]Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
🩺 Generating clinical notes:   0%|          | 1/382 [00:48<5:05:50, 48.16s/it]Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
🩺 Generating clinical notes:   1%|          | 2/382 [01:46<5:43:09, 54.18s/it]Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
🩺 Gener

✅ Clinical notes generated → /content/generated_clinical_notes.csv





In [9]:
from transformers import pipeline

icd_classifier = pipeline("text-classification", model="roberta-large-mnli")

predicted_labels = []
for note in tqdm(df['generated_note'], desc="🏷️ Predicting ICD-10 labels"):
    result = icd_classifier(note)
    predicted_labels.append(result[0]['label'])


df['predicted_icd10'] = predicted_labels
df.to_csv("/content/final_notes_with_icd10.csv", index=False)
print("✅ ICD-10 coding complete → /content/final_notes_with_icd10.csv")



config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
🏷️ Predicting ICD-10 labels: 100%|██████████| 382/382 [07:10<00:00,  1.13s/it]

✅ ICD-10 coding complete → /content/final_notes_with_icd10.csv





In [10]:
import random
sample = df.sample(min(3, len(df)))

for _, row in sample.iterrows():
    print(f"\n🩻 Image: {row['image_path']}")
    print(f"🧾 Extracted Text: {row['extracted_text']}")
    print(f"🧠 Generated Note: {row['generated_note']}")
    print(f"🏷️ ICD-10 Prediction: {row['predicted_icd10']}")



🩻 Image: /content/Dataset/Dataset/data/115.jpg
🧾 Extracted Text: M Manipal Hospital Manipal i*spiaeooy LIFC Dr. Shalina VoaYAc ELSIS A ^ JLO; *S (ENT) MACS (London) DOHNS (London) NT Consulni 66y' Hosc N; 1 588893 4 Ly AiPasomal AAPHo FePECNJ Ly [6 VIALs (eeyl 1 dnsk 30 0 Am Piloste RicN CONvENT(OWA X684 bs [i vac/doy] Iead & Neck Surgery Nursing; Diagnostics and Allied Areas lan Mulli-Superspecialily Hospilal , IS0 9001-2000  Certilied lor Cinical; Fax : 91 80 2526 6757 . Road, Bangalore 560 017 , India Phone 91 80 2502 4444, 2502 3344 Extn. 3278 f Manipal Health Systems Pvt: Ltd. www manipalhospital.com Ray somd sond
🧠 Generated Note: Iead & Neck Surgery Nursing; Diagnostics and Allied Areas Mulli-Superspecialily Hospilal , IS0 9001-2000 Certilied lor Cinical; Fax : 91 80 2526 6757 . Road, Bangalore 560 017 , India Phone : 91 80 2502 4444 , 2502 3344 Extn. 3278 f Manipal Health Systems Pvt. Ltd. www manipalhospital.com
🏷️ ICD-10 Prediction: NEUTRAL

🩻 Image: /content/Dataset/Dataset

In [11]:
final_path = "/content/results_M3.zip"
import shutil
shutil.make_archive("/content/results_M3", 'zip', "/content", "final_notes_with_icd10.csv")
print(f"📦 Final results zipped → {final_path}")


📦 Final results zipped → /content/results_M3.zip
