<a href="https://colab.research.google.com/github/KusalaniR/MedGen.AI/blob/main/notebooks/ocr_gemini_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Install OCR tools
# System OCR engine
!apt-get install -y tesseract-ocr poppler-utils

# Python OCR libraries
!pip install pytesseract pdf2image pillow pandas google-generativeai



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 1 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 1s (225 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 117528 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloadin

In [3]:
# -------------------------------
# Import libraries
# -------------------------------
import pytesseract                  # OCR engine
from pdf2image import convert_from_path  # Convert PDF pages to images
from PIL import Image
import pandas as pd                 # Data handling
import re                            # Regex for pattern matching
import os
import textwrap                     # Formatting fallback explanation

In [4]:
#upload pdf
from google.colab import files

uploaded = files.upload()

# Automatically get uploaded filename
pdf_file = list(uploaded.keys())[0]
print("Uploaded file:", pdf_file)


Saving report.pdf to report.pdf
Uploaded file: report.pdf


In [5]:
#convert pdf to images
pages = convert_from_path(pdf_file)

print(f"Total pages detected: {len(pages)}")


Total pages detected: 4


In [6]:
#OCR for each page
extracted_text = ""

for i, page in enumerate(pages):
    text = pytesseract.image_to_string(page)
    extracted_text += f"\n--- Page {i+1} ---\n{text}"

print(extracted_text)



--- Page 1 ---
  

MATARA

Sample: “ppg
Patient 1D: DMC 397

Date of Analysie: 9)9/2075 #59 AM
Wand: x

 
      
       
 
   

Ref, Ranges

     
 

4.00 - 10,08

  

Low.

      
  

346 10°) 2.06- 7.00
L8l row 0.80- 4.00
O43 leon

 

 

 

 

  

 

0.66 1A,
aa 0.02 19-9A.
0.00 1a
$435
4%
67%
103%
03° % 10-1,0
Or % 10 - 100.0
+ RBC 437 WONT 3.50 - 550
HGB 126 gid. 11.8 - 16.0
6 HET 390% 37.0- $4.0
MCV 92 30.0 - 100.6
if MCH 287 pe 270-340
19 MCHC 322 gid 32.0-36.0
20 RDW-C¥ 0.143 0.110- 0.160
21 RDW-SD 8 0 350-569
22 PLT 227 10°90 108 - 300
23 MPV 98 1 69-120
24 PDW 163 159-170
25 pet 221 mbt 98-282 jl
26 PALO & 1S. 30-90
1OOWB 0.00 - 9999
DIF v
LY
| nam 4
= r
f
#
Medical Laboratory Technologist
perated User
Delivered by Y Time of Printing: 9/9/2025 8:59AM.) 4

Order Time

 

--- Page 2 ---
  

 
 
   

IVIVITULLI og Li
ena
ern | ee

FF Uy 2
wledila , z
L LABORATORY REPORT  hecurance Partner Stolaboy
7 SQUALAB

4
co DENT hn a BIOLABS | Thenmo Scientific

G®o6m89 cOH ed SSA EIDENTI

In [7]:
#Extract Test Names + Values (Regex)
pattern = r"([A-Za-z\-\s]+)\s+([\d]+\.?\d*)"
matches = re.findall(pattern, extracted_text)

structured_df = pd.DataFrame(matches, columns=["raw_test_name", "value"])
structured_df


Unnamed: 0,raw_test_name,value
0,\n--- Page,1
1,ppg\nPatient,1
2,DMC,397
3,Ranges\n\n \n \n,4.00
4,-,10
...,...,...
67,-,85
68,-,160
69,\nVLDL,208
70,HDL,2.99


In [8]:
#To fix OCR numeric scaling
def normalize_value(test_name, value):
    if test_name in ["MCH", "MCHC", "RBC"] and value > 100:
        return value / 10
    return value
    # value = normalize_value(test, value)
    status = classify_status(test, value)
    knowledge = retrieve_knowledge(test)



In [9]:
#Clean OCR noice (clean junk rows)
VALID_TESTS = {
    "hemoglobin": "Hemoglobin",
    "hb": "Hemoglobin",
    "wbc": "WBC",
    "white blood": "WBC",
    "rbc": "RBC",
    "platelet": "Platelet Count",
    "plt": "Platelet Count",
    "mcv": "MCV",
    "mch": "MCH",
    "mchc": "MCHC",
    "rdw": "RDW",
    "cholesterol": "Cholesterol",
    "glucose": "Glucose",
    "triglyceride": "Triglycerides",
    "beta-hcg": "Beta-HCG",
    "bhcg": "Beta-HCG"
}

def clean_test_name(raw):
    raw = raw.lower().strip()
    for key, name in VALID_TESTS.items():
        if key in raw:
            return name
    return None

structured_df["test_name"] = structured_df["raw_test_name"].apply(clean_test_name)
structured_df = structured_df[structured_df["test_name"].notnull()]
structured_df["value"] = structured_df["value"].astype(float)
structured_df = structured_df.reset_index(drop=True)

structured_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  structured_df["value"] = structured_df["value"].astype(float)


Unnamed: 0,raw_test_name,value,test_name
0,RBC,437.0,RBC
1,\nMCV,92.0,MCV
2,\nif MCH,287.0,MCH
3,MCHC,322.0,MCH
4,RDW-SD,8.0,RDW
5,PLT,227.0,Platelet Count
6,indiko Plus Fully Automated Chemistry Analyze...,194.0,Cholesterol


In [10]:
#Load Reference Ranges (RULE-BASED)
from google.colab import drive
drive.mount("/content/drive")

ranges_df = pd.read_csv(
    "/content/drive/MyDrive/MedGen.AI Datasets/FINALIZED DATASETS/blood_test_rules.csv"
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
def classify_status(test_name, value):
    row = ranges_df[ranges_df["test_name"] == test_name]

    if row.empty:
        return "Unknown"

    low = row.iloc[0]["low_threshold"]
    high = row.iloc[0]["high_threshold"]

    if value < low:
        return "Low"
    elif value > high:
        return "High"
    else:
        return "Normal"


In [12]:
# ---------------------------------------------
# XAI: Explain WHY a value is Low / Normal / High
# ---------------------------------------------
#Add XAI Explanation Function
def explain_why(test_name, value, status):
    """
    Rule-based explainable AI (XAI) logic.
    Explains WHY the test result is Low / Normal / High.
    """

    # Get reference range
    row = ranges_df[ranges_df["test_name"] == test_name]

    if row.empty:
        return "Reference range is not available for this test."

    low = row.iloc[0]["low_threshold"]
    high = row.iloc[0]["high_threshold"]

    # Explanation based on status
    if status == "Low":
        return (
            f"The normal range for {test_name} is {low} to {high}. "
            f"Your value ({value}) is below the lower limit, which is why it is classified as Low."
        )

    elif status == "High":
        return (
            f"The normal range for {test_name} is {low} to {high}. "
            f"Your value ({value}) is above the upper limit, which is why it is classified as High."
        )

    elif status == "Normal":
        return (
            f"The normal range for {test_name} is {low} to {high}. "
            f"Your value ({value}) falls within this range, so it is considered Normal."
        )

    else:
        return "Status could not be determined."


In [13]:
#Load medical knowledge
knowledge_df = pd.read_csv(
    "/content/drive/MyDrive/MedGen.AI Datasets/FINALIZED DATASETS/blood_test_knowledge.csv",
    encoding="latin1"
)

def retrieve_knowledge(test_name):
    row = knowledge_df[knowledge_df["test_name"] == test_name]

    if row.empty:
        return "No explanation available."

    return row.iloc[0]["simple_explanation_en"]



In [14]:
print("Ranges columns:", ranges_df.columns.tolist())
print("Knowledge columns:", knowledge_df.columns.tolist())


Ranges columns: ['test_name', 'gender', 'low_threshold', 'high_threshold', 'unit']
Knowledge columns: ['test_name', 'normal_range', 'unit', 'low_meaning', 'high_meaning', 'simple_explanation_en']


In [15]:
#Configure Gemini + Fallback
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted
from google.colab import userdata

genai.configure(api_key=userdata.get("GEMINI_API_KEY"))
model = genai.GenerativeModel("models/gemini-flash-lite-latest")


In [16]:
# ----------------------------------
# Language selection (Bilingual)
# ----------------------------------
# "en" = English
# "si" = Sinhala
user_language = "si"


In [17]:
def translate_to_sinhala(text):
    """
    Translates English explanation to simple Sinhala using Gemini.
    """

    prompt = f"""
    Translate the following medical explanation into SIMPLE Sinhala.
    Use easy words.
    Do NOT add diagnosis.
    Do NOT suggest medicines.

    Text:
    {text}
    """

    try:
        response = model.generate_content(prompt)
        return response.text
    except:
        return text  # fallback to English if error


In [18]:
def fallback_explanation(test, value, status, knowledge):
    return textwrap.dedent(f"""
    Test: {test}
    Value: {value}
    Status: {status}

    Explanation:
    {knowledge}

    Note: This is educational only.
    """)


In [19]:
def gemini_explanation(test, value, status, knowledge):
    prompt = f"""
    Explain this blood test in very simple language.

    Test: {test}
    Value: {value}
    Status: {status}

    Knowledge:
    {knowledge}

    Rules:
    - No diagnosis
    - No medicine
    - Simple words
    """
    return model.generate_content(prompt).text


In [20]:
def generate_explanation(test, value, status, knowledge):
    try:
        return gemini_explanation(test, value, status, knowledge)
    except:
        return fallback_explanation(test, value, status, knowledge)




In [21]:
structured_df


Unnamed: 0,raw_test_name,value,test_name
0,RBC,437.0,RBC
1,\nMCV,92.0,MCV
2,\nif MCH,287.0,MCH
3,MCHC,322.0,MCH
4,RDW-SD,8.0,RDW
5,PLT,227.0,Platelet Count
6,indiko Plus Fully Automated Chemistry Analyze...,194.0,Cholesterol


In [22]:
# #FINAL PIPELINE (THIS IS THE RESULT)
# final_results = []

# for _, row in structured_df.iterrows():
#     test = row["test_name"]
#     value = row["value"]
#     status = classify_status(test, value)
#     knowledge = retrieve_knowledge(test)

#     explanation = generate_explanation(test, value, status, knowledge)

#     final_results.append({
#         "test_name": test,
#         "value": value,
#         "status": status,
#         "explanation": explanation
#     })

# final_df = pd.DataFrame(final_results)
# final_df



In [23]:
# ----------------------------------
# FINAL PIPELINE (CORRECTED)
# ----------------------------------

final_results = []

for _, row in structured_df.iterrows():
    test = row["test_name"]
    value = row["value"]

    # Step 1: classify result
    status = classify_status(test, value)

    # Step 2: get base medical knowledge
    knowledge = retrieve_knowledge(test)

    # Step 3: generate explanation in English
    english_explanation = generate_explanation(
        test,
        value,
        status,
        knowledge
    )

    # Step 4: translate if Sinhala selected
    if user_language == "si":
        final_explanation = translate_to_sinhala(english_explanation)
    else:
        final_explanation = english_explanation

    # Step 5: store final result
    final_results.append({
        "test_name": test,
        "value": value,
        "status": status,
        "language": user_language,
        "explanation": final_explanation
    })

#     # Explain WHY the result is Low / Normal / High
# xai_reason = explain_why(test, value, status)

# final_results.append({
#     "test_name": test,
#     "value": value,
#     "status": status,
#     "language": user_language,
#     "why_status": xai_reason,          # <-- XAI explanation
#     "explanation": final_explanation         # <-- Gemini / fallback explanation
# })


# Convert to DataFrame
final_df = pd.DataFrame(final_results)

final_df




Unnamed: 0,test_name,value,status,language,explanation
0,RBC,437.0,Unknown,si,මෙන්න ඔබේ ලේ පරීක්ෂණයේ ප්‍රතිඵලය පිළිබඳ ඉතා සර...
1,MCV,92.0,Normal,si,**රතු රුධිරාණු** කියන්නේ පොඩි ට්‍රක් රථ වගේ. ම...
2,MCH,287.0,High,si,මෙන්න ඔබේ රුධිර පරීක්ෂණයේ ප්‍රතිඵල සරලව පැහැදි...
3,MCH,322.0,High,si,මෙම රුධිර පරීක්ෂණය මගින් ඔබේ **රතු රුධිරාණු** ...
4,RDW,8.0,Low,si,මෙන්න ඔබේ රුධිර පරීක්ෂණයේ ප්‍රතිඵල සරලව පැහැදි...
5,Platelet Count,227.0,Normal,si,This blood test is checking on your **platelet...
6,Cholesterol,194.0,Normal,si,\nTest: Cholesterol\nValue: 194.0\nStatus: Nor...


In [24]:
final_df.to_csv(
    "/content/drive/MyDrive/MedGen.AI Datasets/FINALIZED DATASETS/OCR/extracted_report_results.csv",
    index=False
)


In [25]:
# ---------------------------------------------
# Extract patient location from OCR text
# ---------------------------------------------
def extract_location_from_ocr(ocr_text):
    """
    Extracts city and hospital name from OCR text
    Rule-based (Explainable & FYP-safe)
    """

    # Convert text to uppercase for matching
    text_upper = ocr_text.upper()

    # -----------------------------
    # Known Sri Lankan cities
    # -----------------------------
    KNOWN_CITIES = [
        # Southern Province
    "MATARA",
    "GALLE",
    "HAMBANTOTA",
    "TANGALLE",

    # Western Province
    "COLOMBO",
    "NEGOMBO",
    "KALUTARA",
    "GAMPAHA",

    # Central Province
    "KANDY",
    "NUWARA ELIYA",
    "MATALE",

    # Northern Province
    "JAFFNA",
    "VAVUNIYA",
    "KILINOCHCHI",
    "MANNAR",

    # Eastern Province
    "TRINCOMALEE",
    "BATTICALOA",
    "AMPARA",

    # North Western Province
    "KURUNEGALA",
    "PUTTALAM",

    # North Central Province
    "ANURADHAPURA",
    "POLONNARUWA",

    # Uva Province
    "BADULLA",
    "MONARAGALA",

    # Sabaragamuwa Province
    "RATNAPURA",
    "KEGALLE"
    ]

    # -----------------------------
    # Hospital keywords
    # -----------------------------
    HOSPITAL_KEYWORDS = [
        "HOSPITAL",
        "LABORATORY",
        "MEDICAL",
        "LAB",
        "CLINIC"
    ]

    detected_city = None
    detected_hospital = None

    # -----------------------------
    # Detect city
    # -----------------------------
    for city in KNOWN_CITIES:
        if city in text_upper:
            detected_city = city.title()
            break

    # -----------------------------
    # Detect hospital name
    # -----------------------------
    for line in ocr_text.split("\n"):
        if any(keyword in line.upper() for keyword in HOSPITAL_KEYWORDS):
            detected_hospital = line.strip()
            break

    return {
        "city": detected_city,
        "hospital": detected_hospital
    }


In [26]:
# Extract location metadata
location_info = extract_location_from_ocr(extracted_text)

location_info

{'city': 'Matara', 'hospital': 'Medical Laboratory Technologist'}

In [27]:
import json

# Save patient metadata
metadata_path = "/content/drive/MyDrive/MedGen.AI Datasets/FINALIZED DATASETS/OCR/patient_metadata.json"

with open(metadata_path, "w") as f:
    json.dump(location_info, f, indent=4)

print("Patient location metadata saved.")


Patient location metadata saved.


In [28]:
# ---------------------------------------------
# SAVE OCR FULL TEXT (IMPORTANT FOR MODULE REUSE)
# ---------------------------------------------
from google.colab import drive
drive.mount("/content/drive")

ocr_text_path = (
    "/content/drive/MyDrive/MedGen.AI Datasets/"
    "FINALIZED DATASETS/OCR/ocr_full_text.txt"
)

with open(ocr_text_path, "w", encoding="utf-8") as f:
    f.write(extracted_text)

print("OCR full text saved successfully.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
OCR full text saved successfully.
