In [11]:
import re
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pytesseract
from typing import List, Dict, Any, Optional, Tuple
from IPython.display import Markdown, display
import pandas as pd
from dataclasses import dataclass

@dataclass
class LabTest:
    test_name: str
    value: str
    unit: Optional[str] = None
    bio_reference_range: Optional[str] = None
    lab_test_out_of_range: Optional[bool] = None
    
    def to_dict(self):
        return {
            'test_name': self.test_name,
            'value': self.value,
            'unit': self.unit,
            'bio_reference_range': self.bio_reference_range,
            'lab_test_out_of_range': self.lab_test_out_of_range
        }

def preprocess_image(image_path, display_steps=False):
    """
    Preprocess image to improve OCR accuracy
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, None, h=10, 
                                       templateWindowSize=7, 
                                       searchWindowSize=21)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    _, thresh = cv2.threshold(enhanced, 0, 255, 
                             cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    if display_steps:
        plt.figure(figsize=(15, 5))
        steps = [gray, denoised, enhanced, thresh]
        titles = ['Grayscale', 'Denoised', 'Enhanced', 'Thresholded']
        for i, (step, title) in enumerate(zip(steps, titles)):
            plt.subplot(1, 4, i+1)
            plt.imshow(step, cmap='gray')
            plt.title(title)
            plt.axis('off')
        plt.show()
    return thresh

def extract_text(image_path, config='--psm 6 --oem 3'):
    """
    Extract text from an image using OCR
    """
    preprocessed = preprocess_image(image_path)
    text = pytesseract.image_to_string(preprocessed, config=config)
    return text.strip()

def parse_reference_range(range_text: str) -> str:
    if not range_text:
        return None
    range_text = range_text.strip()
    return range_text

def is_value_out_of_range(value: str, range_text: str) -> bool:
    if not range_text:
        return False
    try:
        numeric_value = re.search(r'([\d.]+)', value)
        if not numeric_value:
            return False
        numeric_value = float(numeric_value.group(1))
        range_match = re.search(r'([\d.]+)[\s-]+to[\s-]+([\d.]+)|(\d+\.?\d*)[\s-]-[\s-](\d+\.?\d*)', range_text)
        if range_match:
            if range_match.group(1) and range_match.group(2):
                lower = float(range_match.group(1))
                upper = float(range_match.group(2))
            else:
                lower = float(range_match.group(3))
                upper = float(range_match.group(4))
            return numeric_value < lower or numeric_value > upper
        less_than_match = re.search(r'<\s*([\d.]+)', range_text)
        if less_than_match:
            upper = float(less_than_match.group(1))
            return numeric_value >= upper
        greater_than_match = re.search(r'>\s*([\d.]+)', range_text)
        if greater_than_match:
            lower = float(greater_than_match.group(1))
            return numeric_value <= lower
        return False
    except Exception:
        return False

def extract_unit(value_text: str) -> Tuple[str, Optional[str]]:
    unit_patterns = [
        r'(\d+\.?\d*)\s*(\w+/%|/cumm|/mm3|mill/cumm|mill/mm3|g/dL|gm/dL|gm%|mg/L|mg/dL|pg|pgm|fL|%|U/L|mIU/L)',
        r'(\d+\.?\d*)\s*\[?[HL]?\]?\s*(\w+/%|/cumm|/mm3|mill/cumm|mill/mm3|g/dL|gm/dL|gm%|mg/L|mg/dL|pg|pgm|fL|%|U/L|mIU/L)'
    ]
    for pattern in unit_patterns:
        match = re.search(pattern, value_text)
        if match:
            return match.group(1), match.group(2)
    value_only = re.search(r'(\d+\.?\d*)\s*\[?[HL]?\]?', value_text)
    if value_only:
        return value_only.group(0), None
    return value_text, None

def parse_lab_report(text: str) -> List[LabTest]:
    lab_tests = []
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    for i, line in enumerate(lines):
        if any(x in line.lower() for x in ['report', 'end of', 'laboratory', 'patient', 'doctor', 'sample']):
            continue
        match = re.search(r'^([\w\s\(\)-]+):\s*([\d\.\s]+(?:\[?[HL]\]?)?\s*(?:[\w/%]+)?)', line)
        if not match and ':' in line:
            parts = line.split(':')
            if len(parts) >= 2:
                test_name = parts[0].strip()
                value_unit = parts[1].strip()
                ref_range = None
                if i + 1 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*|<\s*\d+\.?\d*|>\s*\d+\.?\d*', lines[i+1]):
                    ref_range = lines[i+1].strip()
                elif re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*|<\s*\d+\.?\d*|>\s*\d+\.?\d*', value_unit):
                    range_match = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*|<\s*\d+\.?\d*|>\s*\d+\.?\d*)', value_unit)
                    if range_match:
                        ref_range = range_match.group(0)
                        value_unit = value_unit.replace(ref_range, '').strip()
                value, unit = extract_unit(value_unit)
                if test_name and value:
                    lab_test = LabTest(
                        test_name=test_name,
                        value=value,
                        unit=unit,
                        bio_reference_range=ref_range,
                        lab_test_out_of_range=is_value_out_of_range(value, ref_range) if ref_range else None
                    )
                    lab_tests.append(lab_test)
                continue
        pattern = r'([\w\s\(\)-]+)\s+([\d\.]+\s*\[?[HL]?\]?)\s*([\w/%]+)?\s*([\d\.-]+\s*[\w/%]+)?'
        match = re.search(pattern, line)
        if match:
            test_name = match.group(1).strip()
            value = match.group(2).strip()
            unit = match.group(3) if match.group(3) else None
            ref_range = match.group(4) if match.group(4) else None
            if not ref_range and i + 1 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*', lines[i+1]):
                ref_range = lines[i+1].strip()
            lab_test = LabTest(
                test_name=test_name,
                value=value,
                unit=unit,
                bio_reference_range=ref_range,
                lab_test_out_of_range=is_value_out_of_range(value, ref_range) if ref_range else None
            )
            lab_tests.append(lab_test)
            continue
        if i + 1 < len(lines):
            if re.search(r'[\d\.]+', lines[i+1]) and not re.search(r'[\d\.]+', line):
                test_name = line.strip()
                next_line = lines[i+1].strip()
                value_match = re.search(r'([\d\.]+\s*\[?[HL]?\]?)\s*([\w/%]+)?', next_line)
                if value_match:
                    value = value_match.group(1).strip()
                    unit = value_match.group(2) if value_match.group(2) else None
                    ref_range = None
                    range_match = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*)', next_line)
                    if range_match:
                        ref_range = range_match.group(0)
                    elif i + 2 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*', lines[i+2]):
                        ref_range = lines[i+2].strip()
                    lab_test = LabTest(
                        test_name=test_name,
                        value=value,
                        unit=unit,
                        bio_reference_range=ref_range,
                        lab_test_out_of_range=is_value_out_of_range(value, ref_range) if ref_range else None
                    )
                    lab_tests.append(lab_test)
    process_cbc_format(lines, lab_tests)
    return lab_tests

def process_cbc_format(lines: List[str], lab_tests: List[LabTest]):
    cbc_test_patterns = {
        r'h[ae]moglobin': 'Hemoglobin',
        r'rbc\s*count': 'RBC Count',
        r'wbc\s*count': 'WBC Count',
        r'platelet\s*count': 'Platelet Count',
        r'packed\s*cell\s*volume|hct|pcv': 'PCV',
        r'mcv': 'MCV',
        r'mch': 'MCH',
        r'mchc': 'MCHC',
        r'rdw': 'RDW',
        r'neutrophils': 'Neutrophils',
        r'lymphocytes': 'Lymphocytes',
        r'monocytes': 'Monocytes',
        r'eosinophils': 'Eosinophils',
        r'basophils': 'Basophils',
        r'c-reactive\s*protein|crp': 'CRP'
    }
    for i, line in enumerate(lines):
        for pattern, std_name in cbc_test_patterns.items():
            if re.search(pattern, line.lower()):
                value_match = re.search(r'([\d\.]+\s*\[?[HL]?\]?)', line)
                if value_match:
                    value = value_match.group(1).strip()
                    unit_match = re.search(r'([\d\.]+\s*\[?[HL]?\]?)\s*([\w/%]+)', line)
                    unit = unit_match.group(2) if unit_match else None
                    ref_range = None
                    range_match = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*)', line)
                    if range_match:
                        ref_range = range_match.group(1)
                    elif i + 1 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*', lines[i+1]):
                        ref_range = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*)', lines[i+1]).group(1)
                    lab_test = LabTest(
                        test_name=std_name,
                        value=value,
                        unit=unit,
                        bio_reference_range=ref_range,
                        lab_test_out_of_range=is_value_out_of_range(value, ref_range) if ref_range else None
                    )
                    if not any(test.test_name == std_name for test in lab_tests):
                        lab_tests.append(lab_test)

def process_image(image_path, display_preprocessing=False, display_extracted_text=True):
    """
    Process a single lab report image, extract text and parse lab tests
    """
    extracted_text = extract_text(image_path)
    if display_extracted_text:
        display(Markdown(f"*Extracted Text from {os.path.basename(image_path)}:*\n\n\n{extracted_text}\n"))
    lab_tests = parse_lab_report(extracted_text)
    df = pd.DataFrame([test.to_dict() for test in lab_tests])
    return df

def process_directory(directory_path, file_extension='.png'):
    """
    Process all images in a directory
    """
    all_results = []
    image_files = [os.path.join(directory_path, f) 
                   for f in os.listdir(directory_path) 
                   if f.endswith(file_extension)]
    for image_path in image_files:
        print(f"Processing {os.path.basename(image_path)}...")
        df = process_image(image_path, display_preprocessing=False, display_extracted_text=False)
        df['source_file'] = os.path.basename(image_path)
        all_results.append(df)
    if all_results:
        return pd.concat(all_results, ignore_index=True)
    else:
        return pd.DataFrame()

def create_api_json_response(lab_tests):
    """
    Create API JSON response format as required
    """
    return {
        "is_success": True,
        "lab_tests": [test.to_dict() for test in lab_tests]
    }

def run_example(image_path):
    print(f"Processing {os.path.basename(image_path)}...")
    df = process_image(image_path, display_preprocessing=True)
    display(df)
    lab_tests = [LabTest(**row) for _, row in df.iterrows()]
    api_response = create_api_json_response(lab_tests)
    display(Markdown("*API Response (JSON):*"))
    display(api_response)
    return df, api_response

sample_img = r"C:\Users\kusha\Downloads\bajaj 123\lab_reports_samples\lbmaske\BLR-0425-PA-0042585_F-RARI_S_MANIKANDAN_1_250427_1609@G.pdf_page_37.png"
results_df, api_json = run_example(sample_img)


Processing BLR-0425-PA-0042585_F-RARI_S_MANIKANDAN_1_250427_1609@G.pdf_page_37.png...


*Extracted Text from BLR-0425-PA-0042585_F-RARI_S_MANIKANDAN_1_250427_1609@G.pdf_page_37.png:*


CS
ie Tt EN
SIS
HITT ~
£7 <\N
rarer yl
KIMS HEALTH LABORATORY MEDICINE
Patient i | ee
|| = | eC OC
ee ee es ee ee
“Sample Type Test a es
RDW (sheath flow DC 13.7 % 11.60-13.70 25-Apr-2025 09:48 PM
detection)
-Platelet Count (sheath flow 299 150-400 26-Apr-2025 03:33 PM
DC detection / thousand/cu.mm
Optical)
MPV (sheath flow DC 9.2 fL 7.80-11 25-Apr-2025 09:48 PM
detection)
Serum Sample Collected at :25-Apr-2025 09:11 PM Received at: 25-Apr-2025 09:29 PM
*** Glucose Random (Hexokinase 99 mg/dL Normal range 70 25-Apr-2025 10:01 PM
method) -140mg/dl
Diabetes mellitus : >
200mg/dl
Beta HCG (ECLIA) <0.2 mlU/ml Reference Ranges: 26-Apr-2025 03:29 PM
Non pregnant :0.1-
5.7 mlU/ml
Pregnant :
1st week 210-30
mlU/mll
2nd week =: 30-
100 mlU/ml
3rd week : 100-
1000 mlU/ml
4th week : 1000-
10,000 mlU/ml
2-3 months :
30,000-1,00,000
mlU/ml
2nd trimester :
10,000-30,000
mlU/ml
3rd trimester :
5000-15,000 mlU/ml
CRP (Immunoturbidimetric 14.5 mg/L 0-5 25-Apr-2025 10:05 PM
method)
This test helps diagnose infections or diseases that cause inflammation, cardiovascular disease, inflammatory
bowel disease, pelvic inflammatory disease, arthritis and lupus. The C-Reactive Protein is a protein produced
by the liver. When levels of CRP are higher than normal, this indicates infection or inflammation in the body.
Although this test will tell you if an infection is present, it does not specify the source of the infection
ma
All Investigations have their limitations which are imposed by sensitivity and specificity of individual procedures as well as the quality of specimen
received by the lab. Investigations are carried out in fully automated equipment from global leaders. Daily internal quality checks are carried out which are
turher vacated by <tr I


Unnamed: 0,test_name,value,unit,bio_reference_range,lab_test_out_of_range
0,HITT ~,7,,,
1,RDW (sheath flow DC 13.7 % 11.60-13.70 25-Apr-...,48,,,
2,detection),299,150,150-400,False
3,-Platelet Count (sheath flow 299 150-400 26-Ap...,03,,,
4,Optical),9.2,fL,7.80-11,False
5,MPV (sheath flow DC 9.2 fL 7.80-11 25-Apr-2025 09,48,,,
6,detection),25,,,
7,*** Glucose Random (Hexokinase 99 mg/dL Normal...,01,,,
8,Diabetes mellitus : >,200,mg/dl,,
9,Beta HCG (ECLIA) <0.2 mlU/ml Reference Ranges,26,,,


*API Response (JSON):*

{'is_success': True,
 'lab_tests': [{'test_name': 'HITT ~',
   'value': '7',
   'unit': None,
   'bio_reference_range': None,
   'lab_test_out_of_range': None},
  {'test_name': 'RDW (sheath flow DC 13.7 % 11.60-13.70 25-Apr-2025 09',
   'value': '48 ',
   'unit': None,
   'bio_reference_range': None,
   'lab_test_out_of_range': None},
  {'test_name': 'detection)',
   'value': '299',
   'unit': '150',
   'bio_reference_range': '150-400',
   'lab_test_out_of_range': False},
  {'test_name': '-Platelet Count (sheath flow 299 150-400 26-Apr-2025',
   'value': '03',
   'unit': None,
   'bio_reference_range': None,
   'lab_test_out_of_range': None},
  {'test_name': 'Optical)',
   'value': '9.2',
   'unit': 'fL',
   'bio_reference_range': '7.80-11',
   'lab_test_out_of_range': False},
  {'test_name': 'MPV (sheath flow DC 9.2 fL 7.80-11 25-Apr-2025 09',
   'value': '48 ',
   'unit': None,
   'bio_reference_range': None,
   'lab_test_out_of_range': None},
  {'test_name': 'detection)',
   'value'

In [2]:
pip install pandas


Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting tzdata>=2022.7 (from pandas)
  Obtaining dependency information for tzdata>=2022.7 from https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl.metadata
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.2.3 tzdata-2025.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
