In [18]:
import re
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pytesseract
from typing import List, Dict, Any, Optional, Tuple
from IPython.display import Markdown, display
import pandas as pd
from dataclasses import dataclass

# Define a data class for lab tests
@dataclass
class LabTest:
    test_name: str
    value: str
    unit: Optional[str] = None
    bio_reference_range: Optional[str] = None
    lab_test_out_of_range: Optional[bool] = None
    
    def to_dict(self):
        return {
            'test_name': self.test_name,
            'value': self.value,
            'unit': self.unit,
            'bio_reference_range': self.bio_reference_range,
            'lab_test_out_of_range': self.lab_test_out_of_range
        }

def preprocess_image(image_path, display_steps=False):
    """
    Preprocess image to improve OCR accuracy
    
    Args:
        image_path: Path to the image file
        display_steps: Whether to display preprocessing steps
        
    Returns:
        Preprocessed image
    """
    # Read image
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image {image_path}")
    
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Noise reduction
    denoised = cv2.fastNlMeansDenoising(gray, None, h=10, 
                                       templateWindowSize=7, 
                                       searchWindowSize=21)
    # Contrast enhancement
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    
    # Thresholding
    _, thresh = cv2.threshold(enhanced, 0, 255, 
                             cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    if display_steps:
        plt.figure(figsize=(15, 5))
        steps = [gray, denoised, enhanced, thresh]
        titles = ['Grayscale', 'Denoised', 'Enhanced', 'Thresholded']
        
        for i, (step, title) in enumerate(zip(steps, titles)):
            plt.subplot(1, 4, i+1)
            plt.imshow(step, cmap='gray')
            plt.title(title)
            plt.axis('off')
        plt.show()
    
    return thresh

def extract_text(image_path, config='--psm 6 --oem 3'):
    """
    Extract text from an image using OCR
    
    Args:
        image_path: Path to the image file
        config: Tesseract configuration string
        
    Returns:
        Extracted text
    """
    preprocessed = preprocess_image(image_path)
    text = pytesseract.image_to_string(preprocessed, config=config)
    return text.strip()

def parse_reference_range(range_text: str) -> str:
    """Parse and standardize reference range text"""
    if not range_text:
        return None
    
    # Clean the range text
    range_text = range_text.strip()
    
    # Return the cleaned range
    return range_text

def is_value_out_of_range(value: str, range_text: str) -> bool:
    """Determine if the test value is outside the reference range"""
    if not range_text:
        return False
    
    try:
        # Extract numeric value, handling cases with [H] or [L] indicators
        numeric_value = re.search(r'([\d.]+)', value)
        if not numeric_value:
            return False
        
        numeric_value = float(numeric_value.group(1))
        
        # Common range patterns
        # Pattern 1: "0-5" or "13.0-17.0"
        range_match = re.search(r'([\d.]+)[\s-]+to[\s-]+([\d.]+)|(\d+\.?\d*)[\s-]*-[\s-]*(\d+\.?\d*)', range_text)
        if range_match:
            if range_match.group(1) and range_match.group(2):  # "x to y" format
                lower = float(range_match.group(1))
                upper = float(range_match.group(2))
            else:  # "x-y" format
                lower = float(range_match.group(3))
                upper = float(range_match.group(4))
            
            return numeric_value < lower or numeric_value > upper
            
        # Pattern 2: "< 5" or "> 10"
        less_than_match = re.search(r'<\s*([\d.]+)', range_text)
        if less_than_match:
            upper = float(less_than_match.group(1))
            return numeric_value >= upper
            
        greater_than_match = re.search(r'>\s*([\d.]+)', range_text)
        if greater_than_match:
            lower = float(greater_than_match.group(1))
            return numeric_value <= lower
        
        return False
    except Exception:
        # If any parsing fails, default to False
        return False

def extract_unit(value_text: str) -> Tuple[str, Optional[str]]:
    """Extract value and unit from combined text"""
    # Common patterns for units
    unit_patterns = [
        r'(\d+\.?\d*)\s*(\w+/%|/cumm|/mm3|mill/cumm|mill/mm3|g/dL|gm/dL|gm%|mg/L|mg/dL|pg|pgm|fL|%|U/L|mIU/L)',
        r'(\d+\.?\d*)\s*\[?[HL]?\]?\s*(\w+/%|/cumm|/mm3|mill/cumm|mill/mm3|g/dL|gm/dL|gm%|mg/L|mg/dL|pg|pgm|fL|%|U/L|mIU/L)'
    ]
    
    for pattern in unit_patterns:
        match = re.search(pattern, value_text)
        if match:
            return match.group(1), match.group(2)
    
    # If no unit found, return the value as is
    value_only = re.search(r'(\d+\.?\d*)\s*\[?[HL]?\]?', value_text)
    if value_only:
        return value_only.group(0), None
    
    return value_text, None

def parse_lab_report(text: str) -> List[LabTest]:
    """Parse the extracted text to identify lab tests, values, and reference ranges"""
    lab_tests = []
    
    # Split text into lines and clean them
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    
    # Pattern matching for different report formats
    for i, line in enumerate(lines):
        # Skip header and footer lines
        if any(x in line.lower() for x in ['report', 'end of', 'laboratory', 'patient', 'doctor', 'sample']):
            continue
            
        # Look for lines with test names and values using different patterns
        
        # Pattern 1: Test name followed by value on same line (with possible unit)
        # Example: "Hemoglobin: 11.6 gm/dl"
        match = re.search(r'^([\w\s\(\)-]+):\s*([\d\.\s]+(?:\[?[HL]\]?)?\s*(?:[\w/%]+)?)', line)
        if not match and ':' in line:
            parts = line.split(':')
            if len(parts) >= 2:
                test_name = parts[0].strip()
                value_unit = parts[1].strip()
                
                # Look for reference range in the next line or same line
                ref_range = None
                if i + 1 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*|<\s*\d+\.?\d*|>\s*\d+\.?\d*', lines[i+1]):
                    ref_range = lines[i+1].strip()
                elif re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*|<\s*\d+\.?\d*|>\s*\d+\.?\d*', value_unit):
                    # Reference range might be in the same line
                    range_match = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*|<\s*\d+\.?\d*|>\s*\d+\.?\d*)', value_unit)
                    if range_match:
                        ref_range = range_match.group(0)
                        value_unit = value_unit.replace(ref_range, '').strip()
                
                value, unit = extract_unit(value_unit)
                
                if test_name and value and ref_range:  # Only add if reference range exists
                    lab_test = LabTest(
                        test_name=test_name,
                        value=value,
                        unit=unit,
                        bio_reference_range=ref_range,
                        lab_test_out_of_range=is_value_out_of_range(value, ref_range)
                    )
                    lab_tests.append(lab_test)
                continue
        
        # Pattern 2: Test name and value separated by whitespace with optional reference range
        # Example: "Hemoglobin 11.6 gm/dl 13.0-17.0"
        pattern = r'([\w\s\(\)-]+)\s+([\d\.]+\s*\[?[HL]?\]?)\s*([\w/%]+)?\s*([\d\.-]+\s*[\w/%]+)?'
        match = re.search(pattern, line)
        if match:
            test_name = match.group(1).strip()
            value = match.group(2).strip()
            unit = match.group(3) if match.group(3) else None
            ref_range = match.group(4) if match.group(4) else None
            
            # Look for reference range in the next line if not found
            if not ref_range and i + 1 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*', lines[i+1]):
                ref_range = lines[i+1].strip()
            
            if ref_range:  # Only add if reference range exists
                lab_test = LabTest(
                    test_name=test_name,
                    value=value,
                    unit=unit,
                    bio_reference_range=ref_range,
                    lab_test_out_of_range=is_value_out_of_range(value, ref_range)
                )
                lab_tests.append(lab_test)
            continue
        
        # Pattern 3: Test name on one line, value and reference range on next
        if i + 1 < len(lines):
            # Check if current line might be a test name and next line has numeric values
            if re.search(r'[\d\.]+', lines[i+1]) and not re.search(r'[\d\.]+', line):
                test_name = line.strip()
                next_line = lines[i+1].strip()
                
                # Extract value, unit and reference range from the next line
                value_match = re.search(r'([\d\.]+\s*\[?[HL]?\]?)\s*([\w/%]+)?', next_line)
                if value_match:
                    value = value_match.group(1).strip()
                    unit = value_match.group(2) if value_match.group(2) else None
                    
                    # Look for reference range in the same line or next line
                    ref_range = None
                    range_match = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*)', next_line)
                    if range_match:
                        ref_range = range_match.group(0)
                    elif i + 2 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*', lines[i+2]):
                        ref_range = lines[i+2].strip()
                    
                    if ref_range:  # Only add if reference range exists
                        lab_test = LabTest(
                            test_name=test_name,
                            value=value,
                            unit=unit,
                            bio_reference_range=ref_range,
                            lab_test_out_of_range=is_value_out_of_range(value, ref_range)
                        )
                        lab_tests.append(lab_test)
    
    # Process multi-column format common in CBC reports
    process_cbc_format(lines, lab_tests)
    
    # Filter out tests without reference ranges
    lab_tests = [test for test in lab_tests if test.bio_reference_range is not None]
    
    return lab_tests

def process_cbc_format(lines: List[str], lab_tests: List[LabTest]):
    """Process CBC format which typically has test name and values in a tabular format"""
    # Map common CBC test patterns to standardized names
    cbc_test_patterns = {
        r'h[ae]moglobin': 'Hemoglobin',
        r'rbc\s*count': 'RBC Count',
        r'wbc\s*count': 'WBC Count',
        r'platelet\s*count': 'Platelet Count',
        r'packed\s*cell\s*volume|hct|pcv': 'PCV',
        r'mcv': 'MCV',
        r'mch': 'MCH',
        r'mchc': 'MCHC',
        r'rdw': 'RDW',
        r'neutrophils': 'Neutrophils',
        r'lymphocytes': 'Lymphocytes',
        r'monocytes': 'Monocytes',
        r'eosinophils': 'Eosinophils',
        r'basophils': 'Basophils',
        r'c-reactive\s*protein|crp': 'CRP'
    }
    
    for i, line in enumerate(lines):
        for pattern, std_name in cbc_test_patterns.items():
            if re.search(pattern, line.lower()):
                # Check if the line contains a value
                value_match = re.search(r'([\d\.]+\s*\[?[HL]?\]?)', line)
                
                if value_match:
                    value = value_match.group(1).strip()
                    # Extract unit if present
                    unit_match = re.search(r'([\d\.]+\s*\[?[HL]?\]?)\s*([\w/%]+)', line)
                    unit = unit_match.group(2) if unit_match else None
                    
                    # Look for reference range
                    ref_range = None
                    range_match = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*)', line)
                    if range_match:
                        ref_range = range_match.group(1)
                    elif i + 1 < len(lines) and re.search(r'\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*', lines[i+1]):
                        ref_range = re.search(r'(\d+\.?\d*\s*-\s*\d+\.?\d*|\d+\.?\d*\s*to\s*\d+\.?\d*)', lines[i+1]).group(1)
                    
                    # Create lab test entry only if reference range exists
                    if ref_range:
                        lab_test = LabTest(
                            test_name=std_name,
                            value=value,
                            unit=unit,
                            bio_reference_range=ref_range,
                            lab_test_out_of_range=is_value_out_of_range(value, ref_range)
                        )
                        
                        # Check if this test is already in the list to avoid duplicates
                        if not any(test.test_name == std_name for test in lab_tests):
                            lab_tests.append(lab_test)

def process_image(image_path, display_preprocessing=False, display_extracted_text=True):
    """
    Process a single lab report image, extract text and parse lab tests
    
    Args:
        image_path: Path to the image file
        display_preprocessing: Whether to display preprocessing steps
        display_extracted_text: Whether to display extracted text
        
    Returns:
        DataFrame with extracted lab tests
    """
    # Extract text from image
    extracted_text = extract_text(image_path)
    
    # Display extracted text if requested
    if display_extracted_text:
        display(Markdown(f"**Extracted Text from {os.path.basename(image_path)}:**\n\n```\n{extracted_text}\n```"))
    
    # Parse lab tests from extracted text
    lab_tests = parse_lab_report(extracted_text)
    
    # Convert to DataFrame for easier analysis
    df = pd.DataFrame([test.to_dict() for test in lab_tests])
    
    return df

def process_directory(directory_path, file_extension='.png'):
    """
    Process all images in a directory
    
    Args:
        directory_path: Path to directory containing lab report images
        file_extension: File extension to filter images
        
    Returns:
        DataFrame with extracted lab tests from all images
    """
    all_results = []
    
    # Get all image files in directory
    image_files = [os.path.join(directory_path, f) 
                   for f in os.listdir(directory_path) 
                   if f.endswith(file_extension)]
    
    # Process each image
    for image_path in image_files:
        print(f"Processing {os.path.basename(image_path)}...")
        df = process_image(image_path, display_preprocessing=False, display_extracted_text=False)
        
        # Add filename column
        df['source_file'] = os.path.basename(image_path)
        
        all_results.append(df)
    
    # Combine results
    if all_results:
        return pd.concat(all_results, ignore_index=True)
    else:
        return pd.DataFrame()

def create_api_json_response(lab_tests):
    """
    Create API JSON response format as required
    
    Args:
        lab_tests: List of LabTest objects
        
    Returns:
        Dictionary in the required API response format
    """
    return {
        "is_success": True,
        "lab_tests": [test.to_dict() for test in lab_tests]
    }

# Example usage

# Process a single image
def run_example(image_path):
    print(f"Processing {os.path.basename(image_path)}...")
    df = process_image(image_path, display_preprocessing=True)
    
    # Display results
    display(df)
    
    # Show API-style JSON response
    lab_tests = [LabTest(**row) for _, row in df.iterrows()]
    api_response = create_api_json_response(lab_tests)
    display(Markdown("**API Response (JSON):**"))
    display(api_response)
    
    return df, api_response

# Demo code - uncomment and use as needed
sample_img = r"C:\Users\gdk14\Desktop\bajaj\lbmaske\GUR-0425-PA-0052918_Q-SONU104890001merged2_250427_1622@F.pdf_page_37.png"
results_df, api_json = run_example(sample_img)

Processing GUR-0425-PA-0052918_Q-SONU104890001merged2_250427_1622@F.pdf_page_37.png...


**Extracted Text from GUR-0425-PA-0052918_Q-SONU104890001merged2_250427_1622@F.pdf_page_37.png:**

```
ae Property ID Wo. 26, Ward Ne. 28, Mabavir Park,
yt Wy y Opp. Metro Pilar Ne. £81, Rohtak Raed,
DR mIIANJ AY (69) Behadurgerh, Distt. Jhajjar, + 124807
~anewaum HOSPITAL © Oe
CW he, :UaassooLzezeeTc209se3 NABH > me aiLcom
(SO 9081 : 2015 CERTIFIED @ www.dreanjaytespiial.com
None? Mr. som Age/Sex SS veasv7m
Ref.By : Or. Chanchal Verma OPD TPD : IPD
oct: Lab No 112
Kidney Functions Test
Investigation ___Result___Units__ Normal Range
Blood Urea 30 mg/dl 15-45
S.Creatinine 09 mg/dl 0.5-1.4
S. Uric Acid 6.0 mg/dl M-3.6-7,0
F -2.4-5.7
S. Proteins 79 gm/dl 6-8.4
S.Albumin 3.8 gmidl 3.35.5
Globulin 31 gmidl 1.5-3.0
Liver Function Test
Investigation Result Units Normal Range
S. Bilirubin: Total 07 mg/dl 0.2-1.2
Direct 0.2 mg/dl 0.0-0.25
Indirect 0.5 me/dl 0.0-0.75
SGOT 54 UA, 5-46
SGPT 30 U/L 5-49
Alkaline Phosphatase 298 IU/L 80-300
Up to 750 In children
S. Proteins 79 gnv/dl 6-8.4
S.Albumin 3.8 gm/dl 3.5-3.5
Globulin 3.1 gm/d) 1.5-3.0
BIOCHEMISTRY
Investigations Result Units Nermal Range
S. Sodium 137.1 meq/l 135-155
S. Potassium 3.62 meq/l 3.5-5.5
S. Calcium 8.23 me/dl 8.4-10.4
/ . S. MW.
7 ot eat ye. 326!) |
a
```

Unnamed: 0,test_name,value,unit,bio_reference_range,lab_test_out_of_range
0,Investigation ___Result___Units__ Normal Range,30,mg/dl,15-45,False
1,Blood Urea,30,mg/dl,15-45,False
2,Creatinine,09,mg/dl,0.5-1.4,True
3,Uric Acid,6.0,mg/dl,F -2.4-5.7,True
4,Proteins,79,gm/dl,6-8.4,True
5,Albumin,3.8,gmidl,3.35.5,False
6,Globulin 31 gmidl,1.5,,-3.0,False
7,Investigation Result Units Normal Range,.,Bilirubin,0.2-1.2,False
8,S. Bilirubin,07,,Direct 0.2 mg/dl 0.0-0.25,True
9,Direct,0.2,mg/dl,0.0-0.25,False


**API Response (JSON):**

{'is_success': True,
 'lab_tests': [{'test_name': 'Investigation ___Result___Units__ Normal Range',
   'value': '30',
   'unit': 'mg/dl',
   'bio_reference_range': '15-45',
   'lab_test_out_of_range': False},
  {'test_name': 'Blood Urea',
   'value': '30',
   'unit': 'mg/dl',
   'bio_reference_range': '15-45',
   'lab_test_out_of_range': False},
  {'test_name': 'Creatinine',
   'value': '09',
   'unit': 'mg/dl',
   'bio_reference_range': '0.5-1.4',
   'lab_test_out_of_range': True},
  {'test_name': 'Uric Acid',
   'value': '6.0',
   'unit': 'mg/dl',
   'bio_reference_range': 'F -2.4-5.7',
   'lab_test_out_of_range': True},
  {'test_name': 'Proteins',
   'value': '79',
   'unit': 'gm/dl',
   'bio_reference_range': '6-8.4',
   'lab_test_out_of_range': True},
  {'test_name': 'Albumin',
   'value': '3.8',
   'unit': 'gmidl',
   'bio_reference_range': '3.35.5',
   'lab_test_out_of_range': False},
  {'test_name': 'Globulin 31 gmidl',
   'value': '1.5',
   'unit': None,
   'bio_reference_rang