In [None]:
import os
import re
import base64
import json
from collections import OrderedDict
import time
from io import BytesIO
from typing import Dict, Any, Optional
from flask import Flask, request, jsonify, current_app
from flask_cors import CORS
from PIL import Image
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import pandas as pd
from fuzzywuzzy import process

# Flask application object; CORS is enabled for every route and origin.
app = Flask(__name__)
CORS(app)

# Configuration
# NOTE(review): the paths below are absolute paths on a single Windows
# machine; adjust them before running anywhere else.
JSON_FILE_PATH = r"C:\Users\Admin\Downloads\bajaj_project_github\output.json"
EXCEL_FILE_PATH = r"C:\Users\Admin\Downloads\bajaj_project_github\OUTPUTS\output_file.xlsx"
ICD_CODE_JSON_PATH = r'C:\Users\Admin\Downloads\output (1).json'
# The Gemini API key is a secret and must not live in source control, so it
# is read from the environment instead of being written here as a literal.
# NOTE(review): a literal key was previously committed on this line; treat
# that key as compromised and revoke/rotate it.
API_KEY = os.environ.get("GOOGLE_API_KEY")
ICD_CODE_PATH = r"C:\Users\Admin\Downloads\ICD.xlsx"

# Global variables to store the ICD data
# Filled in lazily by initialize_data(); None means "not loaded yet".
disease_icd_dict = None

def load_icd_codes(icd_code_path):
    """Build a disease-name -> ICD-10 code lookup from an Excel sheet.

    Column headers are matched after trimming whitespace and lower-casing,
    so the sheet must contain columns that normalize to ``diseases`` and
    ``icd10_codes``.

    Args:
        icd_code_path: Path (or file-like object) accepted by
            ``pandas.read_excel``.

    Returns:
        dict mapping each value of the ``diseases`` column to the value of
        ``icd10_codes`` on the same row. Rows where either cell is empty
        are skipped.

    Raises:
        KeyError: If either required column is missing from the sheet.
    """
    df = pd.read_excel(icd_code_path)
    # astype(str) keeps the .str accessor usable even when a header cell
    # was parsed as a number or left unnamed.
    df.columns = df.columns.astype(str).str.strip().str.lower()

    required = ('diseases', 'icd10_codes')
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(
            f"ICD sheet is missing required column(s): {', '.join(missing)}"
        )

    # Blank cells come back as NaN. A NaN key breaks fuzzy matching on the
    # dictionary keys and a NaN code is truthy, so it would be emitted as a
    # bogus ICD code downstream. Drop incomplete rows up front.
    df = df.dropna(subset=list(required))
    return dict(zip(df['diseases'], df['icd10_codes']))

def get_icd_code(disease_name, threshold=80):
    """Fuzzy-match a disease name against the loaded ICD table.

    Args:
        disease_name: Free-text disease name to look up.
        threshold: Minimum score (as reported by ``process.extractOne``)
            the best candidate must reach to be accepted.

    Returns:
        The code stored for the best-matching disease name, or ``None``
        when there is no candidate or its score is below ``threshold``.

    Raises:
        ValueError: If the module-level ``disease_icd_dict`` is still None.
    """
    if disease_icd_dict is None:
        raise ValueError("ICD codes have not been loaded. Call load_icd_codes() first.")

    best = process.extractOne(disease_name, disease_icd_dict.keys())
    if not best:
        return None

    # The code reads element 0 as the matched key and element 1 as its score.
    matched_name, score = best[0], best[1]
    if score < threshold:
        return None
    return disease_icd_dict[matched_name]

def process_local_image(image_path: str) -> bytes:
    """Load an image from disk and return it re-encoded as JPEG bytes.

    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
    before encoding. The JPEG is written with quality 95 and ``optimize``
    enabled.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The encoded JPEG as a non-empty ``bytes`` object.

    Raises:
        IOError: If an ``IOError`` occurs while opening or encoding.
        Exception: For any other failure, including the internal
            "Processed image is empty" check, which is re-raised as a
            plain ``Exception`` by the broad handler.
    """
    try:
        with Image.open(image_path) as source:
            if source.mode in ('RGB', 'L'):
                frame = source
            else:
                frame = source.convert('RGB')

            jpeg_buffer = BytesIO()
            frame.save(jpeg_buffer, format="JPEG", quality=95, optimize=True)
            payload = jpeg_buffer.getvalue()

            if not payload:
                raise ValueError("Processed image is empty")

            return payload
    except IOError as err:
        raise IOError(f"Error opening image: {str(err)}")
    except Exception as err:
        raise Exception(f"Unexpected error processing image: {str(err)}")

def clean_text(text: str) -> str:
    """Normalize shorthand and stray punctuation in extracted text.

    A fixed list of regex substitutions is applied in order, followed by
    literal replacement of arrow symbols with words. Order matters: for
    example a lone ``o`` is first rewritten to ``on``, which is what lets
    ``c/o`` become ``c/on`` and then ``Complains of``.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement, flags) -- applied strictly top to bottom.
    substitutions = (
        (r'\b-\b', '-', 0),
        (r"\b'\b|\B'\b|\b'\B", "", 0),
        (r"\\(\w+)\\", r"\1", 0),
        (r'\s\?\s', ' ', 0),
        (r'\b(?:age|a\.g|ag|age\.)\b', 'age', re.IGNORECASE),
        (r'\b(?:on|o\.n|on\.|o)\b', 'on', re.IGNORECASE),
        (r'/\s*\(per\)', '', 0),
        (r'\bc/0\b|\bclo\b|\bc/on\b|\bcLo\b|\bcho\b', 'Complains of', re.IGNORECASE),
        (r'to\s*\(telephone order\)', 'to', re.IGNORECASE),
        (r'AGE\s*\(acute gastro enteritis\)', 'age', re.IGNORECASE),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Arrow symbols are spelled out as words.
    arrow_words = (
        ("↑", "increased "),
        ("↓", "decreased "),
        ("→", "leads to "),
    )
    for arrow, phrase in arrow_words:
        text = text.replace(arrow, phrase)

    return text

def expand_abbreviations(text: str) -> str:
    """Replace known medical abbreviations with their full names.

    Each abbreviation is matched as a whole word, case-insensitively, and
    every occurrence is replaced. Substitutions run one after another in
    the order listed below.

    Args:
        text: Text that may contain abbreviations such as ``HTN`` or ``dm``.

    Returns:
        The text with every listed abbreviation expanded.
    """
    expansions = (
        (r'\bHTN\b', 'Hypertension'),
        (r'\bDM\b', 'Diabetes Mellitus'),
        (r'\bCHF\b', 'Congestive Heart Failure'),
        (r'\bCOPD\b', 'Chronic Obstructive Pulmonary Disease'),
        (r'\bMI\b', 'Myocardial Infarction'),
        (r'\bRA\b', 'Rheumatoid Arthritis'),
        (r'\bCKD\b', 'Chronic Kidney Disease'),
        (r'\bDVT\b', 'Deep Vein Thrombosis'),
        (r'\bGERD\b', 'Gastroesophageal Reflux Disease'),
        (r'\bMS\b', 'Multiple Sclerosis'),
        (r'\bIVDD\b', 'Intervertebral Disc Disease'),
        (r'\bCAD\b', 'Coronary Artery Disease'),
        (r'\bUTI\b', 'Urinary Tract Infection'),
        (r'\bCVA\b', 'Cerebrovascular Accident'),
        (r'\bAFib\b', 'Atrial Fibrillation'),
        (r'\bOA\b', 'Osteoarthritis'),
        (r'\bIBS\b', 'Irritable Bowel Syndrome'),
        (r'\bGI\b', 'Gastrointestinal'),
        (r'\bPVD\b', 'Peripheral Vascular Disease'),
        (r'\bTIA\b', 'Transient Ischemic Attack'),
    )

    expanded = text
    for pattern, long_form in expansions:
        matcher = re.compile(pattern, re.IGNORECASE)
        expanded = matcher.sub(long_form, expanded)
    return expanded

def detailed_image_analysis(image_path: str) -> Dict[str, Any]:
    """Ask the Gemini model for a structured description of a document image.

    The image is re-encoded through ``process_local_image``, base64-encoded
    and sent with a fixed prompt to the ``gemini-1.5-flash`` model. The
    outermost ``{...}`` span of the reply is passed through ``clean_text``
    and parsed as JSON.

    Args:
        image_path: Path of the document image on local disk.

    Returns:
        The parsed JSON reply, with objects decoded as ``OrderedDict``.

    Raises:
        Exception: Any failure (empty reply, no JSON found, parse error,
            image or API error) is logged and re-raised as a plain
            ``Exception`` prefixed with "Error in detailed image analysis".
    """
    try:
        genai.configure(api_key=API_KEY)

        sampling_options = dict(
            temperature=0.1,
            top_p=1,
            top_k=32,
            max_output_tokens=2048,
        )

        # Every listed category uses the same blocking threshold.
        blocked_categories = (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
        )
        safety_rules = [
            {"category": category, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE}
            for category in blocked_categories
        ]

        vision_model = genai.GenerativeModel(
            model_name="gemini-1.5-flash",
            generation_config=sampling_options,
            safety_settings=safety_rules,
        )

        jpeg_bytes = process_local_image(image_path)
        encoded_image = base64.b64encode(jpeg_bytes).decode('utf-8')
        image_part = {"mime_type": "image/jpeg", "data": encoded_image}

        detailed_prompt = """
        Analyze this document image in detail. Extract and categorize information based on the following criteria:

        1. Text Analysis:
           - Identify all text in the image.
           - Detect any underlined text and specify which words/phrases are underlined.
           - Identify segmented text (text split into separate boxes or sections).
           - Detect if any text is cut off or partially visible.
           - Identify any crossed-out or strikethrough text.

        2. Symbol Detection:
           - Detect the presence of checkboxes and whether they are checked or unchecked.
           - Identify any stars (★) or asterisks (*) near text and specify their location (before or after the text).

        3. Shape Analysis:
           - Identify any shapes present in the document (circles, squares, triangles, etc.).
           - Describe the relationship between shapes and nearby text.

        4. Medication Information (if applicable):
           - Extract all mentions of medications (e.g., med1, med2, med3, med4).
           - For each medication, provide details and any associated shapes or symbols.

        5. Form Structure:
           - Identify form fields, labels, and their corresponding values.
           - Detect any tables and describe their content.

        6. Special Cases:
           - Note any handwritten text and its location.
           - Identify any logos, stamps, or signatures.
           - Detect any QR codes or barcodes.

        7. Image Quality Assessment:
           - Evaluate the overall image quality (e.g., clear, blurry, skewed).
           - Note any issues like poor contrast, shadows, or reflections.

        Provide the analysis in the following JSON format:

        {
            "text_analysis": {
                "all_text": [],
                "underlined_text": [],
                "segmented_text": [],
                "cut_off_text": [],
                "crossed_out_text": []
            },
            "symbol_detection": {
                "checkboxes": [
                    {"text": "", "status": "checked/unchecked"}
                ],
                "stars_asterisks": [
                    {"text": "", "symbol": "★/*", "position": "before/after"}
                ]
            },
            "shape_analysis": [
                {"shape": "", "description": "", "associated_text": ""}
            ],
            "medication_info": {
                "med1": {"mentions": [], "details": "", "associated_elements": ""},
                "med2": {"mentions": [], "details": "", "associated_elements": ""},
                "med3": {"mentions": [], "details": "", "associated_elements": ""},
                "med4": {"mentions": [], "details": "", "associated_elements": ""}
            },
            "form_structure": {
                "fields": [
                    {"label": "", "value": ""}
                ],
                "tables": [
                    {"description": "", "content": ""}
                ]
            },
            "special_cases": {
                "handwritten_text": [],
                "logos_stamps_signatures": [],
                "qr_barcodes": []
            },
            "image_quality": {
                "overall_quality": "",
                "issues": []
            },
            "confidence_score": 0
        }

        Analyze the image thoroughly and fill in all relevant sections of the JSON structure. If a section is not applicable, leave it as an empty list or string as appropriate.
        """

        response = vision_model.generate_content([detailed_prompt, image_part])
        if not response or not response.text:
            raise ValueError("Empty response from model")

        raw_reply = response.text.strip()
        current_app.logger.debug(f"Raw response from model: {raw_reply}")

        # Greedy match with DOTALL: first "{" through last "}" of the reply,
        # which skips any wrapper text around the JSON object.
        brace_span = re.search(r'(\{.*\})', raw_reply, re.DOTALL)
        if brace_span is None:
            raise ValueError("No valid JSON found in the response")

        cleaned_json = clean_text(brace_span.group(1))
        current_app.logger.debug(f"Cleaned JSON: {cleaned_json}")

        return json.loads(cleaned_json, object_pairs_hook=OrderedDict)

    except Exception as err:
        current_app.logger.error(f"Error in detailed image analysis: {str(err)}")
        raise Exception(f"Error in detailed image analysis: {str(err)}")

def _dedupe_pipe_separated(value: str) -> str:
    """Return ``value`` with duplicate ``||``-separated entries removed.

    Entries are compared after trimming surrounding whitespace, empty
    entries are dropped, first-seen order is kept and the result is joined
    with a bare ``||``.
    """
    trimmed = (part.strip() for part in value.split('||'))
    return '||'.join(OrderedDict.fromkeys(part for part in trimmed if part))


def analyze_document_from_path(image_path: str) -> Dict[str, Any]:
    """Extract provisional diagnosis, remarks and ICD-10 codes from an image.

    Steps:
      1. Run ``detailed_image_analysis`` and sort its text lines / form
         fields into "diagnosis" context (label or text mentions
         diagnosis, assessment or impression) and "remarks" context.
      2. Send that context plus the image to ``gemini-1.5-flash`` and parse
         the JSON object in the reply. Only this step is retried (up to 3
         attempts with 1 s / 2 s back-off).
      3. De-duplicate ``||``-separated values, expand abbreviations in
         ``Provisional_Diagnosis`` and look up an ICD-10 code for each
         diagnosis via ``get_icd_code``.

    Args:
        image_path: Path of the document image on local disk.

    Returns:
        The parsed model reply (an ``OrderedDict``). When it contains
        ``Provisional_Diagnosis``, a ``Provisional_Diagnosis_ICD10_Code``
        entry is added holding the matched codes joined with ``||`` or the
        text "No matching ICD code found".

    Raises:
        Exception: Any failure is logged and re-raised as a plain
            ``Exception`` prefixed with "Error in analyzing document",
            chained to the original error.
    """
    try:
        genai.configure(api_key=API_KEY)

        generation_config = {
            "temperature": 0.1,
            "top_p": 1,
            "top_k": 32,
            "max_output_tokens": 2048,
        }

        safety_settings = [
            {"category": HarmCategory.HARM_CATEGORY_HATE_SPEECH, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
            {"category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
            {"category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
            {"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
        ]

        model = genai.GenerativeModel(
            model_name="gemini-1.5-flash",
            generation_config=generation_config,
            safety_settings=safety_settings
        )

        # NOTE(review): detailed_image_analysis() re-encodes the same image
        # internally, so the file is processed twice per request.
        image_data = process_local_image(image_path)
        image_part = {"mime_type": "image/jpeg", "data": base64.b64encode(image_data).decode('utf-8')}

        detailed_analysis = detailed_image_analysis(image_path)

        # Process the detailed analysis to extract provisional diagnosis and remarks
        provisional_diagnosis = []
        remarks = []
        diagnosis_pattern = re.compile(r'\b(diagnosis|assessment|impression)\b', re.IGNORECASE)

        # The analysis is model output: a section may be missing or null,
        # hence the `or {}` / `or []` fallbacks and the str() coercion.
        text_section = detailed_analysis.get('text_analysis') or {}
        for text in text_section.get('all_text') or []:
            text = str(text)
            if diagnosis_pattern.search(text):
                provisional_diagnosis.append(text)
            else:
                remarks.append(text)

        # Check form_structure for potential diagnoses
        form_section = detailed_analysis.get('form_structure') or {}
        for field in form_section.get('fields') or []:
            if not isinstance(field, dict):
                continue
            # .get() instead of indexing: a field without "label" or
            # "value" must not abort the whole request with a KeyError.
            label = field.get('label') or ''
            value = field.get('value')
            value = '' if value is None else value
            entry = f"{label}: {value}"
            if diagnosis_pattern.search(str(label)):
                provisional_diagnosis.append(entry)
            else:
                remarks.append(entry)

        # Join provisional diagnosis and remarks
        provisional_diagnosis_text = ' || '.join(provisional_diagnosis)
        remarks_text = ' '.join(remarks)

        prompt_template = f"""
        Analyze the image and extract all visible text and relevant medical information. Use the following detailed analysis as additional context:

        Provisional Diagnosis: {provisional_diagnosis_text}
        Remarks: {remarks_text}

        Based on this analysis:

        1. For 'Provisional Diagnosis' that may contain multiple unique values, separate them with "||" only if the values are unique. Avoid duplicating values.
        2. If any information cannot be mapped to the predefined JSON format, summarize it concisely in one to two lines in the 'remark' section.
        3. Differentiate between disease names and medicine names:
           - Place disease names under the 'Provisional_Diagnosis' field.
           - Include medicine names in the 'remark' section.
        4. For abbreviations in the 'Provisional_Diagnosis':
           - Treat abbreviations as case-insensitive (e.g., "htn" and "HTN" are equivalent).
           - Identify and expand abbreviations using the surrounding context.
           - If the abbreviation is common and its meaning is clear, replace it with its full form.
           - If multiple interpretations exist, choose the most appropriate one based on the medical context.
           - Preserve the original case of surrounding text while replacing abbreviations.
           - If the abbreviation's meaning is unclear, leave it unchanged and note it in the 'remark' section.
           - Multiple abbreviations can appear in a single diagnosis; map their full forms in place of abbreviations.
        
        Return the results in this exact JSON format and order:

        {{
            "Provisional_Diagnosis": "",
            "remark": ""
        }}
        """

        prompt_parts = [prompt_template, image_part]

        # Only the model call and the parsing of its reply are retried.
        # Local post-processing is deterministic, so retrying it would just
        # repeat the API call and fail the same way.
        max_retries = 3
        parsed_json = None
        for attempt in range(max_retries):
            try:
                response = model.generate_content(prompt_parts)
                if not response or not response.text:
                    raise ValueError("Empty response from model")

                response_text = response.text.strip()
                current_app.logger.debug(f"Raw response from model: {response_text}")

                json_match = re.search(r'(\{.*\})', response_text, re.DOTALL)
                if not json_match:
                    raise ValueError("No valid JSON found in the response")

                json_content = json_match.group(1)
                cleaned_json = clean_text(json_content)
                current_app.logger.debug(f"Cleaned JSON: {cleaned_json}")

                parsed_json = json.loads(cleaned_json, object_pairs_hook=OrderedDict)
                break

            except Exception as e:
                current_app.logger.error(f"Error in analyzing document: {str(e)}")
                if attempt == max_retries - 1:
                    raise Exception(f"Failed after {max_retries} attempts: {str(e)}") from e
                time.sleep(2 ** attempt)

        # Process multiple values and keep only unique ones. Parts are
        # compared after stripping so "A || A" collapses to "A".
        for key, value in parsed_json.items():
            if isinstance(value, str) and '||' in value:
                parsed_json[key] = _dedupe_pipe_separated(value)

        if 'Provisional_Diagnosis' in parsed_json:
            raw_diagnosis = parsed_json['Provisional_Diagnosis']
            # The model occasionally may answer with null or a list
            # instead of a string; normalize before text processing.
            if raw_diagnosis is None:
                raw_diagnosis = ''
            elif isinstance(raw_diagnosis, (list, tuple)):
                raw_diagnosis = '||'.join(str(item) for item in raw_diagnosis)
            elif not isinstance(raw_diagnosis, str):
                raw_diagnosis = str(raw_diagnosis)

            # Expand first, then de-duplicate, so "HTN||Hypertension"
            # ends up as a single entry.
            diagnosis_text = _dedupe_pipe_separated(expand_abbreviations(raw_diagnosis))
            parsed_json['Provisional_Diagnosis'] = diagnosis_text

            # Add ICD code lookup
            icd_codes = []
            for diagnosis in diagnosis_text.split('||'):
                diagnosis = diagnosis.strip()
                if not diagnosis:
                    continue
                icd_code = get_icd_code(diagnosis)
                if icd_code:
                    # str(): codes read from a spreadsheet may be numeric.
                    icd_codes.append(str(icd_code))

            if icd_codes:
                parsed_json['Provisional_Diagnosis_ICD10_Code'] = "||".join(OrderedDict.fromkeys(icd_codes))
            else:
                parsed_json['Provisional_Diagnosis_ICD10_Code'] = "No matching ICD code found"

        return parsed_json

    except Exception as e:
        current_app.logger.error(f"Error in analyzing document: {str(e)}")
        raise Exception(f"Error in analyzing document: {str(e)}") from e

# Output persistence and Flask request handling
def update_excel_with_new_data(new_data: Dict[str, Any]) -> bool:
    """Append one record to the Excel workbook at ``EXCEL_FILE_PATH``.

    The existing workbook (if the file exists) is read in full, the new
    record is added as the last row and the whole sheet is written back
    without the index column.

    Args:
        new_data: Mapping of column name to value for the new row.

    Returns:
        ``True`` when the workbook was written, ``False`` when any step
        raised; the error is logged, not propagated.
    """
    try:
        workbook_exists = os.path.exists(EXCEL_FILE_PATH)
        history = pd.read_excel(EXCEL_FILE_PATH) if workbook_exists else pd.DataFrame()

        appended_row = pd.DataFrame([new_data])
        combined = pd.concat([history, appended_row], ignore_index=True)
        combined.to_excel(EXCEL_FILE_PATH, index=False)
    except Exception as err:
        current_app.logger.error(f"Error updating Excel: {str(err)}")
        return False
    else:
        current_app.logger.info("Excel file updated successfully.")
        return True

@app.before_request
def initialize_data():
    """Load the ICD lookup table into ``disease_icd_dict`` on first use.

    Registered with ``before_request`` plus an explicit "already loaded"
    guard rather than ``before_first_request``, because that decorator was
    removed in Flask 2.3 and would make this module fail at import time.
    If loading raises, the table stays ``None`` and the next request tries
    again.
    """
    global disease_icd_dict
    if disease_icd_dict is None:
        disease_icd_dict = load_icd_codes(ICD_CODE_PATH)

@app.route('/process_image', methods=['POST'])
def process_image():
    """Handle ``POST /process_image``: analyze the image at a local path.

    Expects a JSON object body with an ``image_path`` string naming an
    existing file on the server. The extracted data is written to
    ``JSON_FILE_PATH``, appended to the Excel workbook (best effort) and
    returned as the JSON response.

    Returns:
        * 200 with the extracted data on success.
        * 400 with ``{"error": ...}`` when the body is not a JSON object,
          lacks ``image_path``, or the path is not an existing file.
        * 500 with ``{"error": ...}`` for any other failure.
    """
    try:
        current_app.logger.debug("Request Headers: %s", request.headers)
        current_app.logger.debug("Content-Type: %s", request.content_type)

        # silent=True: a malformed or non-JSON body yields None instead of
        # raising, so it is reported as a 400 below rather than being
        # caught by the broad handler and returned as a 500.
        data = request.get_json(silent=True)
        current_app.logger.debug("Received JSON data: %s", data)

        # isinstance check: a JSON list/string body would otherwise pass
        # the membership test and then fail on data['image_path'].
        if not isinstance(data, dict) or 'image_path' not in data:
            current_app.logger.error("Error: No image path provided in the request.")
            return jsonify({'error': 'No image path provided'}), 400

        image_path = data['image_path']
        # isfile rather than exists: a directory path is not a valid image.
        if not isinstance(image_path, str) or not os.path.isfile(image_path):
            current_app.logger.error("Error: Invalid image path or file does not exist.")
            return jsonify({'error': 'Invalid image path or file does not exist'}), 400

        current_app.logger.info(f"Processing image from path: {image_path}")
        extracted_data = analyze_document_from_path(image_path)
        current_app.logger.debug(extracted_data)

        with open(JSON_FILE_PATH, 'w', encoding='utf-8') as json_file:
            json.dump(extracted_data, json_file, indent=4)

        # The Excel append is best effort: a failure is logged but does
        # not fail the request, since the extraction itself succeeded.
        excel_update_success = update_excel_with_new_data(extracted_data)
        if not excel_update_success:
            current_app.logger.warning("Extracted data was not appended to the Excel file.")

        return jsonify(extracted_data)

    except Exception as e:
        current_app.logger.error(f"Error during image processing: {str(e)}")
        return jsonify({'error': str(e)}), 500

if __name__ == "__main__":
    # Start Flask's built-in server on port 5000 with debug mode on and the
    # auto-reloader off.
    app.run(port=5000, debug=True, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
[2024-10-13 05:54:53,536] DEBUG in 2746024367: Request Headers: Content-Type: application/json
User-Agent: PostmanRuntime/7.42.0
Accept: */*
Postman-Token: d63926a2-4eca-4a48-ac05-ae3fe5c24877
Host: 127.0.0.1:5000
Accept-Encoding: gzip, deflate, br
Connection: keep-alive
Content-Length: 111


[2024-10-13 05:54:53,536] DEBUG in 2746024367: Content-Type: application/json
[2024-10-13 05:54:53,536] DEBUG in 2746024367: Received JSON data: {'image_path': 'C:\\Users\\Admin\\Downloads\\PS2_Level_2 Dataset\\PS2 Level 2 Dataset\\sample_6.png'}
[2024-10-13 05:54:53,536] INFO in 2746024367: Processing image from path: C:\Users\Admin\Downloads\PS2_Level_2 Dataset\PS2 Level 2 Dataset\sample_6.png
[2024-10-13 05:55:01,139] DEBUG in 2746024367: Raw response from model: ```json
{
  "text_analysis": {
    "all_text": [
      "C.",
      "Nature of illness / Disease with presenting complaint:",
      "Clo Dimulubu of urubu in Right Eye x2 months",