In [1]:
import os
import tkinter as tk
from tkinter import filedialog
import pytesseract
from PIL import Image
import pdf2image
import re
from typing import Dict, Any, Optional
import cv2
import numpy as np

class DocumentProcessor:
    """Extract insurance-claim fields from image, PDF and text files.

    Images and PDF pages are preprocessed with OpenCV and read with
    Tesseract OCR; text files are read directly. The resulting text is
    scanned with the regular expressions in ``PATTERNS`` and the matches
    are normalized into a dictionary (see ``extract_claim_data``).

    Results accumulate in ``self.processed_files``, keyed by file path.
    """

    # One regex per claim field. Every alternative has exactly one capture
    # group holding the value; all patterns are applied case-insensitively.
    PATTERNS = {
        'age': r"[Aa]ge\s*[:|-]?\s*(\d+)|age\s*(\d+)",
        'sex': r"[Ss]ex\s*[:|-]?\s*(female|male|m|f)|sex\s*(female|male|m|f)",
        'bmi': r"[Bb]mi\s*[:|-]?\s*(\d+\.?\d*)|bmi\s*(\d+\.?\d*)|[Bb]inied\s*(\d+\.?\d*)",
        'children': r"[Cc]hildren\s*[:|-]?\s*(\d+)|children\s*(\d+)",
        'smoker': r"[Ss]moker\s*[:|-]?\s*(yes|no|y|n)|smoker\s*(yes|no|y|n)",
        'region': r"[Rr]egion\s*[:|-]?\s*([a-zA-Z]+)|region\s*([a-zA-Z]+)",
        'charges': r"[Cc]harges\s*[:|-]?\s*(\d[\d,]*)|charges\s*(\d[\d,]*)"
    }

    # File extensions (lower-case, without the dot) grouped by extractor.
    SUPPORTED_EXTENSIONS = {
        'image': ['png', 'jpeg', 'jpg'],
        'pdf': ['pdf'],
        'text': ['txt']
    }

    # Tesseract options shared by the image and PDF extractors.
    OCR_CONFIG = r'--oem 3 --psm 6'  # Assume uniform text with consistent spacing

    # Default Windows install location of the Tesseract binary.
    DEFAULT_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Adjust the path as needed

    # Only override pytesseract's command when that binary really exists;
    # otherwise keep pytesseract's default so a `tesseract` on PATH still works.
    if os.path.isfile(DEFAULT_TESSERACT_CMD):
        pytesseract.pytesseract.tesseract_cmd = DEFAULT_TESSERACT_CMD

    def __init__(self):
        """Create a processor with an empty result cache."""
        # Maps file path -> extracted claim data, or {"error": message}.
        self.processed_files = {}

    def upload_and_process(self) -> Dict[str, Any]:
        """Open a file dialog, then process every selected file.

        Returns:
            ``self.processed_files`` after the selected files were processed.
            Cancelling the dialog selects nothing and leaves it unchanged.
        """
        # A Tk root is required for the dialog; keep it hidden.
        root = tk.Tk()
        root.withdraw()
        try:
            # Patterns are whitespace-separated: that is the list form Tk
            # expects, whereas ';' separators only work on Windows.
            file_paths = filedialog.askopenfilenames(
                title="Select Files",
                filetypes=[
                    ("Image Files", "*.png *.jpg *.jpeg"),
                    ("PDF Files", "*.pdf"),
                    ("Text Files", "*.txt"),
                ],
            )
        finally:
            # Release the hidden root so repeated calls do not leak windows.
            root.destroy()

        return self.process_uploaded_files(file_paths)

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """Apply preprocessing steps intended to improve OCR accuracy.

        Args:
            image: Either a 3-dimensional array, which is converted with
                ``cv2.COLOR_BGR2GRAY``, or an array that is used as
                grayscale as-is.

        Returns:
            The image after thresholding, denoising, contrast enhancement,
            sharpening and dilation.
        """
        # Convert to grayscale if not already
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Otsu thresholding to handle shadows and normalize brightness
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

        # Remove noise
        denoised = cv2.fastNlMeansDenoising(thresh)

        # Enhance contrast
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)

        # Sharpen the image
        sharpen_kernel = np.array([[-1, -1, -1],
                                   [-1, 9, -1],
                                   [-1, -1, -1]])
        sharpened = cv2.filter2D(enhanced, -1, sharpen_kernel)

        # Dilate text to make it more prominent.
        # NOTE(review): a 1x1 kernel should leave the pixels unchanged;
        # enlarge it if strokes really need thickening.
        dilate_kernel = np.ones((1, 1), np.uint8)
        dilated = cv2.dilate(sharpened, dilate_kernel, iterations=1)

        return dilated

    def _extract_from_image(self, path: str) -> Optional[str]:
        """Run OCR on an image file.

        Also writes the preprocessed image to ``debug_processed.png`` in the
        current working directory and prints the extracted text.

        Returns:
            The recognized text, or ``None`` if loading or OCR failed (the
            error is printed, not raised).
        """
        try:
            # cv2.imread signals failure by returning None, not by raising.
            image = cv2.imread(path)
            if image is None:
                raise ValueError("Failed to load image")

            processed_image = self.preprocess_image(image)

            # Save debug image to check preprocessing results
            cv2.imwrite('debug_processed.png', processed_image)
            print("Saved preprocessed image as 'debug_processed.png'")

            # pytesseract is given a PIL image
            pil_image = Image.fromarray(processed_image)
            text = pytesseract.image_to_string(pil_image, config=self.OCR_CONFIG)

            # Print extracted text for debugging
            print("Extracted Text:", text)

            return text
        except Exception as e:
            # Best effort: report and let the caller record the failure.
            print(f"Image extraction error: {e}")
            return None

    def _extract_from_pdf(self, path: str) -> Optional[str]:
        """Run OCR on every page of a PDF.

        Returns:
            The per-page texts joined with single spaces, or ``None`` if
            conversion or OCR failed (the error is printed, not raised).
        """
        try:
            pages = pdf2image.convert_from_path(path)
            page_texts = []
            for page in pages:
                # PIL image (RGB) -> OpenCV array (BGR) for preprocessing
                opencv_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
                processed_image = self.preprocess_image(opencv_image)
                # Back to PIL for tesseract
                pil_image = Image.fromarray(processed_image)
                page_texts.append(
                    pytesseract.image_to_string(pil_image, config=self.OCR_CONFIG)
                )
            return ' '.join(page_texts)
        except Exception as e:
            # Best effort: report and let the caller record the failure.
            print(f"PDF extraction error: {e}")
            return None

    def _extract_from_text(self, path: str) -> Optional[str]:
        """Read a plain-text file.

        The file is decoded as UTF-8 regardless of the platform default, and
        undecodable bytes are replaced instead of aborting the whole read.

        Returns:
            The file contents, or ``None`` if the file could not be read (the
            error is printed, not raised).
        """
        try:
            with open(path, 'r', encoding='utf-8', errors='replace') as file:
                return file.read()
        except Exception as e:
            print(f"Text file extraction error: {e}")
            return None

    def extract_text(self, file_path: str) -> Optional[str]:
        """Extract text from a file, choosing the extractor by extension.

        Args:
            file_path: Path of the file to read.

        Returns:
            The extracted text, or ``None`` if the extractor failed.

        Raises:
            ValueError: If the extension is missing or not listed in
                ``SUPPORTED_EXTENSIONS``.
        """
        # Look at the file name only, so a dot in a directory name
        # (e.g. "/data/v1.2/claim") is not mistaken for an extension.
        _, dot, suffix = os.path.basename(file_path).rpartition('.')
        extension = suffix.lower() if dot else ''

        extractors = {
            'image': self._extract_from_image,
            'pdf': self._extract_from_pdf,
            'text': self._extract_from_text
        }

        for file_type, extensions in self.SUPPORTED_EXTENSIONS.items():
            if extension in extensions:
                return extractors[file_type](file_path)

        raise ValueError(f"Unsupported file type: {extension or '<no extension>'}")

    def process_uploaded_files(self, uploaded_files: list) -> Dict[str, Any]:
        """Process each file and store its claim data.

        Args:
            uploaded_files: Iterable of file paths. An empty value (as
                produced by a cancelled dialog) processes nothing.

        Returns:
            ``self.processed_files``: file path -> claim data dict, or
            ``{"error": message}`` when extraction failed. Entries from
            earlier calls are kept.
        """
        for file_path in uploaded_files:
            try:
                text = self.extract_text(file_path)
                if text:
                    self.processed_files[file_path] = self.extract_claim_data(text)
                else:
                    self.processed_files[file_path] = {"error": "Text extraction failed"}

            except Exception as e:
                # One bad file must not stop the rest of the batch.
                self.processed_files[file_path] = {"error": str(e)}

        return self.processed_files

    @staticmethod
    def _normalize_field(field: str, value: str) -> Any:
        """Convert a raw regex capture into the stored value for *field*."""
        if field == 'sex':
            # Handle single letter answers
            return 'Male' if value.lower() in ('m', 'male') else 'Female'
        if field in ('age', 'children'):
            return int(value)
        if field == 'bmi':
            return float(value)
        if field == 'charges':
            # Drop thousands separators before converting
            return int(value.replace(',', ''))
        if field == 'smoker':
            # Handle single letter answers
            return 'Yes' if value.lower() in ('y', 'yes') else 'No'
        if field == 'region':
            return value.strip().lower().capitalize()
        return value

    def extract_claim_data(self, text: str) -> Dict[str, Any]:
        """Extract and normalize claim fields from free text.

        Args:
            text: Raw text from OCR or from a text file.

        Returns:
            A dict with one key per entry in ``PATTERNS``. Fields that were
            not found are ``None``. ``age``, ``children`` and ``charges`` are
            ints, ``bmi`` is a float, ``sex`` is ``'Male'``/``'Female'``,
            ``smoker`` is ``'Yes'``/``'No'`` and ``region`` is capitalized.
        """
        # Seed every field so later lookups can never raise KeyError.
        claim_data: Dict[str, Any] = dict.fromkeys(self.PATTERNS)

        for field, pattern in self.PATTERNS.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            # Get the first non-None group (patterns have one group per alternative)
            value = next((g for g in match.groups() if g is not None), None)
            if value:
                claim_data[field] = self._normalize_field(field, value)

        # Looser fallbacks for common OCR damage, matched on lower-cased text.
        lowered = text.lower()

        # Age: accept any characters on the same line between label and number.
        if claim_data.get('age') is None:
            age_match = re.search(r'age.*?(\d+)', lowered)
            if age_match:
                claim_data['age'] = int(age_match.group(1))

        # BMI: accept "bmi" and the OCR misreads "bini" ('m' read as 'in')
        # and "bii", and keep the decimal part of the number.
        if claim_data.get('bmi') is None:
            bmi_match = re.search(r'b(?:m|in|i)i.*?(\d+\.?\d*)', lowered)
            if bmi_match:
                claim_data['bmi'] = float(bmi_match.group(1))

        return claim_data


ModuleNotFoundError: No module named 'pytesseract'

In [None]:
import tkinter as tk
from tkinter import filedialog

# Show a single-file picker. A Tk root is required for the dialog, so one is
# created, kept hidden, and destroyed again once the dialog has closed.
root = tk.Tk()
root.withdraw()  # Hide main window
try:
    file_path = filedialog.askopenfilename(title="Select a File")
finally:
    # Always release the hidden root so re-running the cell does not leak windows.
    root.destroy()
print(f"Selected File: {file_path}")