In [18]:
import os
os.environ['PATH'] = r'C:\Program Files\poppler\Library\bin;' + os.environ['PATH']
import tkinter as tk
from tkinter import filedialog
import pytesseract
from PIL import Image
import pdf2image
import re
from typing import Dict, Any, Optional, List
import cv2
import numpy as np

class DocumentProcessor:
    """Extract claim fields from image, PDF and plain-text documents.

    Images and PDF pages are preprocessed with OpenCV and read with Tesseract
    OCR; text files are read directly. The resulting text is scanned with the
    regular expressions in ``PATTERNS`` and the normalized values are stored
    per file path in ``self.processed_files``.

    Args:
        debug: When True, ``_extract_from_image`` also writes the preprocessed
            image to ``debug_processed.png`` in the current working directory
            and prints the raw OCR text. Off by default so that document
            contents are not echoed or written to disk unintentionally.
    """

    # Field name -> regex whose capture group holds the raw value. Patterns are
    # applied with re.IGNORECASE (see extract_claim_data).
    #   * ``[:|-]?`` accepts an optional colon, pipe or hyphen after the label.
    #   * ``(?<![A-Za-z])`` keeps a label from matching inside a longer word,
    #     e.g. "Page 1" must not be read as age=1.
    #   * ``(?![A-Za-z])`` on 'sex' keeps "m"/"f" from matching the first
    #     letter of an unrelated word.
    PATTERNS = {
        'age': r"(?<![A-Za-z])age\s*[:|-]?\s*(\d+)",
        'sex': r"(?<![A-Za-z])sex\s*[:|-]?\s*(female|male|m|f)(?![A-Za-z])",
        'bmi': r"(?<![A-Za-z])bmi\s*[:|-]?\s*(\d+\.?\d*)",
        'children': r"(?<![A-Za-z])children\s*[:|-]?\s*(\d+)",
        'region': r"(?<![A-Za-z])region\s*[:|-]?\s*([a-zA-Z]+)",
        'charges': r"(?<![A-Za-z])charges\s*[:|-]?\s*(\d[\d,]*)"
    }

    # File category -> lower-case extensions (without the dot) it covers.
    SUPPORTED_EXTENSIONS = {
        'image': ['png', 'jpeg', 'jpg'],
        'pdf': ['pdf'],
        'text': ['txt']
    }

    # Tesseract options shared by the image and PDF extractors.
    OCR_CONFIG = r'--oem 3 --psm 6'

    # (width, height) of the hidden Tk root that parents the file dialog.
    DIALOG_SIZE = (200, 100)

    def __init__(self, debug: bool = False):
        # Results keyed by file path; accumulates across calls on this instance.
        self.processed_files = {}
        self.debug = debug

    def get_file_paths(self) -> List[str]:
        """Show a file dialog and return the selected file paths.

        Returns:
            The chosen paths as a list; empty when the dialog is cancelled.
        """
        root = tk.Tk()
        try:
            root.attributes('-topmost', True)

            # Center the (hidden) root using its real size so the dialog that
            # is parented to it opens mid-screen.
            width, height = self.DIALOG_SIZE
            x = (root.winfo_screenwidth() - width) // 2
            y = (root.winfo_screenheight() - height) // 2
            root.geometry(f'{width}x{height}+{x}+{y}')

            # Hide the main window but keep it active
            root.withdraw()

            filetypes = [
                ("All Supported Files",
                 ("*.png", "*.jpg", "*.jpeg", "*.pdf", "*.txt")),
                ("Image Files", ("*.png", "*.jpg", "*.jpeg")),
                ("PDF Files", "*.pdf"),
                ("Text Files", "*.txt")
            ]

            file_paths = filedialog.askopenfilenames(
                parent=root,
                title="Select Files to Process",
                filetypes=filetypes,
                initialdir=os.getcwd()
            )

            return list(file_paths)

        finally:
            # Always tear the Tk root down, even if setup or the dialog fails.
            root.destroy()

    def run(self) -> Dict[str, Any]:
        """Prompt for files and process them.

        Returns:
            ``{}`` when nothing was selected, the per-file results from
            ``process_uploaded_files`` otherwise, or ``{"error": message}``
            when an unexpected exception escapes selection/processing.
        """
        try:
            file_paths = self.get_file_paths()

            if not file_paths:
                print("No files selected.")
                return {}

            print(f"Selected {len(file_paths)} files.")
            return self.process_uploaded_files(file_paths)

        except Exception as e:
            print(f"Error during processing: {e}")
            return {"error": str(e)}

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """Apply a fixed chain of OpenCV filters intended to help OCR.

        Steps, in order: grayscale conversion (for 3-dimensional input), Otsu
        binarization, non-local-means denoising, CLAHE contrast enhancement,
        a 3x3 sharpening convolution and a dilation.

        Args:
            image: Image array; a 3-dimensional array is converted with
                ``cv2.COLOR_BGR2GRAY``, anything else is used as-is.

        Returns:
            The processed single-channel image.
        """
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        denoised = cv2.fastNlMeansDenoising(thresh)

        # NOTE(review): contrast enhancement runs after binarization here;
        # confirm this ordering is intentional before tuning OCR quality.
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        enhanced = clahe.apply(denoised)

        kernel = np.array([[-1,-1,-1],
                         [-1, 9,-1],
                         [-1,-1,-1]])
        sharpened = cv2.filter2D(enhanced, -1, kernel)

        # NOTE(review): a 1x1 structuring element should leave the image
        # unchanged; kept as-is so OCR input stays identical.
        kernel = np.ones((1,1), np.uint8)
        dilated = cv2.dilate(sharpened, kernel, iterations=1)

        return dilated

    def _ocr(self, processed_image: np.ndarray) -> str:
        """Run Tesseract on an already-preprocessed image array."""
        pil_image = Image.fromarray(processed_image)
        return pytesseract.image_to_string(pil_image, config=self.OCR_CONFIG)

    def _extract_from_image(self, path: str) -> Optional[str]:
        """Extract text from an image file using OCR with preprocessing.

        Returns:
            The OCR text, or None if loading or OCR fails (the error is
            printed).
        """
        try:
            image = cv2.imread(path)
            if image is None:
                raise ValueError("Failed to load image")

            processed_image = self.preprocess_image(image)
            if self.debug:
                cv2.imwrite('debug_processed.png', processed_image)
                print("Saved preprocessed image as 'debug_processed.png'")

            text = self._ocr(processed_image)
            if self.debug:
                print("Extracted Text:", text)

            return text
        except Exception as e:
            print(f"Image extraction error: {e}")
            return None

    def _extract_from_pdf(self, path: str) -> Optional[str]:
        """Extract text from a PDF by rasterizing each page and running OCR.

        Returns:
            The page texts joined with single spaces, or None on failure (the
            error is printed).
        """
        try:
            pages = pdf2image.convert_from_path(path)
            text = []
            for page in pages:
                # preprocess_image expects OpenCV's BGR channel order.
                opencv_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
                text.append(self._ocr(self.preprocess_image(opencv_image)))
            return ' '.join(text)
        except Exception as e:
            print(f"PDF extraction error: {e}")
            return None

    def _extract_from_text(self, path: str) -> Optional[str]:
        """Read a text file and return its contents.

        The file is decoded as UTF-8 (a leading BOM is dropped) and
        undecodable bytes are replaced instead of failing the whole file, so
        the result does not depend on the platform's default encoding.

        Returns:
            The file contents, or None if the file cannot be read (the error
            is printed).
        """
        try:
            with open(path, 'r', encoding='utf-8-sig', errors='replace') as file:
                return file.read()
        except Exception as e:
            print(f"Text file extraction error: {e}")
            return None

    def extract_text(self, file_path: str) -> Optional[str]:
        """Extract text from a supported file, dispatching on its extension.

        Args:
            file_path: Path to a file whose extension is listed in
                ``SUPPORTED_EXTENSIONS`` (case-insensitive).

        Returns:
            The extracted text, or None when the type-specific extractor fails.

        Raises:
            ValueError: If the extension is missing or not supported.
        """
        # splitext only looks at the final path component, so dots in parent
        # directories and extension-less names such as "txt" are not mistaken
        # for an extension.
        extension = os.path.splitext(file_path)[1].lstrip('.').lower()

        extractors = {
            'image': self._extract_from_image,
            'pdf': self._extract_from_pdf,
            'text': self._extract_from_text
        }

        for file_type, extensions in self.SUPPORTED_EXTENSIONS.items():
            if extension in extensions:
                return extractors[file_type](file_path)

        raise ValueError(f"Unsupported file type: {extension or '<none>'}")

    def process_uploaded_files(self, uploaded_files: list) -> Dict[str, Any]:
        """Process each file and record its claim data or an error entry.

        Args:
            uploaded_files: Paths of the files to process.

        Returns:
            ``self.processed_files``: file path -> claim-data dict, or
            ``{"error": message}`` when extraction failed. The mapping is the
            instance's own dict, so it also contains results from earlier
            calls on the same instance.
        """
        for file_path in uploaded_files:
            try:
                text = self.extract_text(file_path)
                if text:
                    self.processed_files[file_path] = self.extract_claim_data(text)
                else:
                    self.processed_files[file_path] = {"error": "Text extraction failed"}
            except Exception as e:
                self.processed_files[file_path] = {"error": str(e)}

        return self.processed_files

    @staticmethod
    def _normalize_field(field: str, raw: str) -> Any:
        """Convert a raw regex capture into the value stored for ``field``."""
        if field == 'sex':
            return 'Male' if raw.lower() in ('m', 'male') else 'Female'
        if field in ('age', 'children'):
            return int(raw)
        if field == 'bmi':
            return float(raw)
        if field == 'charges':
            # Thousands separators are dropped before conversion.
            return int(raw.replace(',', ''))
        if field == 'region':
            return raw.strip().lower().capitalize()
        return raw

    def extract_claim_data(self, text: str) -> Dict[str, Any]:
        """Extract and normalize claim fields from text.

        Args:
            text: Text to search with each pattern in ``PATTERNS``
                (case-insensitively; the first match per field wins).

        Returns:
            A dict with one entry per key of ``PATTERNS``. Fields that are not
            found map to None; otherwise 'age', 'children' and 'charges' are
            ints, 'bmi' is a float, 'sex' is 'Male' or 'Female' and 'region'
            is capitalized.
        """
        claim_data: Dict[str, Any] = {}

        for field, pattern in self.PATTERNS.items():
            match = re.search(pattern, text, re.IGNORECASE)
            raw = None
            if match:
                # Take the first group that participated in the match.
                raw = next((g for g in match.groups() if g is not None), None)

            # Every field always gets a key, so callers see a stable schema.
            claim_data[field] = self._normalize_field(field, raw) if raw else None

        return claim_data

if __name__ == "__main__":
    # Driver: let the user pick files, then print one result block per file.
    # `processor` and `results` stay module-level so later cells can use them.
    print("Starting document processor...")
    processor = DocumentProcessor()
    results = processor.run()

    print("\nProcessing Results:")
    for file_path in results:
        result = results[file_path]
        # Show only the file name, not the full path, in the report header.
        print(f"\nFile: {os.path.basename(file_path)}")
        print(f"Results: {result}")

Starting document processor...
Selected 1 files.

Processing Results:

File: Untitled document.pdf
Results: {'age': 30, 'sex': 'Female', 'bmi': 22.0, 'children': 5, 'region': 'Southeast', 'charges': 1000000}


In [21]:
import requests

def send_to_backend(extracted_data, url="http://127.0.0.1:8000/process-claim/", timeout=30):
    """POST each successfully extracted claim to the backend and print the outcome.

    Args:
        extracted_data: Mapping of file path -> claim-data dict. Entries whose
            value contains an "error" key, or whose value is not a dict at all
            (e.g. the ``{"error": "<message>"}`` mapping returned when a whole
            run fails), are reported and skipped instead of being sent.
        url: Endpoint that receives each claim as a JSON body. Update with
            your backend's URL.
        timeout: Seconds to wait for the backend before giving up on a
            request, so an unresponsive server cannot hang the notebook.

    Returns:
        None. Results are reported via ``print``.
    """
    if not extracted_data:
        return

    for file, data in extracted_data.items():
        # A non-dict value cannot be a claim; testing `"error" in data` on a
        # string would be a substring check and indexing it would raise.
        if not isinstance(data, dict):
            print(f"Error processing {file}: {data}")
            continue

        if "error" in data:
            print(f"Error processing {file}: {data['error']}")
            continue

        try:
            response = requests.post(url, json=data, timeout=timeout)
        except Exception as e:
            # Best effort: one failed upload must not stop the remaining files.
            print(f"Error sending data for {file}: {e}")
            continue

        if response.status_code == 200:
            try:
                payload = response.json()
            except ValueError:
                # The request succeeded but the body is not JSON; show it raw.
                payload = response.text
            print(f"Claim for {file} processed successfully: {payload}")
        else:
            print(f"Failed to process {file}: {response.status_code}, {response.text}")

# Send processed claims: `results` is the mapping returned by processor.run()
# in the cell above. send_to_backend reports and skips entries that carry an
# "error" key instead of posting them.
send_to_backend(results)

Claim for C:/Users/ASUS/Downloads/Untitled document.pdf processed successfully: {'message': 'Claim processed successfully', 'claim_id': 1, 'fraudulent': False}
