In [2]:
# Enhanced Indian Railways Tender Document Analysis System
# Complete Google Colab Implementation with Advanced Features
# Addresses all gaps identified in the missing functions analysis

import os
import json
import re
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import logging
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing as mp
from functools import partial
import time

warnings.filterwarnings('ignore')

# Enhanced package installation with additional libraries
!pip install -q pytesseract
!pip install -q spacy
!pip install -q pdf2image
!pip install -q Pillow
!pip install -q opencv-python
!pip install -q tabula-py
!pip install -q pdfplumber
!pip install -q PyPDF2
!pip install -q langdetect
!pip install -q tqdm
!python -m spacy download en_core_web_sm
# Note: Installation of hi_core_news_sm might fail depending on the Spacy version.
# If it fails, Hindi language processing might be affected.
!python -m spacy download hi_core_news_sm

# Enhanced imports with new libraries
import pytesseract
import spacy
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import tabula
import pdfplumber
import PyPDF2
from langdetect import detect, LangDetectException # Corrected import
from google.colab import drive, files
from IPython.display import display, HTML
import ipywidgets as widgets
from tqdm.auto import tqdm
import concurrent.futures

# Configure root logging once for the notebook: INFO level, with timestamp and
# level name prefixed to every message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the functions and classes defined below.
logger = logging.getLogger(__name__)

# NOTE(review): this banner is printed unconditionally; it does not check that
# the installs and imports above actually succeeded.
print("‚úÖ All packages installed successfully with enhanced features!")

# =============================================================================
# STEP 1: ENHANCED GOOGLE DRIVE SETUP
# =============================================================================

def mount_drive():
    """Mount Google Drive and ensure the tender-analysis folders exist.

    Returns:
        bool: True when the mount and every ``os.makedirs`` call succeeded,
        False when any step raised (the error is logged, not re-raised).
    """
    base_dir = '/content/drive/MyDrive/tender_analysis'
    subfolders = (
        'input',
        'output',
        'processed_images',
        'logs',
        'validation_reports',
        'backup',
    )

    try:
        drive.mount('/content/drive')
        logger.info("Google Drive mounted successfully!")

        # Build the full working-directory list, then create all of it before
        # reporting anything to the user.
        directories = [f"{base_dir}/{name}" for name in subfolders]
        for path in directories:
            os.makedirs(path, exist_ok=True)

        print("üìÅ Enhanced directory structure created:")
        for path in directories:
            print(f"   - {path}")
    except Exception as e:
        logger.error(f"Error mounting drive: {str(e)}")
        return False

    return True

# Mount Drive and create the working folders as soon as this cell runs.
# NOTE(review): the boolean returned by mount_drive() is not stored or checked
# here, so a failed mount is only visible through the logged error.
mount_drive()

# =============================================================================
# STEP 2: ENHANCED DOCUMENT PROCESSOR WITH HYBRID EXTRACTION
# =============================================================================

class EnhancedTenderDocumentProcessor:
    """Discover tender PDFs and extract their text and tables.

    PDFs that carry usable embedded text are read directly (pdfplumber first,
    PyPDF2 as a fallback); everything else is rasterised and sent through the
    OCR pipeline.
    """

    def __init__(self, input_folder, output_folder):
        """Remember the working folders and reset the processing counters.

        Args:
            input_folder: Directory scanned for ``.pdf`` files.
            output_folder: Directory that receives page images during OCR.
        """
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.processed_images = []
        # NOTE(review): 'hybrid_extraction' is never incremented by the
        # methods of this class; confirm whether another component updates it.
        self.processing_stats = {
            'digital_pdfs': 0,
            'scanned_pdfs': 0,
            'hybrid_extraction': 0,
            'ocr_fallback': 0
        }

    def is_pdf_searchable(self, pdf_path):
        """Decide whether a PDF has enough embedded text to skip OCR.

        Only the first three pages are sampled.

        Args:
            pdf_path: Path of the PDF to inspect.

        Returns:
            tuple: ``(True, sample_text)`` when the sample has more than 100
            non-blank characters and more than 20 words, otherwise
            ``(False, "")``. Any error while reading is logged and also
            yields ``(False, "")``.
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                pages_to_check = min(3, len(pdf_reader.pages))

                # extract_text() may give None/"" for image-only pages; treat
                # that as "no text" instead of failing the whole check.
                text_content = "".join(
                    pdf_reader.pages[page_num].extract_text() or ""
                    for page_num in range(pages_to_check)
                )

            has_enough_chars = len(text_content.strip()) > 100
            has_enough_words = len(text_content.split()) > 20
            if has_enough_chars and has_enough_words:
                return True, text_content

            return False, ""

        except Exception as e:
            logger.warning(f"Error checking PDF searchability: {str(e)}")
            return False, ""

    def _extract_text_pypdf2(self, pdf_path):
        """Return the text of every page via PyPDF2 (empty pages give '')."""
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            return "\n".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )

    def extract_text_hybrid(self, pdf_path):
        """Extract text and tables, preferring embedded text over OCR.

        Args:
            pdf_path: Path of the PDF to process.

        Returns:
            dict: Keys ``text``, ``tables``, ``extraction_method`` (one of
            ``digital``, ``digital_fallback``, ``ocr``, ``failed``) and
            ``confidence`` (``high``/``medium``/``low``).
        """
        logger.info(f"Starting hybrid extraction for {os.path.basename(pdf_path)}")

        is_digital, extracted_text = self.is_pdf_searchable(pdf_path)

        if not is_digital:
            logger.info("PDF appears scanned - using OCR extraction")
            self.processing_stats['scanned_pdfs'] += 1
            self.processing_stats['ocr_fallback'] += 1
            return self.extract_with_ocr(pdf_path)

        logger.info("PDF is digital - using direct text extraction")
        self.processing_stats['digital_pdfs'] += 1

        try:
            with pdfplumber.open(pdf_path) as pdf:
                text_parts = []
                tables_data = []

                for page_num, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(f"\n--- Page {page_num} ---\n{page_text}\n")

                    for table in page.extract_tables():
                        if table:
                            tables_data.append({
                                'page': page_num,
                                'data': table
                            })

                return {
                    'text': "".join(text_parts),
                    'tables': tables_data,
                    'extraction_method': 'digital',
                    'confidence': 'high'
                }

        except Exception as e:
            logger.warning(f"pdfplumber extraction failed, trying PyPDF2: {str(e)}")

            # The sample from is_pdf_searchable() only covers the first three
            # pages, so re-read the whole document; keep the sample as a last
            # resort if that fails or yields nothing.
            try:
                fallback_text = self._extract_text_pypdf2(pdf_path)
            except Exception as fallback_error:
                logger.warning(f"PyPDF2 full extraction failed: {str(fallback_error)}")
                fallback_text = ""

            return {
                'text': fallback_text if fallback_text.strip() else extracted_text,
                'tables': [],
                'extraction_method': 'digital_fallback',
                'confidence': 'medium'
            }

    def extract_with_ocr(self, pdf_path):
        """Rasterise a PDF at 300 dpi and OCR every page.

        Each page is saved to ``output_folder`` as ``temp_page_<n>.png``,
        preprocessed into ``processed_page_<n>.png`` and then OCR'd. The
        temporary page image is always removed; the processed image is left
        on disk.

        NOTE(review): file names depend only on the page number, so pages of
        different PDFs overwrite each other's processed images.

        Args:
            pdf_path: Path of the scanned PDF.

        Returns:
            dict: Same shape as ``extract_text_hybrid``; on any error the
            method logs it and returns an empty ``failed`` result.
        """
        try:
            pages = convert_from_path(pdf_path, dpi=300)

            ocr_processor = EnhancedOCRProcessor()
            # One preprocessor is enough for the whole document.
            preprocessor = EnhancedImagePreprocessor()

            text_parts = []
            all_tables = []

            for page_number, page in enumerate(pages, start=1):
                img_path = os.path.join(self.output_folder, f"temp_page_{page_number}.png")
                processed_path = os.path.join(self.output_folder, f"processed_page_{page_number}.png")

                try:
                    page.save(img_path, 'PNG')

                    # process_image() returns img_path itself when
                    # preprocessing fails, so the raw page must stay on disk
                    # until OCR is finished.
                    processed_img = preprocessor.process_image(img_path, processed_path)

                    page_text = ocr_processor.extract_text_with_layout(processed_img)
                    page_tables = ocr_processor.extract_tables_with_tabula_ocr(processed_img, page_number)
                finally:
                    # Remove the raw page image even if OCR raised.
                    if os.path.exists(img_path):
                        os.remove(img_path)

                text_parts.append(f"\n--- Page {page_number} ---\n{page_text}\n")
                all_tables.extend(page_tables)

            return {
                'text': "".join(text_parts),
                'tables': all_tables,
                'extraction_method': 'ocr',
                'confidence': 'medium'
            }

        except Exception as e:
            logger.error(f"OCR extraction failed: {str(e)}")
            return {
                'text': "",
                'tables': [],
                'extraction_method': 'failed',
                'confidence': 'low'
            }

    def ingest_pdfs(self):
        """List the PDF files in ``input_folder`` with basic metadata.

        Returns:
            list[dict]: One entry per PDF, in sorted file-name order, with
            keys ``path``, ``name``, ``size`` (bytes), ``modified``
            (``datetime``) and ``searchable`` (always ``None`` here).
            An empty list is returned if the folder cannot be read.
        """
        pdf_files = []
        try:
            # Sort so that processing order is reproducible between runs.
            for file in sorted(os.listdir(self.input_folder)):
                if not file.lower().endswith('.pdf'):
                    continue

                file_path = os.path.join(self.input_folder, file)
                # Skip directories that merely happen to end in ".pdf".
                if not os.path.isfile(file_path):
                    continue

                stat = os.stat(file_path)
                pdf_files.append({
                    'path': file_path,
                    'name': file,
                    'size': stat.st_size,
                    'modified': datetime.fromtimestamp(stat.st_mtime),
                    'searchable': None  # Determined later, during processing
                })

            logger.info(f"Found {len(pdf_files)} PDF files to process")
            return pdf_files

        except Exception as e:
            logger.error(f"Error accessing input folder: {str(e)}")
            return []

# =============================================================================
# STEP 3: ENHANCED IMAGE PREPROCESSING WITH ADVANCED TECHNIQUES
# =============================================================================

class EnhancedImagePreprocessor:
    """Clean up a scanned page image before it is handed to OCR.

    ``process_image`` runs the full pipeline: deskew, denoise, binarise,
    morphological close/open and a final median blur. ``processing_stats``
    counts how often each stage ran.
    """

    def __init__(self):
        """Initialise the per-stage counters to zero."""
        self.processing_stats = {
            'deskewed': 0,
            'denoised': 0,
            'binarized': 0,
            'enhanced': 0
        }

    @staticmethod
    def _estimate_skew_angle(lines, max_lines=10, max_skew_degrees=45.0):
        """Estimate page skew in degrees from Hough ``(rho, theta)`` pairs.

        Args:
            lines: Output of ``cv2.HoughLines`` (array reshapeable to
                ``(N, 2)``) or ``None``.
            max_lines: Only the first ``max_lines`` entries are considered.
            max_skew_degrees: Lines further than this from horizontal are
                ignored so that vertical rules (e.g. table borders) cannot
                pull the estimate toward +/-90 degrees.

        Returns:
            float | None: Median of ``degrees(theta) - 90`` over the kept
            lines, or ``None`` when no usable line is available.
        """
        if lines is None:
            return None

        # HoughLines wraps each (rho, theta) pair in an extra axis; flatten
        # it so the theta column can be taken directly.
        polar = np.asarray(lines, dtype=float).reshape(-1, 2)[:max_lines]
        if polar.size == 0:
            return None

        angles = np.degrees(polar[:, 1]) - 90.0
        angles = angles[np.abs(angles) <= max_skew_degrees]
        if angles.size == 0:
            return None

        return float(np.median(angles))

    @staticmethod
    def advanced_deskew(image):
        """Rotate the image to cancel the skew estimated from Hough lines.

        Args:
            image: BGR or single-channel image array.

        Returns:
            The rotated image when the estimated skew exceeds 0.5 degrees,
            otherwise the input image unchanged.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image

        edges = cv2.Canny(gray, 50, 150, apertureSize=3)
        lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=100)

        median_angle = EnhancedImagePreprocessor._estimate_skew_angle(lines)

        # Skip the resampling step for negligible skew.
        if median_angle is None or abs(median_angle) <= 0.5:
            return image

        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
        return cv2.warpAffine(image, M, (w, h),
                              flags=cv2.INTER_CUBIC,
                              borderMode=cv2.BORDER_REPLICATE)

    def adaptive_noise_reduction(self, image):
        """Denoise a page, choosing the filter from the Laplacian variance.

        Args:
            image: BGR or single-channel image array.

        Returns:
            Single-channel denoised image.
        """
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        # Variance of the Laplacian is used here as the noise measure.
        noise_level = cv2.Laplacian(gray, cv2.CV_64F).var()

        if noise_level < 100:
            denoised = cv2.bilateralFilter(gray, 5, 80, 80)
        elif noise_level < 500:
            denoised = cv2.bilateralFilter(gray, 9, 75, 75)
        else:
            denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

        self.processing_stats['denoised'] += 1
        return denoised

    def enhanced_binarization(self, image):
        """Binarise with three thresholding methods and keep the best one.

        Each candidate is scored by the fraction of its connected components
        whose area lies strictly between 50 and 10000 pixels.

        Args:
            image: Single-channel image (as produced by
                ``adaptive_noise_reduction``).

        Returns:
            The highest-scoring binary image; the Gaussian adaptive result
            when every candidate scores zero.
        """
        binary_gaussian = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                                cv2.THRESH_BINARY, 11, 2)
        _, binary_otsu = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        binary_mean = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                            cv2.THRESH_BINARY, 15, 8)

        candidates = [
            ('adaptive_gaussian', binary_gaussian),
            ('otsu', binary_otsu),
            ('adaptive_mean', binary_mean),
        ]

        best_image = None
        best_score = 0

        for _method_name, binary_img in candidates:
            num_labels, _labels, stats, _centroids = cv2.connectedComponentsWithStats(binary_img)

            # Row 0 is the background component; skip it.
            areas = stats[1:, cv2.CC_STAT_AREA]
            reasonable_components = int(np.count_nonzero((areas > 50) & (areas < 10000)))

            score = reasonable_components / max(num_labels, 1)
            if score > best_score:
                best_score = score
                best_image = binary_img

        self.processing_stats['binarized'] += 1
        return best_image if best_image is not None else binary_gaussian

    def process_image(self, image_path, output_path):
        """Run the whole preprocessing pipeline on one image file.

        Args:
            image_path: Path of the page image to read.
            output_path: Path the cleaned image is written to.

        Returns:
            str: ``output_path`` on success. ``image_path`` is returned
            instead when the image cannot be read, the result cannot be
            written, or any stage raises, so callers can still OCR the
            unprocessed page.
        """
        try:
            image = cv2.imread(image_path)
            if image is None:
                logger.error(f"Could not load image: {image_path}")
                return image_path

            deskewed = self.advanced_deskew(image)
            self.processing_stats['deskewed'] += 1

            denoised = self.adaptive_noise_reduction(deskewed)
            binary = self.enhanced_binarization(denoised)

            # Close small gaps in strokes, then remove isolated specks.
            kernel = np.ones((2, 2), np.uint8)
            cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
            cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

            enhanced = cv2.medianBlur(cleaned, 3)
            self.processing_stats['enhanced'] += 1

            # imwrite reports failure through its return value; do not hand
            # back a path that was never written.
            if not cv2.imwrite(output_path, enhanced):
                logger.error(f"Could not write processed image: {output_path}")
                return image_path

            logger.debug(f"Successfully processed image: {os.path.basename(image_path)}")
            return output_path

        except Exception as e:
            logger.error(f"Error processing image {image_path}: {str(e)}")
            return image_path

# =============================================================================
# STEP 4: ENHANCED OCR WITH TABULA INTEGRATION AND MULTI-LANGUAGE SUPPORT
# =============================================================================

class EnhancedOCRProcessor:
    """Tesseract-based text and table extraction with language detection.

    Text is OCR'd in English first; if ``langdetect`` reports another
    supported language the page is OCR'd again with that language pack.
    Tables are located with OpenCV line detection and read cell-by-word
    from Tesseract's word-level output.
    """

    # Words whose (truncated) Tesseract confidence is not above this value
    # are discarded.
    MIN_WORD_CONFIDENCE = 30

    def __init__(self):
        """Set up language mappings, counters and Tesseract options."""
        self.supported_languages = {
            'en': {'spacy_model': 'en_core_web_sm', 'tesseract_lang': 'eng'},
            'hi': {'spacy_model': 'hi_core_news_sm', 'tesseract_lang': 'hin+eng'}
        }

        self.processing_stats = {
            'text_extractions': 0,
            'table_extractions': 0,
            'language_detections': 0,
            'confidence_scores': []
        }

        self.base_config = r'--oem 3 --psm 6'
        self.table_config = r'--oem 3 --psm 6 -c tessedit_create_tsv=1'

    def detect_language(self, text_sample):
        """Return the supported language code for a text sample.

        Args:
            text_sample: Text to classify; only the first 1000 characters
                are passed to ``langdetect``.

        Returns:
            str: A key of ``supported_languages``. ``'en'`` is returned for
            samples shorter than 50 non-blank characters, for unsupported
            languages and when detection fails.
        """
        try:
            if len(text_sample.strip()) < 50:
                return 'en'

            detected_lang = detect(text_sample[:1000])
            self.processing_stats['language_detections'] += 1

            if detected_lang in self.supported_languages:
                return detected_lang
            return 'en'

        except LangDetectException:
            logger.warning("Language detection failed, defaulting to English")
            return 'en'
        except Exception as e:
            logger.warning(f"An unexpected error occurred during language detection: {e}, defaulting to English")
            return 'en'

    @staticmethod
    def _word_confidence(raw_conf):
        """Coerce a Tesseract confidence (int, float or numeric string) to int.

        Returns -1 (Tesseract's own "no confidence" marker) when the value
        cannot be parsed, so the word is filtered out rather than crashing.
        """
        try:
            return int(float(raw_conf))
        except (TypeError, ValueError):
            return -1

    @classmethod
    def _iter_confident_words(cls, data):
        """Yield ``(index, word, confidence)`` for non-empty, confident words."""
        for index, (raw_text, raw_conf) in enumerate(zip(data['text'], data['conf'])):
            word = str(raw_text).strip()
            confidence = cls._word_confidence(raw_conf)
            if word and confidence > cls.MIN_WORD_CONFIDENCE:
                yield index, word, confidence

    @classmethod
    def _group_words_into_lines(cls, data):
        """Rebuild text lines from word-level OCR data.

        A new line starts whenever the ``(block_num, par_num, line_num)``
        triple changes; ``line_num`` alone restarts in every paragraph and
        would merge unrelated lines.

        Returns:
            tuple: ``(lines, confidences)`` - the reconstructed line strings
            and the confidence of every word that was kept.
        """
        lines = []
        confidences = []
        current_key = None

        for index, word, confidence in cls._iter_confident_words(data):
            key = (data['block_num'][index], data['par_num'][index], data['line_num'][index])
            if key != current_key:
                lines.append([])
                current_key = key
            lines[-1].append(word)
            confidences.append(confidence)

        return [' '.join(words) for words in lines], confidences

    @classmethod
    def _rows_from_ocr_data(cls, data):
        """Group confident words into table rows ordered left-to-right.

        Rows are keyed by the numeric ``(block, paragraph, line)`` triple so
        that they sort numerically (block 2 before block 10).

        Returns:
            list[list[str]]: One list of cell texts per non-empty row.
        """
        rows = {}
        for index, word, _confidence in cls._iter_confident_words(data):
            key = (
                int(data['block_num'][index]),
                int(data['par_num'][index]),
                int(data['line_num'][index]),
            )
            rows.setdefault(key, []).append((data['left'][index], word))

        return [
            [word for _left, word in sorted(rows[key], key=lambda cell: cell[0])]
            for key in sorted(rows)
        ]

    def extract_text_with_confidence(self, image_path, language='en'):
        """OCR an image and report the mean word confidence.

        Args:
            image_path: Path of the image to read.
            language: Key of ``supported_languages``; unknown keys fall back
                to English.

        Returns:
            tuple: ``(text, average_confidence)``. ``("", 0.0)`` when the
            image cannot be read or OCR raises.
        """
        try:
            image = cv2.imread(image_path)
            if image is None:
                return "", 0.0

            lang_config = self.supported_languages.get(language, self.supported_languages['en'])
            custom_config = f"{self.base_config} -l {lang_config['tesseract_lang']}"

            data = pytesseract.image_to_data(image, config=custom_config,
                                             output_type=pytesseract.Output.DICT)

            text_lines, confidence_scores = self._group_words_into_lines(data)

            full_text = '\n'.join(text_lines)
            avg_confidence = float(np.mean(confidence_scores)) if confidence_scores else 0.0

            self.processing_stats['text_extractions'] += 1
            self.processing_stats['confidence_scores'].append(avg_confidence)

            return full_text, avg_confidence

        except Exception as e:
            logger.error(f"OCR extraction failed for {image_path}: {str(e)}")
            return "", 0.0

    def extract_tables_with_tabula_ocr(self, image_path, page_num):
        """Find table regions on a page image and OCR each one.

        Despite the name, this method uses OpenCV region detection plus
        Tesseract; it does not call Tabula.

        Args:
            image_path: Path of the page image.
            page_num: Page number recorded in the results.

        Returns:
            list[dict]: One entry per non-empty table with keys ``page``,
            ``table_id``, ``region`` (x, y, w, h), ``data`` and
            ``extraction_method``. An empty list on any error.
        """
        extracted_tables = []

        try:
            image = cv2.imread(image_path)
            if image is None:
                logger.error(f"Could not load image for table extraction: {image_path}")
                return []

            for i, region in enumerate(self.detect_table_regions(image)):
                x, y, w, h = region
                table_roi = image[y:y+h, x:x+w]

                temp_table_path = f"/tmp/table_{page_num}_{i}.png"
                try:
                    cv2.imwrite(temp_table_path, table_roi)
                    table_data = self.extract_table_data_ocr(temp_table_path)
                finally:
                    # Remove the crop even if writing or OCR raised.
                    if os.path.exists(temp_table_path):
                        os.remove(temp_table_path)

                if table_data:
                    extracted_tables.append({
                        'page': page_num,
                        'table_id': f"table_{page_num}_{i}",
                        'region': region,
                        'data': table_data,
                        'extraction_method': 'ocr_region'
                    })

            self.processing_stats['table_extractions'] += len(extracted_tables)
            return extracted_tables

        except Exception as e:
            logger.error(f"Table extraction failed: {str(e)}")
            return []

    def detect_table_regions(self, image):
        """Locate candidate table bounding boxes from ruled lines.

        Args:
            image: BGR or single-channel image array.

        Returns:
            list[tuple]: ``(x, y, w, h)`` boxes whose contour area exceeds
            5000 pixels and whose width/height ratio is between 0.5 and 5.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image

        # Long thin kernels keep only horizontal / vertical strokes.
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))

        _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV)

        horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)
        vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel)

        table_mask = cv2.addWeighted(horizontal_lines, 0.5, vertical_lines, 0.5, 0.0)

        contours, _ = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        table_regions = []
        for contour in contours:
            if cv2.contourArea(contour) <= 5000:
                continue
            x, y, w, h = cv2.boundingRect(contour)
            aspect_ratio = w / h
            if 0.5 < aspect_ratio < 5.0:
                table_regions.append((x, y, w, h))

        return table_regions

    def extract_table_data_ocr(self, table_image_path):
        """OCR a cropped table image into rows of cell texts.

        Args:
            table_image_path: Path of the cropped table image.

        Returns:
            list[list[str]]: Rows in block/paragraph/line order with words
            sorted left-to-right; an empty list when the image cannot be
            read or OCR fails.
        """
        try:
            image = cv2.imread(table_image_path)
            if image is None:
                logger.error(f"Could not load table image: {table_image_path}")
                return []

            tsv_data = pytesseract.image_to_data(image, config=self.table_config,
                                                 output_type=pytesseract.Output.DICT)

            return self._rows_from_ocr_data(tsv_data)

        except Exception as e:
            logger.error(f"Table data extraction failed: {str(e)}")
            return []

    def extract_text_with_layout(self, image_path):
        """OCR a page, re-running with another language pack if detected.

        Args:
            image_path: Path of the page image.

        Returns:
            str: The extracted text, or ``""`` on error.
        """
        try:
            # First pass in English gives a sample for language detection.
            sample_text, _sample_confidence = self.extract_text_with_confidence(image_path, 'en')

            detected_language = self.detect_language(sample_text)
            if detected_language == 'en':
                return sample_text

            final_text, confidence = self.extract_text_with_confidence(image_path, detected_language)
            logger.info(f"Detected language: {detected_language}, confidence: {confidence:.2f}")
            return final_text

        except Exception as e:
            logger.error(f"Enhanced text extraction failed: {str(e)}")
            return ""

# =============================================================================
# STEP 5: ENHANCED DATA VALIDATION AND CORRECTION SYSTEM
# =============================================================================

class EnhancedDataValidator:
    """Advanced data validation and correction system."""

    def __init__(self):
        self.validation_rules = {
            'tender_id': {
                'pattern': r'^[A-Z]{2,4}[/\\][A-Z0-9]{2,10}[/\\]\d{4}[/\\]\d{3,4}$',
                'min_length': 8,
                'max_length': 50
            },
            'tender_value': {
                'min_value': 1.0,  # 1 lakh minimum
                'max_value': 2000.0,  # 20 crore maximum for safety
                'pattern': r'\d+\.?\d*\s*(crore|lakh|thousand)?'
            },
            'emd': {
                'min_percentage': 0.5,  # Minimum 0.5% of tender value
                'max_percentage': 10.0,  # Maximum 10% of tender value
                'pattern': r'\d+\.?\d*'
            },
            'duration': {
                'min_days': 30,  # Minimum 1 month
                'max_days': 1095,  # Maximum 3 years
                'pattern': r'\d+\s*(days?|months?|years?)'
            }
        }

        self.correction_patterns = {
            # Common OCR errors
            'O': '0',  # Letter O to number 0
            'I': '1',  # Letter I to number 1
            'S': '5',  # Letter S to number 5
            'Z': '2',  # Letter Z to number 2
            'l': '1',  # Lowercase L to number 1
        }

        self.validation_stats = {
            'total_validations': 0,
            'passed_validations': 0,
            'failed_validations': 0,
            'corrections_made': 0
        }

    def validate_extracted_data(self, extracted_data):
        """Comprehensive validation of extracted data with automatic corrections."""
        validation_results = {
            'is_valid': True,
            'issues': [],
            'corrections': [],
            'confidence_score': 0.0,
            'validated_data': {}
        }

        self.validation_stats['total_validations'] += 1

        try:
            # Validate each field
            for field, value in extracted_data.items():
                if value is None:
                    continue

                field_result = self.validate_field(field, value)
                validation_results['validated_data'][field] = field_result['corrected_value']

                if not field_result['is_valid']:
                    validation_results['is_valid'] = False
                    validation_results['issues'].extend(field_result['issues'])

                if field_result['corrections']:
                    validation_results['corrections'].extend(field_result['corrections'])
                    self.validation_stats['corrections_made'] += len(field_result['corrections'])

            # Cross-field validation
            cross_validation = self.cross_validate_fields(validation_results['validated_data'])
            validation_results['issues'].extend(cross_validation['issues'])
            validation_results['corrections'].extend(cross_validation['corrections'])

            # Calculate overall confidence score
            validation_results['confidence_score'] = self.calculate_confidence_score(
                validation_results['validated_data'],
                validation_results['issues']
            )

            # Update statistics
            if validation_results['is_valid']:
                self.validation_stats['passed_validations'] += 1
            else:
                self.validation_stats['failed_validations'] += 1

            return validation_results

        except Exception as e:
            logger.error(f"Validation failed: {str(e)}")
            validation_results['is_valid'] = False
            validation_results['issues'].append(f"Validation error: {str(e)}")
            return validation_results

    def validate_field(self, field_name, value):
        """Validate individual field with corrections."""
        result = {
            'is_valid': True,
            'issues': [],
            'corrections': [],
            'corrected_value': value
        }

        if field_name not in self.validation_rules:
            return result

        rules = self.validation_rules[field_name]
        corrected_value = str(value).strip()

        # Apply OCR corrections
        original_value = corrected_value
        for wrong_char, correct_char in self.correction_patterns.items():
            if wrong_char in corrected_value:
                corrected_value = corrected_value.replace(wrong_char, correct_char)

        if original_value != corrected_value:
            result['corrections'].append(f"OCR correction: '{original_value}' -> '{corrected_value}'")

        # Field-specific validation
        if field_name == 'tender_id':
            result = self.validate_tender_id(corrected_value, rules, result)
        elif field_name == 'tender_value':
            result = self.validate_tender_value(corrected_value, rules, result)
        elif field_name == 'emd':
            result = self.validate_emd(corrected_value, rules, result)
        elif field_name == 'duration':
            result = self.validate_duration(corrected_value, rules, result)

        result['corrected_value'] = corrected_value
        return result

    def validate_tender_id(self, value, rules, result):
        """Check a tender ID's length and rough shape.

        A length outside ``rules['min_length']``..``rules['max_length']``
        marks the result invalid. A missing "letters, separator,
        alphanumeric" shape only adds an advisory issue.

        Args:
            value: Tender ID string.
            rules: Rule dict providing ``min_length`` and ``max_length``.
            result: Result dict to update in place.

        Returns:
            dict: The same ``result`` object.
        """
        id_length = len(value)
        length_ok = rules['min_length'] <= id_length <= rules['max_length']
        if not length_ok:
            result['is_valid'] = False
            result['issues'].append(f"Tender ID length invalid: {len(value)}")

        # Deliberately loose: 1-4 letters, then /, \ or -, then one
        # alphanumeric character, anywhere in the string.
        shape_match = re.search(r'[A-Z]{1,4}[/\\-][A-Z0-9]', value, re.IGNORECASE)
        if shape_match is None:
            result['issues'].append("Tender ID format may be incorrect")

        return result

    def validate_tender_value(self, value, rules, result):
        """Parse a tender amount and check it against the allowed range.

        The first number in the text is converted to lakhs using the unit
        word found in the text (``crore`` x100, ``lakh`` x1, ``thousand``
        /100; lakhs are assumed when no unit is present). Commas between
        digits are treated as thousands separators, so ``1,50,000`` is read
        as 150000 rather than 1.

        Args:
            value: Amount text, e.g. ``"12.5 crore"``.
            rules: Rule dict providing ``min_value`` and ``max_value`` in
                lakhs.
            result: Result dict to update in place.

        Returns:
            dict: The same ``result`` object; ``is_valid`` is set to False
            when no number is found or the amount is out of range.
        """
        try:
            # Drop grouping commas so the whole figure is matched.
            normalized = re.sub(r'(?<=\d),(?=\d)', '', value)

            amount_match = re.search(r'(\d+\.?\d*)', normalized)
            if not amount_match:
                result['is_valid'] = False
                result['issues'].append("No numeric value found in tender value")
                return result

            amount = float(amount_match.group(1))

            value_lower = normalized.lower()
            if 'crore' in value_lower:
                amount_in_lakhs = amount * 100
            elif 'lakh' in value_lower:
                amount_in_lakhs = amount
            elif 'thousand' in value_lower:
                amount_in_lakhs = amount / 100
            else:
                # No recognised unit: treat the figure as lakhs.
                amount_in_lakhs = amount

            if amount_in_lakhs < rules['min_value'] or amount_in_lakhs > rules['max_value']:
                result['is_valid'] = False
                result['issues'].append(f"Tender value out of expected range: {amount_in_lakhs} lakhs")

        except ValueError:
            result['is_valid'] = False
            result['issues'].append("Could not parse tender value as number")

        return result

    def validate_emd(self, value, rules, result):
        """Check that an EMD string contains a numeric amount.

        Only the presence of a number is verified; ``rules`` is accepted for
        signature parity with the other field validators but is not used.

        Args:
            value: EMD text to inspect.
            rules: Validation rules for the field (currently unused).
            result: Validation result dict; its ``issues`` list is extended
                in place when no amount can be read.

        Returns:
            The same ``result`` dict that was passed in.
        """
        numeric_part = re.search(r'(\d+\.?\d*)', value)
        if numeric_part is None:
            result['issues'].append("Could not extract EMD amount")
            return result

        try:
            # The amount is only parsed to confirm it is numeric; further
            # range checks can be added here.
            float(numeric_part.group(1))
        except ValueError:
            result['issues'].append("EMD value is not numeric")

        return result

    def validate_duration(self, value, rules, result):
        """Validate a project duration string against the allowed day range.

        The first "<number> <unit>" pair in ``value`` is converted to days
        (week = 7, month = 30, year = 365) and compared with
        ``rules['min_days']`` and ``rules['max_days']``. Text without such a
        pair only records an advisory issue; ``is_valid`` is left untouched.

        Args:
            value: Duration text, e.g. ``"18 months"``.
            rules: Mapping providing ``min_days`` and ``max_days``.
            result: Validation result dict, updated in place.

        Returns:
            The same ``result`` dict that was passed in.
        """
        duration_match = re.search(
            r'(\d+)\s*(days?|weeks?|months?|years?)', value, re.IGNORECASE
        )
        if not duration_match:
            result['issues'].append("Could not parse duration format")
            return result

        number = int(duration_match.group(1))
        unit = duration_match.group(2).lower()

        # The regex guarantees the unit starts with one of these names, so
        # the lookup always finds a factor.
        days_per_unit = (('day', 1), ('week', 7), ('month', 30), ('year', 365))
        factor = next(f for name, f in days_per_unit if unit.startswith(name))
        days = number * factor

        if days < rules['min_days'] or days > rules['max_days']:
            result['is_valid'] = False
            result['issues'].append(f"Duration out of expected range: {days} days")

        return result

    def cross_validate_fields(self, validated_data):
        """Run consistency checks that span more than one extracted field.

        Currently only the EMD-to-tender-value ratio is checked: when both
        amounts can be parsed, an EMD below 0.5% or above 10% of the tender
        value is reported as an issue. Any exception raised during the check
        is logged and otherwise ignored.

        Args:
            validated_data: Mapping of field name to extracted value; the
                ``tender_value`` and ``emd`` entries are read.

        Returns:
            Dict with ``issues`` and ``corrections`` lists (``corrections``
            is never populated by this method).
        """
        results = {'issues': [], 'corrections': []}

        try:
            raw_tender_value = validated_data.get('tender_value')
            raw_emd = validated_data.get('emd')
            if not (raw_tender_value and raw_emd):
                return results

            tender_lakhs = self.parse_amount(raw_tender_value)
            emd_lakhs = self.parse_amount(raw_emd)
            # A zero or unparseable amount makes the ratio meaningless.
            if not (tender_lakhs and emd_lakhs):
                return results

            emd_percentage = (emd_lakhs / tender_lakhs) * 100
            if emd_percentage < 0.5 or emd_percentage > 10:
                results['issues'].append(
                    f"EMD percentage unusual: {emd_percentage:.2f}% of tender value"
                )

            # Further cross-field checks belong here.

        except Exception as e:
            logger.error(f"Cross-validation failed: {str(e)}")

        return results

    def parse_amount(self, amount_str):
        """Parse an amount string into a numeric value expressed in lakhs.

        The first number in the text is read (thousands separators such as
        ``1,50,000`` are accepted) and scaled by the unit word found in the
        text: "crore" multiplies by 100, "thousand" divides by 100, and
        "lakh" or no unit leaves the number unchanged.

        Args:
            amount_str: Amount text, or any value whose ``str()`` form
                contains the amount.

        Returns:
            The amount in lakhs as a float, or ``None`` when the input is
            empty or contains no number.
        """
        if not amount_str:
            return None

        try:
            text = str(amount_str)
            # Commas are only accepted between digit groups.
            number_match = re.search(r'\d+(?:,\d+)*(?:\.\d*)?', text)
            if not number_match:
                return None
            amount = float(number_match.group(0).replace(',', ''))
        except (TypeError, ValueError):
            # Best-effort parser: unparseable input yields None, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return None

        text_lower = text.lower()
        if 'crore' in text_lower:
            return amount * 100
        if 'lakh' in text_lower:
            return amount
        if 'thousand' in text_lower:
            return amount / 100
        return amount  # No unit given: assume lakhs

    def calculate_confidence_score(self, validated_data, issues):
        """Score an extraction from 0 to 100 based on issues and gaps.

        Starting from 100, each issue costs 10 points and each critical
        field (tender ID, tender value, EMD, duration) that is missing or
        empty costs 20 points. The result is clamped to the 0-100 range.

        Args:
            validated_data: Mapping of field name to extracted value.
            issues: Collection of issues found during validation.

        Returns:
            The confidence score as an integer between 0 and 100.
        """
        issue_penalty = 10 * len(issues)

        absent_fields = [
            name
            for name in ('tender_id', 'tender_value', 'emd', 'duration')
            if not validated_data.get(name)
        ]
        missing_penalty = 20 * len(absent_fields)

        score = 100 - issue_penalty - missing_penalty
        return max(0, min(100, score))

# =============================================================================
# STEP 6: ENHANCED MAIN PROCESSING PIPELINE WITH PARALLEL PROCESSING
# =============================================================================

class EnhancedTenderAnalysisPipeline:
    """Enhanced analysis pipeline with parallel processing and validation."""

    def __init__(self, input_folder, output_folder, max_workers=None):
        """Set up folders, worker count, processors and run statistics.

        Args:
            input_folder: Folder passed to the document processor as the
                source of PDFs.
            output_folder: Folder for results; processed images are placed
                at ``<output_folder>/../processed_images``.
            max_workers: Worker count for parallel runs. A falsy value
                selects ``min(4, mp.cpu_count())``.
        """
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.processed_images_folder = f"{output_folder}/../processed_images"

        if max_workers:
            self.max_workers = max_workers
        else:
            self.max_workers = min(4, mp.cpu_count())

        # Processing stages that are available in this file.
        self.doc_processor = EnhancedTenderDocumentProcessor(
            input_folder, self.processed_images_folder
        )
        self.img_preprocessor = EnhancedImagePreprocessor()
        self.ocr_processor = EnhancedOCRProcessor()
        # NOTE: the segmenter, data extractor, BOQ parser and feature
        # calculator stages are not defined in this file, so the attributes
        # self.segmenter, self.data_extractor, self.boq_parser and
        # self.feature_calculator are intentionally not created.
        self.data_validator = EnhancedDataValidator()

        self.results = []
        self.processing_stats = dict(
            start_time=None,
            end_time=None,
            total_documents=0,
            successful_processes=0,
            failed_processes=0,
            parallel_speedup=0,
        )

    def run_pipeline(self, use_parallel=True):
        """Run the whole pipeline over every PDF the document processor finds.

        Args:
            use_parallel: When true and more than one PDF is found, documents
                are processed through ``process_documents_parallel``;
                otherwise they are processed one after another.

        Returns:
            List of per-document result dicts, or an empty list when no PDF
            files are found. The list is also stored on ``self.results`` and
            counts/timestamps are recorded in ``self.processing_stats``.
        """
        logger.info("üöÄ Starting Enhanced Tender Analysis Pipeline...")
        started_at = time.time()
        self.processing_stats['start_time'] = started_at

        pdf_files = self.doc_processor.ingest_pdfs()
        if not pdf_files:
            logger.error("‚ùå No PDF files found in input folder")
            return []

        self.processing_stats['total_documents'] = len(pdf_files)

        # A single document gains nothing from a worker pool.
        go_parallel = use_parallel and len(pdf_files) > 1
        if go_parallel:
            handler = self.process_documents_parallel
        else:
            handler = self.process_documents_sequential
        results = handler(pdf_files)

        self.results = results
        finished_at = time.time()
        self.processing_stats['end_time'] = finished_at

        # Only an exact 'completed' status counts as a success.
        successful = sum(
            1 for entry in results if entry['processing_status'] == 'completed'
        )
        failed = len(results) - successful
        self.processing_stats['successful_processes'] = successful
        self.processing_stats['failed_processes'] = failed

        # NOTE: saving results and generating the validation report are not
        # wired in; the corresponding methods are not defined in this file.

        processing_time = finished_at - started_at
        logger.info(f"‚úÖ Pipeline completed in {processing_time:.2f} seconds!")
        logger.info(f"üìä Processed: {successful} successful, {failed} failed")

        return results

    def process_documents_parallel(self, pdf_files):
        """Process documents concurrently on a thread pool.

        Each document is handled by ``process_single_tender`` on one of
        ``self.max_workers`` threads. Results are returned in the same order
        as ``pdf_files`` (matching ``process_documents_sequential``), not in
        the order in which workers happen to finish.

        Args:
            pdf_files: Sequence of PDF descriptor dicts; each must provide
                at least a ``name`` entry.

        Returns:
            List with one result dict per input document. If a worker raises,
            the entry is a failure record carrying the same keys as a regular
            result, with ``processing_status`` set to ``'failed: <error>'``.
        """
        pdf_files = list(pdf_files)
        logger.info(f"üîÑ Processing {len(pdf_files)} documents in parallel (workers: {self.max_workers})")

        # Pre-sized so that each result lands in its input position.
        results = [None] * len(pdf_files)
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            with tqdm(total=len(pdf_files), desc="Processing PDFs") as pbar:
                # Submit all tasks, remembering each document's position.
                future_to_index = {
                    executor.submit(self.process_single_tender, pdf_info): index
                    for index, pdf_info in enumerate(pdf_files)
                }

                # Collect results as they complete.
                for future in concurrent.futures.as_completed(future_to_index):
                    index = future_to_index[future]
                    pdf_info = pdf_files[index]
                    try:
                        results[index] = future.result()
                    except Exception as e:
                        logger.error(f"Failed to process {pdf_info['name']}: {str(e)}")
                        # Same shape as a process_single_tender result so
                        # consumers can read every key without special cases.
                        results[index] = {
                            'document_name': pdf_info['name'],
                            'file_metadata': pdf_info,
                            'processing_timestamp': datetime.now().isoformat(),
                            'extracted_data': {},
                            'boq_data': {},
                            'computed_features': {},
                            'validation_results': {},
                            'processing_status': f'failed: {str(e)}',
                            'error_details': str(e),
                        }
                    finally:
                        pbar.update(1)

        return results

    def process_documents_sequential(self, pdf_files):
        """Process documents one at a time while showing a progress bar.

        Args:
            pdf_files: Sequence of PDF descriptor dicts.

        Returns:
            List of result dicts from ``process_single_tender``, in input
            order.
        """
        logger.info(f"üîÑ Processing {len(pdf_files)} documents sequentially")

        # The bar advances once per document as the iterable is consumed.
        with tqdm(pdf_files, total=len(pdf_files), desc="Processing PDFs") as progress:
            return [self.process_single_tender(pdf_info) for pdf_info in progress]

    def process_single_tender(self, pdf_info):
        """Process one tender PDF: extract its text, then validate fields.

        Segmentation, field extraction, BOQ parsing and feature computation
        are not wired in yet, so the validator receives an empty field
        mapping and the BOQ/feature entries stay empty.

        Args:
            pdf_info: PDF descriptor dict providing ``path`` and ``name``.

        Returns:
            Result dict whose ``processing_status`` is ``'completed'``,
            ``'failed_text_extraction'`` (no text extracted) or
            ``'failed: <error>'`` (an exception occurred, also recorded
            under ``error_details``).
        """
        pdf_path = pdf_info['path']
        logger.info(f"Processing: {pdf_info['name']}")

        result = dict(
            document_name=pdf_info['name'],
            file_metadata=pdf_info,
            processing_timestamp=datetime.now().isoformat(),
            extracted_data={},
            boq_data={},
            computed_features={},
            validation_results={},
            processing_status='started',
        )

        try:
            # Stage 1: hybrid text extraction.
            logger.debug(" üìÑ Extracting text using hybrid method...")
            extraction = self.doc_processor.extract_text_hybrid(pdf_path)

            if not extraction['text']:
                # Nothing to analyse without text.
                result['processing_status'] = 'failed_text_extraction'
                return result

            result['extraction_metadata'] = {
                'method': extraction['extraction_method'],
                'confidence': extraction['confidence'],
                'tables_found': len(extraction.get('tables', []))
            }

            # Stages 2-3 (segmentation, field extraction) are placeholders:
            # no segmenter or data extractor is defined, so no fields exist.
            extracted_fields = {}

            # Stage 4: validation.
            logger.debug(" ‚úÖ Validating extracted data...")
            validation = self.data_validator.validate_extracted_data(extracted_fields)
            result['validation_results'] = validation
            result['extracted_data'] = validation['validated_data']

            # Stages 5-6 (BOQ parsing, feature computation) are placeholders.
            result['boq_data'] = {}
            result['computed_features'] = {}

            result['processing_status'] = 'completed'
            logger.debug(" ‚úÖ Processing completed successfully!")

        except Exception as e:
            logger.error(f" ‚ùå Error processing {pdf_path}: {str(e)}")
            result['processing_status'] = f'failed: {str(e)}'
            result['error_details'] = str(e)

        return result

    def parse_enhanced_boq(self, sections, extracted_tables):
        """Enhanced BOQ parsing using both text and table data."""
        # First try to use extracted tables from hybrid extraction
        if extracted_tables:
            for table_info in extracted_tables:
                if self.is_boq_table(table_info['data']):
                    # return self.boq_parser.parse_boq_from_table_data(table_info['data'])
                    pass # Placeholder

        # Fallback to text-based parsing
        boq_text = sections.get('boq', '')
        if boq_text:
            # boq_items = self.boq_parser.parse_boq_from_text(boq_text)
            # return self.boq_parser.structure_boq_data(boq_items)
            pass # Placeholder

        return {'total_items': 0, 'total_estimated_value': 0, 'items': []}

    def is_boq_table(self, table_data):
        """Decide whether a table looks like a Bill of Quantities.

        A table qualifies when it has at least two rows and its first row
        contains at least three of the BOQ keywords (item, description,
        quantity, rate, amount, unit), matched case-insensitively as
        substrings.

        Args:
            table_data: Table as a sequence of rows, each a sequence of
                cells. Header cells may be ``None`` or non-string values;
                ``None`` cells are ignored and other values are converted
                with ``str()``.

        Returns:
            ``True`` if the header row looks like a BOQ header, else
            ``False``.
        """
        if not table_data or len(table_data) < 2:
            return False

        # Table extractors may emit None for empty cells and non-string
        # values for numeric ones; a plain ' '.join() would raise TypeError.
        header_cells = table_data[0] or ()
        header_row = ' '.join(
            str(cell) for cell in header_cells if cell is not None
        ).lower()

        boq_keywords = ('item', 'description', 'quantity', 'rate', 'amount', 'unit')
        matches = sum(1 for keyword in boq_keywords if keyword in header_row)
        return matches >= 3  # At least 3 BOQ keywords in header

    def compute_enhanced_features(self, extracted_data, boq_data):
        """Return the computed feature mapping for one tender.

        No feature calculator is wired in yet, so neither argument is read
        and the mapping is always empty.

        Args:
            extracted_data: Validated field values for the tender.
            boq_data: Parsed BOQ structure for the tender.

        Returns:
            An empty dict.
        """
        # Placeholder: once a feature calculator exists, merge in the results
        # of self.feature_calculator.compute_original_features(...) and
        # self.feature_calculator.compute_additional_metrics(...).
        return {}

    def save_enhanced_results(self):
        """Save results with enhanced formatting and additional reports."""

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 23.5 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

✘ No compatible package found for 'hi_core_news_sm' (spaCy v3.8.7)

✅ All packages installed successfully with enhanced features!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/dri