<a href="https://colab.research.google.com/github/Namra-3305/PaperSense-Intelligent-Document-Platform/blob/main/funsd_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (19 kB)
Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.12.0.88


In [4]:
"""
FUNSD Dataset Preprocessing
Handles form understanding and layout analysis data preparation
"""

import os
import json
import pandas as pd
import numpy as np
import torch
import cv2
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from collections import Counter, defaultdict
from tqdm import tqdm
import pickle
import requests
import zipfile

# Filesystem layout: every location used by this script hangs off PROJECT_ROOT.
PROJECT_ROOT = "/content/drive/MyDrive/PaperSense-Intelligent-Document-Platform"
_DATASETS_ROOT = os.path.join(PROJECT_ROOT, "datasets")
RAW_DATA_DIR = os.path.join(_DATASETS_ROOT, "raw", "funsd")
PROCESSED_DATA_DIR = os.path.join(_DATASETS_ROOT, "processed", "funsd")
RESULTS_DIR = os.path.join(PROJECT_ROOT, "results", "visualizations", "layout")

# Label vocabulary: the four entity types, plus their BIO tags ('O' first,
# then a B-/I- pair per type in the order the types are listed).
ENTITY_TYPES = ['HEADER', 'QUESTION', 'ANSWER', 'OTHER']
ENTITY_LABELS = ['O'] + [
    f'{prefix}-{entity_type}'
    for entity_type in ENTITY_TYPES
    for prefix in ('B', 'I')
]

def ensure_dir(path):
    """Make sure ``path`` exists as a directory (creating parents) and return it.

    Returning the path lets callers wrap a path expression inline.
    """
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path

def convert_numpy_types(obj):
    """Recursively replace numpy values with JSON-serializable Python natives.

    Handles numpy booleans, integers, floats and arrays, and walks through
    dicts (keys and values), lists and tuples. Anything else is returned
    unchanged.

    Args:
        obj: Arbitrary (possibly nested) value.

    Returns:
        A structure of the same shape where numpy scalars/arrays have been
        converted to ``bool``/``int``/``float``/``list``. Tuples stay tuples.
    """
    # np.bool_ is neither np.integer nor np.floating, and json cannot encode it.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        # Keys are converted too: json rejects numpy scalar keys such as np.int64.
        return {
            convert_numpy_types(key): convert_numpy_types(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    return obj

def download_funsd_dataset():
    """Download and extract the FUNSD archive into ``RAW_DATA_DIR``.

    If anything goes wrong (network failure, HTTP error status, corrupt
    archive, ...) the error is printed and a small demo dataset is created via
    ``create_demo_structure()`` instead, so the rest of the pipeline can run.

    Returns:
        Always ``True``: either the real dataset or the demo structure is in
        place when this function returns.
    """
    print("📥 Downloading FUNSD dataset...")

    # FUNSD dataset URL (GitHub repository)
    dataset_url = "https://guillaumejaume.github.io/FUNSD/dataset.zip"

    ensure_dir(RAW_DATA_DIR)
    zip_path = os.path.join(RAW_DATA_DIR, "funsd_dataset.zip")

    try:
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() stops an HTTP error page being saved as the "zip".
        with requests.get(dataset_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(zip_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Extract dataset
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(RAW_DATA_DIR)

        print(f"✅ Dataset downloaded and extracted to: {RAW_DATA_DIR}")
        return True
    except Exception as e:
        # Deliberate best-effort: fall back to synthetic demo data.
        print(f"❌ Error downloading dataset: {e}")
        create_demo_structure()
        return True
    finally:
        # Remove the archive on success AND on failure, so a partial or
        # corrupt download is never left behind in the raw data directory.
        if os.path.exists(zip_path):
            try:
                os.remove(zip_path)
            except OSError as cleanup_error:
                print(f"⚠ Could not remove {zip_path}: {cleanup_error}")

def create_demo_structure():
    """Write a synthetic FUNSD-shaped dataset under ``RAW_DATA_DIR/dataset``.

    Creates ``images`` and ``annotations`` folders for both splits and fills
    the annotation folders with copies of one hand-written three-entity form
    (30 files for training, 10 for testing). No image files are written.
    """
    print("🔧 Creating demo FUNSD structure...")

    dataset_root = os.path.join(RAW_DATA_DIR, "dataset")
    samples_per_split = {'training_data': 30, 'testing_data': 10}

    # Directory skeleton first, for every split.
    for split in samples_per_split:
        for subfolder in ("images", "annotations"):
            ensure_dir(os.path.join(dataset_root, split, subfolder))

    def make_word(text, box):
        return {"text": text, "box": box}

    def make_entity(entity_id, label, box, words):
        # Entity text is the space-joined word texts; key order matches the
        # layout of the written JSON (id, text, box, label, words).
        return {
            "id": entity_id,
            "text": " ".join(word["text"] for word in words),
            "box": box,
            "label": label,
            "words": words,
        }

    sample_annotation = {
        "form": [
            make_entity(0, "header", [100, 50, 300, 80], [
                make_word("SAMPLE", [100, 50, 150, 80]),
                make_word("FORM", [160, 50, 300, 80]),
            ]),
            make_entity(1, "question", [50, 120, 100, 140], [
                make_word("Name:", [50, 120, 100, 140]),
            ]),
            make_entity(2, "answer", [120, 120, 200, 140], [
                make_word("John", [120, 120, 150, 140]),
                make_word("Doe", [160, 120, 200, 140]),
            ]),
        ]
    }

    # Every file has the same content, so serialize once and reuse the text.
    serialized = json.dumps(sample_annotation, indent=2)

    for split, num_samples in samples_per_split.items():
        annotations_dir = os.path.join(dataset_root, split, "annotations")
        for i in range(num_samples):
            with open(os.path.join(annotations_dir, f"sample_{i:04d}.json"), 'w') as f:
                f.write(serialized)

def load_funsd_dataset():
    """Load the FUNSD annotation files for the train and test splits.

    Downloads the dataset when ``RAW_DATA_DIR/dataset`` is missing, and falls
    back to ``create_demo_structure()`` when a split's annotation folder is
    missing.

    Returns:
        dict: ``{'train': [...], 'test': [...]}`` where each item is the parsed
        annotation JSON with an added ``'file_name'`` key (the annotation file
        name with its ``.json`` extension replaced by ``.png``).
    """
    print("📂 Loading FUNSD dataset...")

    dataset_path = os.path.join(RAW_DATA_DIR, "dataset")

    if not os.path.exists(dataset_path):
        print("📥 Dataset not found, downloading...")
        download_funsd_dataset()

    dataset = {}

    # Load training and testing data
    for split_name, folder_name in [('train', 'training_data'), ('test', 'testing_data')]:
        annotations_dir = os.path.join(dataset_path, folder_name, "annotations")

        if not os.path.exists(annotations_dir):
            print(f"⚠ {annotations_dir} not found, creating sample data...")
            create_demo_structure()

        annotations = []
        if os.path.isdir(annotations_dir):
            # os.listdir() order is arbitrary; sorting keeps the sample order
            # (and the form ids derived from it downstream) reproducible.
            for annotation_file in sorted(os.listdir(annotations_dir)):
                stem, extension = os.path.splitext(annotation_file)
                if extension != '.json':
                    continue
                annotation_path = os.path.join(annotations_dir, annotation_file)
                with open(annotation_path, 'r', encoding='utf-8') as f:
                    annotation_data = json.load(f)
                # Swap only the trailing extension, not every '.json' in the name.
                annotation_data['file_name'] = stem + '.png'
                annotations.append(annotation_data)

        dataset[split_name] = annotations
        print(f"  {split_name}: {len(annotations)} samples")

    return dataset

def _summarize_values(values, include_range=False):
    """Return mean/std (optionally min/max) of ``values``; zeros when empty."""
    if not values:
        summary = {'mean': 0.0, 'std': 0.0}
        if include_range:
            summary.update({'min': 0, 'max': 0})
        return summary

    summary = {'mean': float(np.mean(values)), 'std': float(np.std(values))}
    if include_range:
        summary['min'] = int(np.min(values))
        summary['max'] = int(np.max(values))
    return summary


def _plot_hist_with_mean(ax, values, bins, color, title, xlabel, mean_format='.1f'):
    """Draw a histogram of ``values`` on ``ax`` with a dashed line at the mean."""
    ax.hist(values, bins=bins, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    if values:
        # Skip the mean marker for empty data: np.mean([]) is NaN.
        mean_value = np.mean(values)
        ax.axvline(mean_value, color='red', linestyle='--',
                   label=f'Mean: {mean_value:{mean_format}}')
        ax.legend()


def analyze_layout_statistics(dataset):
    """Compute entity/layout statistics over all splits and plot them.

    Saves a 3x3 grid of charts to ``RESULTS_DIR/layout_statistics.png`` and
    prints a short summary.

    Args:
        dataset: Mapping of split name to a list of annotation dicts, each
            holding its entities under the ``'form'`` key.

    Returns:
        dict: ``total_forms``, ``total_entities``, ``entity_distribution``,
        ``bounding_box_stats`` (width/height/area) and ``text_stats``. All
        numeric summaries are ``0`` when there is nothing to aggregate.
    """
    print("📊 Analyzing layout statistics...")

    ensure_dir(RESULTS_DIR)

    # Collect statistics
    entity_counts = Counter()
    entity_text_lengths = defaultdict(list)
    box_widths = []
    box_heights = []
    box_areas = []
    text_lengths = []
    words_per_entity = []
    entities_per_form = []

    all_forms = [form for split_data in dataset.values() for form in split_data]

    print(f"Analyzing {len(all_forms)} forms...")

    for form_data in tqdm(all_forms, desc="Analyzing forms"):
        form_entities = form_data.get('form', [])
        entities_per_form.append(len(form_entities))

        for entity in form_entities:
            # Entity type statistics
            entity_label = entity.get('label', 'other').upper()
            entity_counts[entity_label] += 1

            # Bounding box statistics (boxes that are not 4-valued are skipped)
            box = entity.get('box', [0, 0, 100, 20])
            if len(box) == 4:
                x1, y1, x2, y2 = box
                width = x2 - x1
                height = y2 - y1
                box_widths.append(width)
                box_heights.append(height)
                box_areas.append(width * height)

            # Text statistics, overall and per entity type (collected in the
            # same pass instead of re-walking every form later).
            text_length = len(entity.get('text', ''))
            text_lengths.append(text_length)
            entity_text_lengths[entity_label].append(text_length)

            # Word count statistics
            words_per_entity.append(len(entity.get('words', [])))

    # Create visualizations
    fig, axes = plt.subplots(3, 3, figsize=(18, 15))

    # Entity type distribution
    axes[0, 0].bar(list(entity_counts.keys()), list(entity_counts.values()), color='skyblue')
    axes[0, 0].set_title('Entity Type Distribution')
    axes[0, 0].set_xlabel('Entity Type')
    axes[0, 0].set_ylabel('Count')
    axes[0, 0].tick_params(axis='x', rotation=45)

    _plot_hist_with_mean(axes[0, 1], box_widths, 30, 'green',
                         'Bounding Box Width Distribution', 'Width (pixels)')
    _plot_hist_with_mean(axes[0, 2], box_heights, 30, 'orange',
                         'Bounding Box Height Distribution', 'Height (pixels)')
    _plot_hist_with_mean(axes[1, 0], box_areas, 30, 'purple',
                         'Bounding Box Area Distribution', 'Area (pixels²)',
                         mean_format='.0f')
    _plot_hist_with_mean(axes[1, 1], text_lengths, 30, 'brown',
                         'Text Length Distribution', 'Text Length (characters)')
    _plot_hist_with_mean(axes[1, 2], words_per_entity, 20, 'pink',
                         'Words per Entity Distribution', 'Number of Words')
    _plot_hist_with_mean(axes[2, 0], entities_per_form, 20, 'cyan',
                         'Entities per Form Distribution', 'Number of Entities')

    # Entity type vs average text length
    avg_text_lengths = {label: np.mean(lengths) for label, lengths in entity_text_lengths.items()}
    axes[2, 1].bar(list(avg_text_lengths.keys()), list(avg_text_lengths.values()),
                   color='lightcoral')
    axes[2, 1].set_title('Average Text Length by Entity Type')
    axes[2, 1].set_xlabel('Entity Type')
    axes[2, 1].set_ylabel('Average Text Length')
    axes[2, 1].tick_params(axis='x', rotation=45)

    # Split distribution
    split_counts = {name: len(data) for name, data in dataset.items()}
    if any(split_counts.values()):
        axes[2, 2].pie(list(split_counts.values()), labels=list(split_counts.keys()),
                       autopct='%1.1f%%', colors=['lightblue', 'lightgreen'])
    else:
        # A pie chart cannot be normalized when every split is empty.
        axes[2, 2].text(0.5, 0.5, 'No data', ha='center', va='center')
        axes[2, 2].axis('off')
    axes[2, 2].set_title('Train/Test Split Distribution')

    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, 'layout_statistics.png'), dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Statistics summary
    stats = {
        'total_forms': len(all_forms),
        'total_entities': sum(entity_counts.values()),
        'entity_distribution': convert_numpy_types(dict(entity_counts)),
        'bounding_box_stats': {
            'width': _summarize_values(box_widths, include_range=True),
            'height': _summarize_values(box_heights, include_range=True),
            'area': _summarize_values(box_areas)
        },
        'text_stats': {
            'avg_text_length': _summarize_values(text_lengths)['mean'],
            'avg_words_per_entity': _summarize_values(words_per_entity)['mean'],
            'avg_entities_per_form': _summarize_values(entities_per_form)['mean']
        }
    }

    print(f"📈 Layout Statistics:")
    print(f"  Total forms: {stats['total_forms']:,}")
    print(f"  Total entities: {stats['total_entities']:,}")
    print(f"  Average entities per form: {stats['text_stats']['avg_entities_per_form']:.1f}")
    print(f"  Average text length: {stats['text_stats']['avg_text_length']:.1f} characters")

    return stats

def _scale_box(box, scale_x, scale_y, fallback):
    """Scale a 4-value ``[x1, y1, x2, y2]`` box; return a copy of ``fallback`` otherwise."""
    if len(box) != 4:
        return list(fallback)
    x1, y1, x2, y2 = box
    return [int(x1 * scale_x), int(y1 * scale_y), int(x2 * scale_x), int(y2 * scale_y)]


def preprocess_layout_data(dataset, target_size=(224, 224), max_sequence_length=512):
    """Preprocess FUNSD data for layout understanding tasks.

    For every form this builds a resized placeholder image, the list of
    entities with boxes scaled to ``target_size``, and a BIO-tagged token
    sequence padded/truncated to exactly ``max_sequence_length``.

    NOTE: the real page image is not loaded. A random light-coloured 800x1000
    placeholder is generated instead, and box scale factors are derived from
    that assumed page size rather than from the actual image dimensions.

    Args:
        dataset: Mapping of split name to a list of annotation dicts (entities
            under the ``'form'`` key).
        target_size: ``(width, height)`` of the processed image.
        max_sequence_length: Fixed length of the token/label/box sequences.

    Returns:
        dict: split name -> list of sample dicts with keys ``image`` (float32
        array in [0, 1]), ``entities``, ``sequence_tokens``,
        ``sequence_labels``, ``bounding_boxes``, ``form_id``, ``file_name``,
        ``original_size`` and ``processed_size``.
    """
    print(f"🖼 Preprocessing layout data...")

    # Assumed page size for the placeholder image (typical document dimensions).
    image_width, image_height = 800, 1000
    scale_x = target_size[0] / image_width
    scale_y = target_size[1] / image_height

    processed_data = {}

    for split_name, split_data in dataset.items():
        print(f"Processing {len(split_data)} {split_name} samples...")

        processed_samples = []

        for form_idx, form_data in enumerate(tqdm(split_data, desc=f"Processing {split_name}")):
            # Create dummy image (in real implementation, load actual image)
            dummy_image = np.random.randint(200, 255, (image_height, image_width, 3), dtype=np.uint8)

            # Resize image and normalize to [0, 1]
            image_resized = Image.fromarray(dummy_image).resize(target_size, Image.Resampling.LANCZOS)
            image_array = np.array(image_resized).astype(np.float32) / 255.0

            entities = []
            sequence_tokens = []
            sequence_labels = []
            bounding_boxes = []

            for entity in form_data.get('form', []):
                entity_text = entity.get('text', '')
                entity_label = entity.get('label', 'other').upper()
                entity_box = entity.get('box', [0, 0, 100, 20])

                # Scale bounding box to new image size
                scaled_box = _scale_box(entity_box, scale_x, scale_y, fallback=[0, 0, 50, 20])

                # Word boxes are scaled with the same factors as the entity box
                # so every box in a sample shares one coordinate system. New
                # dicts are built so the input annotations are not mutated.
                raw_words = entity.get('words', [])
                if raw_words:
                    words = [
                        {**word, 'box': _scale_box(word.get('box', []), scale_x, scale_y,
                                                   fallback=scaled_box)}
                        for word in raw_words
                    ]
                elif entity_text:
                    # Create dummy words if not provided
                    words = [{'text': word, 'box': list(scaled_box)} for word in entity_text.split()]
                else:
                    words = raw_words

                entities.append({
                    'text': entity_text,
                    'label': entity_label,
                    'box': scaled_box,
                    'words': words,
                    'entity_id': len(entities)
                })

                # Once the token budget is exhausted only token emission stops;
                # the entity itself is still recorded above, so entity-level
                # statistics and visualizations stay complete.
                if len(sequence_tokens) >= max_sequence_length:
                    continue

                # Create sequence data for token classification (BIO tagging)
                words_in_entity = entity_text.split() if entity_text else []
                for word_idx, word in enumerate(words_in_entity):
                    sequence_tokens.append(word)
                    tag_prefix = 'B' if word_idx == 0 else 'I'
                    sequence_labels.append(f'{tag_prefix}-{entity_label}')
                    # Use entity bounding box for all words (simplified); a copy
                    # avoids aliasing the entity's own box list.
                    bounding_boxes.append(list(scaled_box))

            # Pad sequences up to the fixed length
            padding = max_sequence_length - len(sequence_tokens)
            if padding > 0:
                sequence_tokens.extend(['[PAD]'] * padding)
                sequence_labels.extend(['O'] * padding)
                bounding_boxes.extend([0, 0, 0, 0] for _ in range(padding))

            # Truncate if too long (the last entity may overshoot the budget)
            sequence_tokens = sequence_tokens[:max_sequence_length]
            sequence_labels = sequence_labels[:max_sequence_length]
            bounding_boxes = bounding_boxes[:max_sequence_length]

            processed_samples.append({
                'image': image_array,
                'entities': entities,
                'sequence_tokens': sequence_tokens,
                'sequence_labels': sequence_labels,
                'bounding_boxes': bounding_boxes,
                'form_id': form_idx,
                'file_name': form_data.get('file_name', f'form_{form_idx:04d}.png'),
                'original_size': (image_width, image_height),
                'processed_size': target_size
            })

        processed_data[split_name] = processed_samples
        print(f"✅ {split_name}: {len(processed_samples)} samples processed")

    return processed_data

def create_layout_visualization(processed_data, num_examples=3):
    """Plot the first training samples next to their annotated versions.

    Each row shows the processed image on the left and the same image with
    colour-coded entity boxes and short text labels on the right. The figure
    is saved to ``RESULTS_DIR/layout_visualization.png``.

    Args:
        processed_data: Mapping of split name to processed samples; only the
            ``'train'`` split is visualized.
        num_examples: Maximum number of training samples to draw.
    """
    print("🎨 Creating layout visualization...")

    train_samples = processed_data.get('train', [])[:max(num_examples, 0)]
    if not train_samples:
        # Nothing to draw; a zero-row subplot grid cannot be created.
        print("⚠ No training samples available, skipping layout visualization")
        return

    ensure_dir(RESULTS_DIR)

    colors = {
        'HEADER': 'red',
        'QUESTION': 'blue',
        'ANSWER': 'green',
        'OTHER': 'orange'
    }

    num_rows = len(train_samples)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(num_rows, 2, figsize=(12, 4 * num_rows), squeeze=False)

    for (original_ax, annotated_ax), sample in zip(axes, train_samples):
        # Original image
        original_ax.imshow(sample['image'])
        original_ax.set_title(f'Form {sample["form_id"]} - Original')
        original_ax.axis('off')

        # Boxes are drawn straight onto the right-hand axes; no temporary
        # image files or reliance on pyplot's implicit current figure.
        annotated_ax.imshow(sample['image'])
        for entity in sample['entities']:
            x1, y1, x2, y2 = entity['box']
            label = entity['label']
            color = colors.get(label, 'gray')

            annotated_ax.add_patch(patches.Rectangle(
                (x1, y1), x2 - x1, y2 - y1,
                linewidth=2, edgecolor=color, facecolor='none'
            ))

            # Only add an ellipsis when the text was actually cut off.
            text = entity['text']
            preview = text[:20] + ('...' if len(text) > 20 else '')
            annotated_ax.text(x1, y1 - 5, f'{label}: {preview}',
                              fontsize=6, color=color, fontweight='bold')

        annotated_ax.set_title(f'Form {sample["form_id"]} - With Annotations')
        annotated_ax.axis('off')

    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, 'layout_visualization.png'), dpi=300, bbox_inches='tight')
    plt.close(fig)

def create_entity_statistics_visualization(processed_data):
    """Plot entity-level statistics across all processed splits.

    Draws a 2x2 figure (entity counts per type, average text length per type,
    text length histogram, bounding box area histogram) and saves it to
    ``RESULTS_DIR/entity_statistics.png``. Nothing is written when there are
    no entities.

    Args:
        processed_data: Mapping of split name to processed samples, each with
            an ``'entities'`` list of dicts holding ``label``, ``text``, ``box``.
    """
    print("📊 Creating entity statistics visualization...")

    # Single pass over every entity of every split.
    entity_type_counts = Counter()
    entity_text_lengths = defaultdict(list)
    all_text_lengths = []
    box_areas = []
    for split_data in processed_data.values():
        for sample in split_data:
            for entity in sample['entities']:
                label = entity['label']
                text_length = len(entity['text'])
                entity_type_counts[label] += 1
                entity_text_lengths[label].append(text_length)
                all_text_lengths.append(text_length)

                box = entity['box']
                if len(box) == 4:
                    box_areas.append((box[2] - box[0]) * (box[3] - box[1]))

    if not all_text_lengths:
        # Means of empty lists are NaN; there is nothing meaningful to plot.
        print("⚠ No entities available, skipping entity statistics visualization")
        return

    ensure_dir(RESULTS_DIR)

    # Colours are looked up by label, not by bar position, so each type keeps
    # the same colour no matter which entity type is encountered first.
    label_colors = {
        'HEADER': 'red',
        'QUESTION': 'blue',
        'ANSWER': 'green',
        'OTHER': 'orange'
    }
    types = list(entity_type_counts.keys())
    counts = list(entity_type_counts.values())
    bar_colors = [label_colors.get(entity_type, 'gray') for entity_type in types]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Entity type distribution
    bars = axes[0, 0].bar(types, counts, color=bar_colors)
    axes[0, 0].set_title('Entity Type Distribution')
    axes[0, 0].set_xlabel('Entity Type')
    axes[0, 0].set_ylabel('Count')

    # Add count labels on bars
    for bar, count in zip(bars, counts):
        axes[0, 0].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                        str(count), ha='center', va='bottom')

    # Average text length by entity type
    avg_lengths = [np.mean(entity_text_lengths[entity_type]) for entity_type in types]
    axes[0, 1].bar(types, avg_lengths, color=bar_colors)
    axes[0, 1].set_title('Average Text Length by Entity Type')
    axes[0, 1].set_xlabel('Entity Type')
    axes[0, 1].set_ylabel('Average Characters')

    # Text length distribution
    mean_text_length = np.mean(all_text_lengths)
    axes[1, 0].hist(all_text_lengths, bins=30, alpha=0.7, color='purple', edgecolor='black')
    axes[1, 0].set_title('Text Length Distribution (All Entities)')
    axes[1, 0].set_xlabel('Text Length (characters)')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].axvline(mean_text_length, color='red', linestyle='--',
                       label=f'Mean: {mean_text_length:.1f}')
    axes[1, 0].legend()

    # Bounding box size distribution
    axes[1, 1].hist(box_areas, bins=30, alpha=0.7, color='brown', edgecolor='black')
    axes[1, 1].set_title('Bounding Box Area Distribution')
    axes[1, 1].set_xlabel('Area (pixels²)')
    axes[1, 1].set_ylabel('Frequency')
    if box_areas:
        mean_area = np.mean(box_areas)
        axes[1, 1].axvline(mean_area, color='red', linestyle='--',
                           label=f'Mean: {mean_area:.0f}')
        axes[1, 1].legend()

    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, 'entity_statistics.png'), dpi=300, bbox_inches='tight')
    plt.close(fig)

def save_processed_data(processed_data, layout_stats):
    """Persist processed splits, label vocabulary and metadata.

    Writes into ``PROCESSED_DATA_DIR``:
      * ``<split>_processed.pkl`` - pickled list of samples per split,
      * ``label_info.json``       - label list and id mappings,
      * ``metadata.json``         - dataset description, split sizes,
        preprocessing settings and the layout statistics.

    Args:
        processed_data: Mapping of split name to processed samples.
        layout_stats: Statistics dict to embed in the metadata (may be empty).
    """
    print("💾 Saving processed data...")

    ensure_dir(PROCESSED_DATA_DIR)

    # Save processed data splits
    for split_name, split_data in processed_data.items():
        with open(os.path.join(PROCESSED_DATA_DIR, f"{split_name}_processed.pkl"), 'wb') as f:
            pickle.dump(split_data, f)

    # Save label information
    label_info = {
        'entity_labels': ENTITY_LABELS,
        'entity_types': ENTITY_TYPES,
        'num_labels': len(ENTITY_LABELS),
        'label_to_id': {label: idx for idx, label in enumerate(ENTITY_LABELS)},
        'id_to_label': {idx: label for idx, label in enumerate(ENTITY_LABELS)}
    }

    with open(os.path.join(PROCESSED_DATA_DIR, 'label_info.json'), 'w', encoding='utf-8') as f:
        json.dump(label_info, f, indent=2)

    # Record the preprocessing settings actually used, read back from the
    # samples, instead of assuming the defaults. The defaults are kept only
    # as a fallback when there is no sample to inspect.
    target_size = [224, 224]
    max_sequence_length = 512
    first_sample = next(
        (sample for split_data in processed_data.values() for sample in split_data),
        None
    )
    if first_sample is not None:
        if 'processed_size' in first_sample:
            target_size = list(first_sample['processed_size'])
        if 'sequence_tokens' in first_sample:
            max_sequence_length = len(first_sample['sequence_tokens'])

    # Save metadata
    metadata = convert_numpy_types({
        'dataset_name': 'FUNSD',
        'task': 'form_understanding',
        'subtasks': ['layout_analysis', 'entity_recognition', 'entity_linking'],
        'num_labels': len(ENTITY_LABELS),
        'entity_types': ENTITY_TYPES,
        'splits': {name: len(data) for name, data in processed_data.items()},
        'preprocessing': {
            'target_size': target_size,
            'max_sequence_length': max_sequence_length,
            'normalization': 'min_max_0_1',
            'channels': 3
        },
        'layout_statistics': layout_stats
    })

    with open(os.path.join(PROCESSED_DATA_DIR, 'metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"✅ Data saved to {PROCESSED_DATA_DIR}")

    # Print summary
    total_samples = sum(len(data) for data in processed_data.values())
    total_entities = sum(len(sample['entities']) for split_data in processed_data.values()
                         for sample in split_data)

    print(f"\n📋 Processing Summary:")
    print(f"  Total samples: {total_samples:,}")
    print(f"  Total entities: {total_entities:,}")
    for split_name, data in processed_data.items():
        print(f"  {split_name}: {len(data):,} samples")

def main(target_size=(224, 224), max_sequence_length=512, create_visualizations=True):
    """Run the whole FUNSD pipeline: load, analyze, preprocess, plot, save.

    Args:
        target_size: Image size handed to ``preprocess_layout_data``.
        max_sequence_length: Sequence length handed to ``preprocess_layout_data``.
        create_visualizations: When true, layout statistics are computed and
            all figures are produced; otherwise both are skipped.

    Returns:
        bool: ``True`` when data was processed and saved, ``False`` when
        nothing was processed or any step raised.
    """
    print("🚀 Starting FUNSD Preprocessing...")
    print("=" * 50)

    # Output locations must exist before any step writes to them.
    for output_dir in (PROCESSED_DATA_DIR, RESULTS_DIR):
        ensure_dir(output_dir)

    try:
        dataset = load_funsd_dataset()

        # Statistics are only gathered as part of the visualization pass.
        layout_stats = analyze_layout_statistics(dataset) if create_visualizations else {}

        processed_data = preprocess_layout_data(
            dataset, target_size, max_sequence_length
        )

        if not processed_data:
            print("❌ No data was processed!")
            return False

        if create_visualizations:
            create_layout_visualization(processed_data)
            create_entity_statistics_visualization(processed_data)

        save_processed_data(processed_data, layout_stats)
        print("🎉 FUNSD preprocessing completed successfully!")
        return True

    except Exception as error:
        # Top-level boundary: report the failure with a traceback, don't raise.
        print(f"❌ Error during preprocessing: {error}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    # Script entry point: run the pipeline with its default settings.
    success = main(target_size=(224, 224), max_sequence_length=512, create_visualizations=True)

    if not success:
        print("\n❌ Preprocessing failed! Check the error messages above.")
    else:
        # One print call; the joined lines match four consecutive prints.
        print("\n".join([
            "\n✅ Preprocessing completed! You can now use the processed data for training.",
            f"📁 Raw data: {RAW_DATA_DIR}",
            f"📁 Processed data: {PROCESSED_DATA_DIR}",
            f"📁 Visualizations: {RESULTS_DIR}",
        ]))

🚀 Starting FUNSD Preprocessing...
📂 Loading FUNSD dataset...
📥 Dataset not found, downloading...
📥 Downloading FUNSD dataset...
✅ Dataset downloaded and extracted to: /content/drive/MyDrive/PaperSense-Intelligent-Document-Platform/datasets/raw/funsd
  train: 149 samples
  test: 50 samples
📊 Analyzing layout statistics...
Analyzing 199 forms...


Analyzing forms: 100%|██████████| 199/199 [00:00<00:00, 26314.40it/s]


📈 Layout Statistics:
  Total forms: 199
  Total entities: 9,743
  Average entities per form: 49.0
  Average text length: 18.1 characters
🖼 Preprocessing layout data...
Processing 149 train samples...


Processing train: 100%|██████████| 149/149 [00:02<00:00, 52.50it/s]


✅ train: 149 samples processed
Processing 50 test samples...


Processing test: 100%|██████████| 50/50 [00:00<00:00, 55.35it/s]


✅ test: 50 samples processed
🎨 Creating layout visualization...
📊 Creating entity statistics visualization...
💾 Saving processed data...
✅ Data saved to /content/drive/MyDrive/PaperSense-Intelligent-Document-Platform/datasets/processed/funsd

📋 Processing Summary:
  Total samples: 199
  Total entities: 9,743
  train: 149 samples
  test: 50 samples
🎉 FUNSD preprocessing completed successfully!

✅ Preprocessing completed! You can now use the processed data for training.
📁 Raw data: /content/drive/MyDrive/PaperSense-Intelligent-Document-Platform/datasets/raw/funsd
📁 Processed data: /content/drive/MyDrive/PaperSense-Intelligent-Document-Platform/datasets/processed/funsd
📁 Visualizations: /content/drive/MyDrive/PaperSense-Intelligent-Document-Platform/results/visualizations/layout
