# Data Exploration Notebook
## AI Document Intelligence - Week 1

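This notebook explores the synthetic invoice dataset: it prints the dataset statistics, inspects the JSON labels, plots the label distributions, and previews a few sample invoice images.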

## Setup

In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# the `src` package imported below can be resolved.
# NOTE(review): this presumes the notebook is started from a directory one
# level below the project root — confirm before running from elsewhere.
sys.path.insert(0, str(Path.cwd().parent))

import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image

from src.data_collection import DatasetOrganizer
from src.utils.file_utils import read_json, list_files

# Render figures inline in the notebook and make 12x8 the default figure size
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)

## Load Dataset

In [None]:
# The dataset lives in the `dataset` directory under the parent of the
# current working directory; hand that location to the organizer.
dataset_path = Path.cwd().parent / "dataset"
organizer = DatasetOrganizer(base_path=str(dataset_path))

# Ask the organizer for its summary and print it one entry per line
stats = organizer.get_stats()
separator = "=" * 50
print("Dataset Statistics:")
print(separator)
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value}")

## Explore Labels

In [None]:
# Collect the *.json files from the labels directory (recursive=False),
# leaving out any file whose name ends in "_metadata.json".
labels_dir = dataset_path / "labels"
label_files = [
    candidate
    for candidate in list_files(labels_dir, pattern="*.json", recursive=False)
    if not candidate.name.endswith("_metadata.json")
]

print(f"Found {len(label_files)} label files")

# Preview the first label, printing every field except `line_items`
if label_files:
    sample_label = read_json(label_files[0])
    print("\nSample Label:")
    print("=" * 50)
    for field, field_value in sample_label.items():
        if field == "line_items":
            continue
        print(f"{field}: {field_value}")

## Analyze Label Statistics

In [None]:
# Collect per-invoice statistics from a sample of the label files and
# summarize them in a DataFrame (`df`).
LABEL_SAMPLE_SIZE = 100  # how many label files, taken from the front of label_files, to summarize

currencies = []
total_amounts = []
num_line_items = []
tax_rates = []
skipped_labels = []  # (label file, reason) for every label that could not be summarized

for label_file in label_files[:LABEL_SAMPLE_SIZE]:
    try:
        label = read_json(label_file)
        # Extract every field BEFORE appending anything: if extraction fails
        # part-way through (e.g. `line_items` is null, so len() raises), none
        # of the lists is touched and they all keep the same length. Unequal
        # lengths would make the DataFrame construction below fail.
        currency = label.get('currency', 'USD')
        total_amount = label.get('total_amount', 0)
        line_item_count = len(label.get('line_items', []))
        tax_rate = label.get('tax_rate', 0)
    except Exception as exc:
        # Best-effort: a bad label is skipped, but the reason is recorded
        # instead of being silently discarded. Catching `Exception` rather
        # than using a bare `except:` lets KeyboardInterrupt / SystemExit
        # propagate. The exact exceptions read_json can raise are not visible
        # from this notebook, hence the broad class.
        skipped_labels.append((label_file, f"{type(exc).__name__}: {exc}"))
        continue

    currencies.append(currency)
    total_amounts.append(total_amount)
    num_line_items.append(line_item_count)
    tax_rates.append(tax_rate)

# Report what was skipped so the summary below is not silently incomplete
if skipped_labels:
    print(f"Skipped {len(skipped_labels)} label file(s) that could not be read:")
    for skipped_file, reason in skipped_labels:
        print(f"  {skipped_file}: {reason}")

# Create DataFrame (one row per successfully read label)
df = pd.DataFrame({
    'currency': currencies,
    'total_amount': total_amounts,
    'num_line_items': num_line_items,
    'tax_rate': tax_rates
})

print("\nDataset Summary:")
print(df.describe())

## Visualize Distributions

In [None]:
# Plot the label distributions as a 2x2 grid of charts built from `df`.
# With no rows there is nothing to draw, and bar-plotting an empty
# value_counts() result fails in pandas, so the empty case is reported
# instead of plotted.
if df.empty:
    print("No label data to plot. Generate dataset first.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Currency distribution
    df['currency'].value_counts().plot(kind='bar', ax=axes[0, 0], title='Currency Distribution')
    axes[0, 0].set_ylabel('Count')

    # Total amount distribution
    df['total_amount'].hist(bins=30, ax=axes[0, 1])
    axes[0, 1].set_title('Total Amount Distribution')
    axes[0, 1].set_xlabel('Amount')
    axes[0, 1].set_ylabel('Frequency')

    # Line items distribution (sorted by item count so the x axis is ordered)
    df['num_line_items'].value_counts().sort_index().plot(kind='bar', ax=axes[1, 0], title='Number of Line Items')
    axes[1, 0].set_ylabel('Count')

    # Tax rate distribution (sorted by rate so the x axis is ordered)
    df['tax_rate'].value_counts().sort_index().plot(kind='bar', ax=axes[1, 1], title='Tax Rate Distribution')
    axes[1, 1].set_ylabel('Count')

    plt.tight_layout()
    plt.show()

## Display Sample Invoices

In [None]:
# Display the first few invoice images from the raw directory in a 2x2 grid.
NUM_SAMPLE_IMAGES = 4  # exactly fills the 2x2 grid below

raw_dir = dataset_path / "raw"
image_files = list_files(raw_dir, pattern="*.png", recursive=False)

if len(image_files) >= NUM_SAMPLE_IMAGES:
    fig, axes = plt.subplots(2, 2, figsize=(14, 14))
    axes = axes.flatten()

    for i, img_file in enumerate(image_files[:NUM_SAMPLE_IMAGES]):
        # Open each image in a context manager so its file handle is closed
        # as soon as it has been drawn; imshow is called while the file is
        # still open so the pixel data can be read.
        with Image.open(img_file) as img:
            axes[i].imshow(img)
        axes[i].set_title(f"Invoice {i+1}: {img_file.name}")
        axes[i].axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("Not enough images to display. Generate dataset first.")

## Conclusion

This notebook explored the synthetic invoice dataset (label statistics are based on the first 100 label files):
- The dataset contains diverse invoices with various currencies, amounts, and line items
- Tax rates and pricing look realistic in the sampled labels
- The sample images appear suitable for OCR testing

Next: Run OCR baseline testing in notebook 02