# 01 - Data Exploration

Explore the CORD v2 and FUNSD datasets to understand structure, distributions, and quality.

In [None]:
import sys
sys.path.insert(0, '..')

import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datasets import load_dataset

from src.data.cord_loader import load_cord_dataset, parse_cord_ground_truth, flatten_cord_fields
from src.data.dataset_builder import get_dataset_stats

sns.set_theme(style='whitegrid')
%matplotlib inline

## CORD v2 Dataset

In [None]:
# Fetch the raw CORD v2 dataset and report how many records each split holds
cord_raw = load_dataset('naver-clova-ix/cord-v2')
print(cord_raw)

split_sizes = {split: len(cord_raw[split]) for split in ('train', 'validation', 'test')}
print(f"\nTrain: {split_sizes['train']} | Val: {split_sizes['validation']} | Test: {split_sizes['test']}")

In [None]:
# Inspect the first training record: its columns, image dimensions, and labels
sample = cord_raw['train'][0]
print('Keys:', list(sample))
print('\nImage size:', sample['image'].size)

# The ground truth is stored as a JSON string; decode it before pretty-printing
gt = json.loads(sample['ground_truth'])
print('\nGround truth structure:')
gt_pretty = json.dumps(gt, indent=2, ensure_ascii=False)
print(gt_pretty[:1000])  # cap the preview at the first 1000 characters

In [None]:
# Display the first four training receipts side by side
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 8))
for idx, panel in enumerate(axes):
    receipt = cord_raw['train'][idx]
    panel.imshow(receipt['image'])
    panel.set_title(f'Sample {idx}')
    panel.axis('off')  # hide ticks and frame; only the image matters here
fig.suptitle('CORD v2 - Sample Receipt Images', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Analyze field distributions across the training set.
# Results (consumed by the plotting cells that follow):
#   field_counts          - occurrences of each field type, with list indices removed
#   superclass_counts     - occurrences of each top-level key segment
#   num_fields_per_sample - number of flattened fields found in each sample
field_counts = Counter()
superclass_counts = Counter()
num_fields_per_sample = []

# Read only the ground-truth column: iterating whole rows would also decode
# every receipt image, which this analysis never looks at.
for gt_json in cord_raw['train']['ground_truth']:
    # Distinct names here so the `gt` inspected in the single-sample cell is not overwritten.
    parsed_gt = parse_cord_ground_truth(gt_json)
    flat_fields = flatten_cord_fields(parsed_gt)
    num_fields_per_sample.append(len(flat_fields))

    for key in flat_fields:
        # Keys are treated as dot-separated paths; split once and reuse the parts.
        # NOTE(review): assumes flatten_cord_fields yields string keys - confirm.
        parts = key.split('.')
        superclass_counts[parts[0]] += 1

        if len(parts) >= 3 and parts[1].isdigit():
            # Remove index for menu items (a.0.b -> a.b) so repeated entries pool together
            field_type = f'{parts[0]}.{parts[2]}'
        else:
            # Keep at most the first two segments; a key without dots stays unchanged
            field_type = '.'.join(parts[:2])
        field_counts[field_type] += 1

print(f'Total samples: {len(cord_raw["train"])}')
if num_fields_per_sample:
    avg_fields = sum(num_fields_per_sample) / len(num_fields_per_sample)
    print(f'Avg fields/sample: {avg_fields:.1f}')
    print(f'Min fields: {min(num_fields_per_sample)}, Max fields: {max(num_fields_per_sample)}')
else:
    # An empty split would otherwise raise ZeroDivisionError / ValueError from min()
    print('No samples to summarize.')

In [None]:
# Two-panel overview: field volume per superclass, and fields-per-receipt spread
fig, (ax_superclass, ax_hist) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: horizontal bars, superclasses ordered from most to least frequent
ranked_superclasses = superclass_counts.most_common()
sc_names, sc_vals = zip(*ranked_superclasses)
ax_superclass.barh(sc_names, sc_vals, color='steelblue')
ax_superclass.set(xlabel='Count', title='Fields by Superclass')

# Right panel: histogram of how many fields each sample carries
ax_hist.hist(num_fields_per_sample, bins=30, color='steelblue', edgecolor='white')
ax_hist.set(
    xlabel='Number of Fields',
    ylabel='Number of Samples',
    title='Fields per Sample Distribution',
)

fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the 20 most frequent field types
top_fields = field_counts.most_common(20)
field_names, field_freqs = zip(*top_fields)
bar_positions = range(len(field_names))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, field_freqs, color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(field_names)
ax.invert_yaxis()  # put the most frequent field at the top
ax.set(xlabel='Frequency', title='Top 20 Field Types in CORD v2')
fig.tight_layout()
plt.show()

In [None]:
# Use our loader and verify stats on a small subset of the training split.
# The cap is defined once so the loader call and the printed header cannot drift apart.
STATS_SAMPLE_LIMIT = 50

cord_samples = load_cord_dataset('train', max_samples=STATS_SAMPLE_LIMIT)
stats = get_dataset_stats(cord_samples)
print(f'Dataset stats (first {STATS_SAMPLE_LIMIT} samples):')
for stat_name, stat_value in stats.items():
    print(f'  {stat_name}: {stat_value}')

## Key Observations

1. **Menu fields dominate** — most receipts have multiple menu items with name, price, quantity
2. **Total/subtotal are common** — nearly every receipt has total_price, and many have tax/discount
3. **Sparse fields exist** — void_menu, emoneyprice are rare (potential challenge for extraction)
4. **Variable complexity** — field count ranges widely: some receipts are simple (3 fields), others complex (30+)
5. **Image quality varies** — some receipts are clear photos, others are noisy scans