# In [None]:
#!/usr/bin/env python3
"""
Dataset Analysis Notebook - Single Cell Version
Corruption Reporting System
Version: 1.0.0
Date: January 14, 2026

Analyzes datasets for deepfake detection and coordination attack research:
- FaceForensics++ dataset statistics
- Celeb-DF dataset analysis  
- Synthetic attack generation analysis
- Distribution visualizations
- Quality metrics
- Publication-ready figures
"""

import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
PROJECT_ROOT = Path().absolute().parent
sys.path.insert(0, str(PROJECT_ROOT))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import json
from collections import Counter, defaultdict
from typing import Any, Dict, List, Optional, Tuple
import time

# Configure plotting.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*";
# older releases only ship the un-prefixed name. Use whichever one this
# installation provides instead of failing on a version mismatch.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("  Note: seaborn darkgrid style not available; using matplotlib defaults")
sns.set_palette("husl")

print("=" * 80)
print("DATASET ANALYSIS NOTEBOOK")
print("Corruption Reporting System - Research Evaluation")
print("=" * 80)

# =============================================================================
# CONFIGURATION
# =============================================================================

# Input and output locations, all resolved relative to the project root.
_evaluation_root = PROJECT_ROOT / 'evaluation'
DATASETS_DIR = _evaluation_root / 'datasets'
RESULTS_DIR = _evaluation_root / 'results'
FIGURES_DIR = PROJECT_ROOT / 'notebooks' / 'figures'

# Output folders are created up front; the datasets folder is only read.
for _output_dir in (FIGURES_DIR, RESULTS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

print("\nüìÅ Directories:")
for _caption, _location in (("Datasets", DATASETS_DIR),
                            ("Results", RESULTS_DIR),
                            ("Figures", FIGURES_DIR)):
    print(f"  {_caption}: {_location}")

# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def scan_image_directory(directory: Path, max_files: Optional[int] = None) -> List[Dict[str, Any]]:
    """Scan a directory tree for images and collect per-file metadata.

    Args:
        directory: Root folder; it is searched recursively.
        max_files: Upper bound on the number of image records returned.
            ``None`` (or ``0``) means no limit.

    Returns:
        One dict per image that could be opened, with the keys ``path``,
        ``filename``, ``size`` (bytes on disk), ``width``, ``height``,
        ``format`` and ``mode``. An empty list is returned when
        ``directory`` does not exist.
    """
    images: List[Dict[str, Any]] = []
    extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}

    if not directory.exists():
        print(f"  ‚ö† Directory not found: {directory}")
        return images

    for img_path in directory.rglob('*'):
        # The cap counts collected images. Counting raw directory entries
        # would let sub-folders and non-image files eat into the quota.
        if max_files and len(images) >= max_files:
            break

        if img_path.suffix.lower() not in extensions:
            continue

        try:
            with Image.open(img_path) as img:
                images.append({
                    'path': str(img_path),
                    'filename': img_path.name,
                    'size': img_path.stat().st_size,
                    'width': img.width,
                    'height': img.height,
                    'format': img.format,
                    'mode': img.mode
                })
        except Exception as e:
            # Deliberately broad: one unreadable or corrupt file must not
            # abort the scan of the whole dataset. The failure is reported.
            print(f"  ‚ö† Error loading {img_path.name}: {e}")

    return images

def analyze_image_collection(images: List[Dict[str, Any]], name: str) -> Dict[str, Any]:
    """Summarise a list of image metadata records.

    Args:
        images: Records carrying at least the keys ``size``, ``width``,
            ``height``, ``format`` and ``mode``.
        name: Label stored under ``'name'`` in the result.

    Returns:
        A dict with ``name``, ``count``, ``total_size``, ``avg_size``,
        ``avg_width``, ``avg_height``, ``min_width``, ``max_width``,
        ``min_height``, ``max_height``, ``formats`` and ``modes``. The same
        keys are present for an empty collection (numeric fields are 0 and
        the two tallies are empty), so callers never need to special-case
        missing keys.
    """
    if not images:
        return {
            'name': name,
            'count': 0,
            'total_size': 0,
            'avg_size': 0,
            'avg_width': 0,
            'avg_height': 0,
            'min_width': 0,
            'max_width': 0,
            'min_height': 0,
            'max_height': 0,
            'formats': {},
            'modes': {}
        }

    sizes = [img['size'] for img in images]
    widths = [img['width'] for img in images]
    heights = [img['height'] for img in images]

    return {
        'name': name,
        'count': len(images),
        # Built-in sum/min/max keep these plain Python ints, which the JSON
        # export can serialise without a fallback converter.
        'total_size': sum(sizes),
        'avg_size': np.mean(sizes),
        'avg_width': np.mean(widths),
        'avg_height': np.mean(heights),
        'min_width': min(widths),
        'max_width': max(widths),
        'min_height': min(heights),
        'max_height': max(heights),
        'formats': Counter(img['format'] for img in images),
        'modes': Counter(img['mode'] for img in images)
    }

def generate_synthetic_dataset_stats(synthetic_dir: Optional[Path] = None) -> Dict[str, Any]:
    """Collect statistics for the synthetic coordinated-attack dataset.

    Args:
        synthetic_dir: Folder holding one ``*.json`` metadata file per attack
            scenario. Defaults to ``DATASETS_DIR / 'synthetic_attacks'``.

    Returns:
        A dict with ``name``, ``count``, ``scenarios``, ``total_submissions``,
        ``avg_submissions_per_attack`` and ``attack_types``. When the folder
        does not exist a placeholder result is returned (zero counts and the
        default average of 5.5).

    Metadata files that cannot be read, are not valid JSON, or do not hold an
    object with a string ``attack_type`` and a numeric ``num_submissions``
    are reported and left out of every figure, so they can no longer inflate
    the scenario count or drag the average down.
    """
    if synthetic_dir is None:
        synthetic_dir = DATASETS_DIR / 'synthetic_attacks'

    if not synthetic_dir.exists():
        print("  ‚Ñπ Generating synthetic dataset metadata...")
        return {
            'name': 'Synthetic Coordinated Attacks',
            'count': 0,
            'scenarios': 0,
            'total_submissions': 0,
            'avg_submissions_per_attack': 5.5,
            'attack_types': {
                'style_coordination': 0,
                'temporal_coordination': 0,
                'content_coordination': 0,
                'mixed_coordination': 0
            }
        }

    attack_types = defaultdict(int)
    total_submissions = 0
    scenario_count = 0

    # Sorted so the key order of ``attack_types`` is reproducible across runs.
    for meta_file in sorted(synthetic_dir.glob('*.json')):
        try:
            with open(meta_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"  Warning: skipping unreadable metadata file {meta_file.name}: {exc}")
            continue

        if not isinstance(data, dict):
            print(f"  Warning: skipping {meta_file.name}: top-level JSON value is not an object")
            continue

        attack_type = data.get('attack_type', 'unknown')
        num_submissions = data.get('num_submissions', 0)
        if not isinstance(attack_type, str) or not isinstance(num_submissions, (int, float)):
            print(f"  Warning: skipping {meta_file.name}: unexpected field types")
            continue

        # Update all tallies together so a rejected file leaves no partial trace.
        attack_types[attack_type] += 1
        total_submissions += num_submissions
        scenario_count += 1

    return {
        'name': 'Synthetic Coordinated Attacks',
        'count': scenario_count,
        'scenarios': scenario_count,
        'total_submissions': total_submissions,
        'avg_submissions_per_attack': total_submissions / scenario_count if scenario_count else 5.5,
        'attack_types': dict(attack_types)
    }

# =============================================================================
# DATASET SCANNING
# =============================================================================

# Section banner, emitted with a single call.
print("\n" + "=" * 80, "1. SCANNING DATASETS", "=" * 80, sep="\n")

# FaceForensics++: the real split is scanned first, then the fake split,
# each capped at 500 entries.
print("\nüìä FaceForensics++ Dataset")
faceforensics_dir = DATASETS_DIR / 'faceforensics'
ff_real, ff_fake = (
    scan_image_directory(faceforensics_dir / split_name, max_files=500)
    for split_name in ('real', 'fake')
)
print(f"  Real images: {len(ff_real)}", f"  Fake images: {len(ff_fake)}", sep="\n")

# Celeb-DF: same folder layout and the same cap.
print("\nüìä Celeb-DF Dataset")
celebdf_dir = DATASETS_DIR / 'celebdf'
celebdf_real, celebdf_fake = (
    scan_image_directory(celebdf_dir / split_name, max_files=500)
    for split_name in ('real', 'fake')
)
print(f"  Real images: {len(celebdf_real)}", f"  Fake images: {len(celebdf_fake)}", sep="\n")

# Synthetic attacks are described by metadata rather than by images.
print("\nüìä Synthetic Attack Dataset")
synthetic_stats = generate_synthetic_dataset_stats()
print(
    f"  Attack scenarios: {synthetic_stats['scenarios']}",
    f"  Avg submissions/attack: {synthetic_stats['avg_submissions_per_attack']:.1f}",
    sep="\n",
)

# =============================================================================
# STATISTICAL ANALYSIS
# =============================================================================

print("\n" + "=" * 80, "2. STATISTICAL ANALYSIS", "=" * 80, sep="\n")

# Summarise every scanned collection, keeping the fixed reporting order.
all_stats = [
    analyze_image_collection(collection, title)
    for collection, title in (
        (ff_real, 'FaceForensics++ Real'),
        (ff_fake, 'FaceForensics++ Fake'),
        (celebdf_real, 'Celeb-DF Real'),
        (celebdf_fake, 'Celeb-DF Fake'),
    )
]
ff_real_stats, ff_fake_stats, celebdf_real_stats, celebdf_fake_stats = all_stats

# Fixed-width summary table; collections without images are left out.
table_rule = "-" * 80
print("\nüìà Dataset Summary Statistics")
print(table_rule)
print(f"{'Dataset':<25} {'Count':>8} {'Avg Size':>12} {'Avg Resolution':>20}")
print(table_rule)

for entry in all_stats:
    if entry['count'] <= 0:
        continue
    size_kb = entry['avg_size'] / 1024  # bytes -> KB
    dims = f"{int(entry['avg_width'])}x{int(entry['avg_height'])}"
    print(f"{entry['name']:<25} {entry['count']:>8} {size_kb:>10.1f} KB {dims:>20}")

print(table_rule)

# =============================================================================
# DATA QUALITY ANALYSIS
# =============================================================================

print("\n" + "=" * 80)
print("3. DATA QUALITY ANALYSIS")
print("=" * 80)

def calculate_quality_metrics(images: List[Dict[str, Any]]) -> Dict[str, float]:
    """Score an image collection on three heuristics, each in the range 0-1.

    Args:
        images: Records carrying at least ``width``, ``height`` and ``size``.

    Returns:
        A dict with the keys:

        * ``resolution_score`` - mean pixel count relative to 1920x1080,
          capped at 1.0.
        * ``size_score`` - 1 minus the coefficient of variation of the file
          sizes, floored at 0 (uniform sizes score higher).
        * ``diversity_score`` - distinct (width, height) pairs divided by
          the number of images.
        * ``overall_score`` - arithmetic mean of the three scores above.

        All four keys are present for an empty collection too (each 0), so
        callers can index ``overall_score`` unconditionally.
    """
    if not images:
        return {
            'resolution_score': 0,
            'size_score': 0,
            'diversity_score': 0,
            'overall_score': 0
        }

    # Resolution score (higher is better, normalized against Full HD)
    resolutions = [img['width'] * img['height'] for img in images]
    avg_resolution = np.mean(resolutions)
    resolution_score = min(avg_resolution / (1920 * 1080), 1.0)

    # Size score (consistent file sizes are better)
    sizes = [img['size'] for img in images]
    mean_size = np.mean(sizes)
    # A zero mean would divide by zero; treat it as maximally inconsistent.
    size_cv = np.std(sizes) / mean_size if mean_size > 0 else 1.0
    size_score = max(0, 1.0 - size_cv)

    # Diversity score (variety in dimensions)
    unique_dims = len(set((img['width'], img['height']) for img in images))
    diversity_score = min(unique_dims / len(images), 1.0)

    return {
        'resolution_score': resolution_score,
        'size_score': size_score,
        'diversity_score': diversity_score,
        'overall_score': (resolution_score + size_score + diversity_score) / 3
    }

# Calculate quality for each dataset
quality_metrics = {}
for images, name in [(ff_real, 'FF Real'), (ff_fake, 'FF Fake'),
                      (celebdf_real, 'Celeb Real'), (celebdf_fake, 'Celeb Fake')]:
    metrics = calculate_quality_metrics(images)
    # Guarantee the aggregate key on every entry: a result without
    # 'overall_score' (the empty-collection case) would otherwise raise
    # KeyError in the table below and in the later figure/report sections.
    metrics.setdefault('overall_score', 0)
    quality_metrics[name] = metrics

print("\nüìä Quality Metrics (0-1 scale, higher is better)")
print("-" * 60)
print(f"{'Dataset':<15} {'Resolution':>12} {'Consistency':>12} {'Diversity':>12} {'Overall':>12}")
print("-" * 60)

for name, metrics in quality_metrics.items():
    print(f"{name:<15} {metrics['resolution_score']:>12.3f} {metrics['size_score']:>12.3f} "
          f"{metrics['diversity_score']:>12.3f} {metrics['overall_score']:>12.3f}")

print("-" * 60)

# =============================================================================
# VISUALIZATION 1: Dataset Distribution
# =============================================================================

print("\n" + "=" * 80)
print("4. GENERATING VISUALIZATIONS")
print("=" * 80)

print("\nüìä Figure 1: Dataset Distribution")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Dataset Statistics Overview', fontsize=16, fontweight='bold')

# Plot 1: Dataset Size Comparison
ax = axes[0, 0]
datasets = ['FF Real', 'FF Fake', 'Celeb Real', 'Celeb Fake']
counts = [len(ff_real), len(ff_fake), len(celebdf_real), len(celebdf_fake)]
colors = ['#2ecc71', '#e74c3c', '#3498db', '#f39c12']
# Fixed label -> colour mapping so a dataset keeps its colour in every panel,
# even when other datasets are missing from that panel.
dataset_colors = dict(zip(datasets, colors))

bars = ax.bar(datasets, counts, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Number of Images', fontsize=11, fontweight='bold')
ax.set_title('Dataset Size Comparison', fontsize=12, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Annotate each bar with its count
for bar, count in zip(bars, counts):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{count}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# Plot 2: Real vs Fake Distribution
ax = axes[0, 1]
real_total = len(ff_real) + len(celebdf_real)
fake_total = len(ff_fake) + len(celebdf_fake)
labels = ['Real Images', 'Fake Images']
sizes = [real_total, fake_total]
colors_pie = ['#2ecc71', '#e74c3c']

if real_total + fake_total > 0:
    wedges, texts, autotexts = ax.pie(sizes, labels=labels, colors=colors_pie, autopct='%1.1f%%',
                                         startangle=90, textprops={'fontweight': 'bold'})
else:
    # A pie chart normalises by the sum of its wedges, so an all-zero input
    # has no valid rendering; show a placeholder message instead.
    ax.text(0.5, 0.5, 'No images found', ha='center', va='center',
            transform=ax.transAxes, fontweight='bold')
    ax.set_xticks([])
    ax.set_yticks([])
ax.set_title('Real vs Fake Distribution', fontsize=12, fontweight='bold')

# Plot 3: Average Resolution
ax = axes[1, 0]
resolutions = []
labels_res = []

for stats, label in [(ff_real_stats, 'FF Real'), (ff_fake_stats, 'FF Fake'),
                      (celebdf_real_stats, 'Celeb Real'), (celebdf_fake_stats, 'Celeb Fake')]:
    if stats['count'] > 0:
        resolutions.append(stats['avg_width'] * stats['avg_height'] / 1e6)  # Megapixels
        labels_res.append(label)

# Look colours up by label; slicing the palette by position would shift the
# colours whenever an empty dataset is skipped above.
bar_colors = [dataset_colors[label] for label in labels_res]
bars = ax.barh(labels_res, resolutions, color=bar_colors, alpha=0.7, edgecolor='black')
ax.set_xlabel('Resolution (Megapixels)', fontsize=11, fontweight='bold')
ax.set_title('Average Image Resolution', fontsize=12, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

for bar, res in zip(bars, resolutions):
    width = bar.get_width()
    ax.text(width, bar.get_y() + bar.get_height()/2.,
            f'{res:.2f}MP', ha='left', va='center', fontsize=9, fontweight='bold')

# Plot 4: Quality Scores
ax = axes[1, 1]
quality_names = list(quality_metrics.keys())
# .get(): an entry may lack an aggregate score (the markdown report reads
# these values the same way).
overall_scores = [quality_metrics[name].get('overall_score', 0) for name in quality_names]

bars = ax.barh(quality_names, overall_scores, color=colors[:len(quality_names)], alpha=0.7, edgecolor='black')
ax.set_xlabel('Quality Score', fontsize=11, fontweight='bold')
ax.set_title('Dataset Quality Assessment', fontsize=12, fontweight='bold')
ax.set_xlim(0, 1.0)
ax.grid(axis='x', alpha=0.3)

for bar, score in zip(bars, overall_scores):
    width = bar.get_width()
    ax.text(width, bar.get_y() + bar.get_height()/2.,
            f'{score:.3f}', ha='left', va='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'dataset_distribution.png', dpi=150, bbox_inches='tight')
print(f"   Saved: dataset_distribution.png")
plt.show()

# =============================================================================
# VISUALIZATION 2: Resolution Analysis
# =============================================================================

print("\nüìä Figure 2: Resolution Analysis")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Image Resolution Analysis', fontsize=16, fontweight='bold')

# Collect resolution data
all_widths = []
all_heights = []
all_labels = []

for images, label in [(ff_real, 'FF Real'), (ff_fake, 'FF Fake'),
                       (celebdf_real, 'Celeb Real'), (celebdf_fake, 'Celeb Fake')]:
    if images:
        widths = [img['width'] for img in images]
        heights = [img['height'] for img in images]
        all_widths.extend(widths)
        all_heights.extend(heights)
        all_labels.extend([label] * len(images))

# Plot 1: Width Distribution
ax = axes[0]
df_res = pd.DataFrame({'Width': all_widths, 'Dataset': all_labels})
for dataset in df_res['Dataset'].unique():
    data = df_res[df_res['Dataset'] == dataset]['Width']
    ax.hist(data, bins=20, alpha=0.5, label=dataset, edgecolor='black')

ax.set_xlabel('Image Width (pixels)', fontsize=11, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax.set_title('Width Distribution', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# Plot 2: Height Distribution
ax = axes[1]
df_res = pd.DataFrame({'Height': all_heights, 'Dataset': all_labels})
for dataset in df_res['Dataset'].unique():
    data = df_res[df_res['Dataset'] == dataset]['Height']
    ax.hist(data, bins=20, alpha=0.5, label=dataset, edgecolor='black')

ax.set_xlabel('Image Height (pixels)', fontsize=11, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax.set_title('Height Distribution', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'resolution_analysis.png', dpi=150, bbox_inches='tight')
print(f"   Saved: resolution_analysis.png")
plt.show()

# =============================================================================
# VISUALIZATION 3: File Size Analysis
# =============================================================================

print("\nüìä Figure 3: File Size Analysis")

fig, ax = plt.subplots(figsize=(12, 6))

# Collect file size data
size_data = []
for images, label in [(ff_real, 'FF Real'), (ff_fake, 'FF Fake'),
                       (celebdf_real, 'Celeb Real'), (celebdf_fake, 'Celeb Fake')]:
    if images:
        sizes = [img['size'] / 1024 for img in images]  # Convert to KB
        size_data.append(sizes)
    else:
        size_data.append([0])

# Create box plot
bp = ax.boxplot(size_data, labels=['FF Real', 'FF Fake', 'Celeb Real', 'Celeb Fake'],
                patch_artist=True, notch=True, showfliers=True)

# Color the boxes
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_ylabel('File Size (KB)', fontsize=12, fontweight='bold')
ax.set_title('File Size Distribution by Dataset', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'file_size_analysis.png', dpi=150, bbox_inches='tight')
print(f"   Saved: file_size_analysis.png")
plt.show()

# =============================================================================
# EXPORT STATISTICS
# =============================================================================

print("\n" + "=" * 80)
print("5. EXPORTING STATISTICS")
print("=" * 80)

# Totals used by both the JSON export and the markdown report
total_real = len(ff_real) + len(celebdf_real)
total_fake = len(ff_fake) + len(celebdf_fake)

# Mean of the per-dataset aggregate scores. .get() matches how the quality
# table in the report reads them, so an entry without 'overall_score'
# counts as 0 instead of raising KeyError.
mean_quality = np.mean([m.get('overall_score', 0) for m in quality_metrics.values()])

# Compile all statistics
dataset_statistics = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'datasets': {
        'faceforensics': {
            'real': ff_real_stats,
            'fake': ff_fake_stats
        },
        'celebdf': {
            'real': celebdf_real_stats,
            'fake': celebdf_fake_stats
        },
        'synthetic_attacks': synthetic_stats
    },
    'quality_metrics': quality_metrics,
    'summary': {
        'total_images': total_real + total_fake,
        'total_real': total_real,
        'total_fake': total_fake,
        # Fake/Real ratio; 0 when there are no real images to compare against
        'balance_ratio': total_fake / total_real if total_real > 0 else 0
    }
}

# Save to JSON. default=str is a fallback that stringifies any value the
# encoder cannot serialise natively.
stats_file = RESULTS_DIR / 'dataset_statistics.json'
with open(stats_file, 'w', encoding='utf-8') as f:
    json.dump(dataset_statistics, f, indent=2, default=str)

print(f" Statistics exported to: {stats_file}")

# Create markdown summary
summary_md = f"""# Dataset Analysis Summary

**Generated:** {dataset_statistics['timestamp']}

## Overview

- **Total Images:** {dataset_statistics['summary']['total_images']}
- **Real Images:** {dataset_statistics['summary']['total_real']}
- **Fake Images:** {dataset_statistics['summary']['total_fake']}
- **Balance Ratio:** {dataset_statistics['summary']['balance_ratio']:.3f} (Fake/Real)

## Dataset Breakdown

### FaceForensics++
- Real: {ff_real_stats['count']} images
- Fake: {ff_fake_stats['count']} images
- Avg Resolution (Real): {int(ff_real_stats['avg_width']) if ff_real_stats['count'] > 0 else 0}x{int(ff_real_stats['avg_height']) if ff_real_stats['count'] > 0 else 0}
- Avg Resolution (Fake): {int(ff_fake_stats['avg_width']) if ff_fake_stats['count'] > 0 else 0}x{int(ff_fake_stats['avg_height']) if ff_fake_stats['count'] > 0 else 0}

### Celeb-DF
- Real: {celebdf_real_stats['count']} images
- Fake: {celebdf_fake_stats['count']} images
- Avg Resolution (Real): {int(celebdf_real_stats['avg_width']) if celebdf_real_stats['count'] > 0 else 0}x{int(celebdf_real_stats['avg_height']) if celebdf_real_stats['count'] > 0 else 0}
- Avg Resolution (Fake): {int(celebdf_fake_stats['avg_width']) if celebdf_fake_stats['count'] > 0 else 0}x{int(celebdf_fake_stats['avg_height']) if celebdf_fake_stats['count'] > 0 else 0}

### Synthetic Attacks
- Scenarios: {synthetic_stats['scenarios']}
- Avg Submissions/Attack: {synthetic_stats['avg_submissions_per_attack']:.1f}

## Quality Assessment

| Dataset | Overall Score |
|---------|---------------|
| FF Real | {quality_metrics.get('FF Real', {}).get('overall_score', 0):.3f} |
| FF Fake | {quality_metrics.get('FF Fake', {}).get('overall_score', 0):.3f} |
| Celeb Real | {quality_metrics.get('Celeb Real', {}).get('overall_score', 0):.3f} |
| Celeb Fake | {quality_metrics.get('Celeb Fake', {}).get('overall_score', 0):.3f} |

## Research Implications

1. **Dataset Balance:** {"Good" if 0.8 <= dataset_statistics['summary']['balance_ratio'] <= 1.2 else "Needs attention"}
2. **Sample Size:** {"Sufficient" if dataset_statistics['summary']['total_images'] >= 100 else "Limited"}
3. **Quality:** {"High" if mean_quality > 0.7 else "Acceptable"}

## Generated Figures

1. `dataset_distribution.png` - Dataset overview and statistics
2. `resolution_analysis.png` - Image resolution distributions
3. `file_size_analysis.png` - File size comparisons

---
*Analysis generated by corruption-reporting-prototype evaluation framework*
"""

summary_file = RESULTS_DIR / 'dataset_analysis_summary.md'
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write(summary_md)

print(f" Summary exported to: {summary_file}")

# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE")
print("=" * 80)

# Mean aggregate quality across datasets. .get() keeps the closing report
# from raising KeyError when an entry has no 'overall_score'; such an entry
# counts as 0.
average_quality_score = np.mean([m.get('overall_score', 0) for m in quality_metrics.values()])

print(f"""
Dataset Analysis Summary
========================

üìä Datasets Analyzed:
   ‚Ä¢ FaceForensics++: {len(ff_real) + len(ff_fake)} images ({len(ff_real)} real, {len(ff_fake)} fake)
   ‚Ä¢ Celeb-DF: {len(celebdf_real) + len(celebdf_fake)} images ({len(celebdf_real)} real, {len(celebdf_fake)} fake)
   ‚Ä¢ Synthetic Attacks: {synthetic_stats['scenarios']} scenarios
   ‚Ä¢ Total: {dataset_statistics['summary']['total_images']} images

üìà Quality Metrics:
   ‚Ä¢ Average Quality Score: {average_quality_score:.3f}
   ‚Ä¢ Dataset Balance Ratio: {dataset_statistics['summary']['balance_ratio']:.3f}

üìÅ Outputs Generated:
   ‚Ä¢ 3 visualization figures
   ‚Ä¢ JSON statistics file
   ‚Ä¢ Markdown summary report

 All analysis complete and saved!
""")

print("=" * 80)
