# Document Forgery Detection - Data Exploration

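This notebook explores document image datasets for forgery detection. Images are read from `../data/raw`, and each image's class (authentic, forged, or unknown) is inferred from keywords in its file path.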

## Objectives:
- Load and examine document images
- Analyze image properties and metadata
- Identify potential forgery indicators
- Visualize data distributions and patterns

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image, ExifTags
from scipy import stats

# Add src to path
sys.path.append('../src')
from features.build_features import DocumentFeatureExtractor
from visualization.visualize import DocumentForgeryVisualizer

plt.style.use('seaborn-v0_8')
%matplotlib inline

print("Libraries imported successfully!")

## 1. Dataset Overview

In [None]:
# Root directory that holds the raw document images (relative to this notebook)
DATA_PATH = '../data/raw'
data_dir = Path(DATA_PATH)

# Suffixes treated as images; compared case-insensitively below
image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']

# Walk the tree once and compare lower-cased suffixes. Globbing every extension
# in both lower and upper case returns each file twice on case-insensitive
# filesystems and misses mixed-case suffixes such as '.Jpg'. Sorting makes the
# "first N images" samples used by later cells reproducible between runs.
image_files = sorted(
    path for path in data_dir.rglob('*')
    if path.suffix.lower() in image_extensions and path.is_file()
)

print(f"Found {len(image_files)} image files")

if len(image_files) > 0:
    print(f"\nSample files:")
    for i, img_path in enumerate(image_files[:10]):
        print(f"  {i+1}. {img_path}")
else:
    print("\n⚠️ No images found. Please add images to the data/raw directory.")

## 2. Image Properties Analysis

In [None]:
def analyze_image_properties(image_files, max_images=50):
    """Collect basic per-image properties into a DataFrame.

    Parameters
    ----------
    image_files : sequence of pathlib.Path
        Image paths; only the first ``max_images`` entries are inspected.
    max_images : int, optional
        Upper bound on the number of images analysed (default 50).

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed image with the columns
        ``filename``, ``path``, ``width``, ``height``, ``aspect_ratio``,
        ``file_size_kb``, ``format``, ``mode``, ``has_exif`` and ``class``,
        plus ``mean_intensity``, ``std_intensity`` and ``brightness`` when
        OpenCV could decode the file. Images that raise while being processed
        are reported on stdout and skipped, so the frame is empty (and has no
        columns) when nothing could be processed.
    """
    authentic_words = ('authentic', 'real', 'original', 'genuine')
    forged_words = ('forged', 'fake', 'manipulated', 'tampered')
    properties = []

    for img_path in image_files[:max_images]:
        try:
            # The context manager releases the file handle as soon as the
            # header data has been read.
            with Image.open(img_path) as img:
                prop = {
                    'filename': img_path.name,
                    'path': str(img_path),
                    'width': img.width,
                    'height': img.height,
                    'aspect_ratio': img.width / img.height,
                    'file_size_kb': img_path.stat().st_size / 1024,
                    'format': img.format,
                    'mode': img.mode,
                    # Public API available on every image class; the private
                    # _getexif() is missing on some format plugins, which made
                    # those images fail and get dropped from the analysis.
                    'has_exif': bool(img.getexif())
                }

            # Class label is inferred from keywords anywhere in the path
            path_str = str(img_path).lower()
            if any(word in path_str for word in authentic_words):
                prop['class'] = 'authentic'
            elif any(word in path_str for word in forged_words):
                prop['class'] = 'forged'
            else:
                prop['class'] = 'unknown'

            # Intensity statistics; cv2.imread returns None for files it
            # cannot decode, in which case these columns stay missing.
            img_cv = cv2.imread(str(img_path))
            if img_cv is not None:
                gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
                prop['mean_intensity'] = np.mean(gray)
                prop['std_intensity'] = np.std(gray)
                prop['brightness'] = np.mean(img_cv)

            properties.append(prop)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan
            print(f"Error processing {img_path}: {e}")

    return pd.DataFrame(properties)

# Build the properties table (empty when there is nothing to analyse)
if len(image_files) > 0:
    print("Analyzing image properties...")
    df_props = analyze_image_properties(image_files)
else:
    df_props = pd.DataFrame()

# A frame built from zero records has no columns, so the summary is only
# printed when at least one image was processed successfully; indexing
# df_props['class'] on an empty frame would raise KeyError.
if not df_props.empty:
    print(f"\n📊 Dataset Summary:")
    print(f"Total images analyzed: {len(df_props)}")
    print(f"\nClass distribution:")
    print(df_props['class'].value_counts())

    print(f"\nImage dimensions:")
    print(df_props[['width', 'height']].describe())
elif len(image_files) > 0:
    print("\n⚠️ None of the images could be analyzed.")

## 3. Visual Analysis of Image Properties

In [None]:
# Six-panel overview of the basic image properties collected in df_props
if not df_props.empty:
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Image dimensions, coloured by inferred class
    class_colors = {'authentic': 'blue', 'forged': 'red', 'unknown': 'gray'}
    dims_ax = axes[0, 0]
    dims_ax.scatter(df_props['width'], df_props['height'],
                    c=df_props['class'].map(class_colors),
                    alpha=0.6)
    dims_ax.set_xlabel('Width (pixels)')
    dims_ax.set_ylabel('Height (pixels)')
    dims_ax.set_title('Image Dimensions by Class')
    dims_ax.grid(True, alpha=0.3)

    # 2. File size distribution (labelled classes only)
    size_ax = axes[0, 1]
    for class_name in df_props['class'].unique():
        if class_name != 'unknown':
            class_data = df_props[df_props['class'] == class_name]
            size_ax.hist(class_data['file_size_kb'], bins=20, alpha=0.6, label=class_name)
    size_ax.set_xlabel('File Size (KB)')
    size_ax.set_ylabel('Frequency')
    size_ax.set_title('File Size Distribution')
    # When every image is 'unknown' nothing was drawn; calling legend() then
    # only produces a "no artists with labels" warning, so skip it.
    legend_handles, _ = size_ax.get_legend_handles_labels()
    if legend_handles:
        size_ax.legend()

    # 3-5. Per-class box plots. The intensity columns are absent when OpenCV
    # could not decode any image, hence the column check.
    boxplot_panels = [
        ('aspect_ratio', 'Aspect Ratio by Class', axes[0, 2]),
        ('mean_intensity', 'Mean Intensity by Class', axes[1, 0]),
        ('std_intensity', 'Intensity Std Dev by Class', axes[1, 1]),
    ]
    for column, title, panel_ax in boxplot_panels:
        if column in df_props.columns:
            sns.boxplot(data=df_props, x='class', y=column, ax=panel_ax)
            panel_ax.set_title(title)
            panel_ax.tick_params(axis='x', rotation=45)

    # 6. Format distribution
    format_counts = df_props['format'].value_counts()
    axes[1, 2].pie(format_counts.values, labels=format_counts.index, autopct='%1.1f%%')
    axes[1, 2].set_title('Image Format Distribution')

    plt.tight_layout()
    plt.show()

## 4. Feature-based Analysis

In [None]:
# Extract the advanced feature set for a small sample and label every row.
if len(image_files) > 5:
    print("Extracting advanced features for analysis...")

    # Keywords used to infer the class from the file path. These are the same
    # lists analyze_image_properties uses, so an image gets the same label in
    # df_props and df_features ('genuine' and 'tampered' were missing here).
    authentic_words = ('authentic', 'real', 'original', 'genuine')
    forged_words = ('forged', 'fake', 'manipulated', 'tampered')

    # Extract features from a sample of images
    extractor = DocumentFeatureExtractor()
    sample_files = image_files[:20]  # Analyze first 20 images

    features_data = []
    for img_path in sample_files:
        print(f"Processing {img_path.name}...")
        features = extractor.extract_all_features(str(img_path))

        # A falsy result is treated as "nothing extracted" and skipped
        if features:
            features['filename'] = img_path.name
            features['filepath'] = str(img_path)

            # Determine class
            path_str = str(img_path).lower()
            if any(word in path_str for word in authentic_words):
                features['class'] = 'authentic'
            elif any(word in path_str for word in forged_words):
                features['class'] = 'forged'
            else:
                features['class'] = 'unknown'

            features_data.append(features)

    if features_data:
        df_features = pd.DataFrame(features_data)
        print(f"\n✅ Features extracted for {len(df_features)} images")
        # filename, filepath and class are bookkeeping columns, not features
        print(f"Feature count: {len(df_features.columns) - 3}")

        # Show feature statistics by class
        numeric_cols = df_features.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) > 0:
            print(f"\n📊 Feature summary by class:")
            summary = df_features.groupby('class')[numeric_cols].mean()
            print(summary.round(4))
    else:
        df_features = pd.DataFrame()
        print("❌ No features could be extracted")
else:
    df_features = pd.DataFrame()
    print("⚠️ Not enough images for feature analysis")

## 5. Advanced Feature Visualization

In [None]:
# Distribution plots for up to four well-known features, followed by a
# correlation heatmap over every well-known feature that was extracted.
if not df_features.empty and len(df_features) > 5:
    # Candidate columns worth a closer look
    interesting_features = [
        'mean', 'std', 'entropy', 'sobel_mean', 'canny_edge_density',
        'laplacian_var', 'fft_energy', 'lbp_contrast', 'glcm_contrast_mean'
    ]

    # Keep only the candidates the extractor actually produced
    available_features = [name for name in interesting_features
                          if name in df_features.columns]

    if len(available_features) < 4:
        print(f"⚠️ Not enough features for advanced visualization")
        print(f"Available features: {list(df_features.columns)}")
    else:
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Box plots need more than one class; otherwise fall back to histograms
        has_multiple_classes = len(df_features['class'].unique()) > 1

        # axes.flat walks the grid row by row, one panel per feature
        for panel, feature in zip(axes.flat, available_features[:4]):
            if has_multiple_classes:
                sns.boxplot(data=df_features, x='class', y=feature, ax=panel)
            else:
                df_features[feature].hist(ax=panel, bins=10)

            panel.set_title(f'{feature} Distribution')
            panel.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        # Correlation analysis (at least four features are available here)
        print("\n🔍 Feature Correlation Analysis:")
        corr_matrix = df_features[available_features].corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f')
        plt.title('Feature Correlation Matrix')
        plt.tight_layout()
        plt.show()

## 6. Statistical Analysis

In [None]:
# Always (re)define the result list so later cells never see a missing or
# stale value when the analysis below is skipped.
significant_features = []

if not df_features.empty and len(df_features) > 5:
    # Statistical tests to find discriminative features
    print("🧮 Statistical Analysis of Features:")

    # Filter to classes with enough samples
    class_counts = df_features['class'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index.tolist()

    if len(valid_classes) >= 2:
        # Mann-Whitney U compares exactly two groups. Pick them explicitly,
        # preferring labelled classes over 'unknown', instead of silently
        # testing whichever two classes happen to be most frequent. The sort
        # is stable, so frequency order is kept among labelled classes.
        compared_classes = sorted(valid_classes, key=lambda cls: cls == 'unknown')[:2]
        print(f"Comparing classes: {compared_classes[0]} vs {compared_classes[1]}")

        # Restrict data (and the plots below) to the two classes being tested
        df_filtered = df_features[df_features['class'].isin(compared_classes)]
        numeric_features = df_filtered.select_dtypes(include=[np.number]).columns

        for feature in numeric_features:
            try:
                # One NaN-free sample array per compared class
                groups = [
                    df_filtered.loc[df_filtered['class'] == cls, feature].dropna().to_numpy()
                    for cls in compared_classes
                ]

                if all(len(g) > 1 for g in groups):
                    # Use Mann-Whitney U test (non-parametric)
                    statistic, p_value = stats.mannwhitneyu(groups[0], groups[1],
                                                            alternative='two-sided')

                    # NOTE: p-values are not corrected for multiple testing
                    if p_value < 0.05:  # Significant difference
                        significant_features.append({
                            'feature': feature,
                            'p_value': p_value,
                            'statistic': statistic
                        })

            except Exception as e:
                # Best effort: report the feature and continue with the rest
                print(f"   Skipping {feature}: {e}")

        if significant_features:
            # Sort by p-value
            significant_features.sort(key=lambda x: x['p_value'])

            print(f"\n📈 Most discriminative features (p < 0.05):")
            for i, feat in enumerate(significant_features[:10]):
                print(f"{i+1:2d}. {feat['feature']:<25} (p = {feat['p_value']:.4f})")

            # Visualize top discriminative features
            if len(significant_features) >= 2:
                top_results = significant_features[:4]
                top_features = [f['feature'] for f in top_results]

                fig, axes = plt.subplots(2, 2, figsize=(15, 10))
                axes = axes.flatten()

                for panel, result in zip(axes, top_results):
                    sns.boxplot(data=df_filtered, x='class', y=result['feature'], ax=panel)
                    panel.set_title(f'{result["feature"]}\n(p = {result["p_value"]:.4f})')
                    panel.tick_params(axis='x', rotation=45)

                # Hide panels left over when fewer than four features qualify
                for panel in axes[len(top_results):]:
                    panel.set_visible(False)

                plt.suptitle('Most Discriminative Features', fontsize=16, fontweight='bold')
                plt.tight_layout()
                plt.show()
        else:
            print("⚠️ No statistically significant features found")

    else:
        print("⚠️ Need at least 2 samples per class for statistical analysis")
else:
    print("⚠️ No feature data available for statistical analysis")

## 7. Summary and Insights

In [None]:
# Text summary of everything the earlier cells computed
print("📋 DATA EXPLORATION SUMMARY")
print("=" * 50)

print(f"📁 Total images found: {len(image_files)}")

if not df_props.empty:
    print(f"\n📊 Basic Properties:")
    print(f"   - Image formats: {', '.join(df_props['format'].unique())}")
    print(f"   - Size range: {df_props['file_size_kb'].min():.1f} - {df_props['file_size_kb'].max():.1f} KB")
    # Minimum/maximum are taken per axis, not per image
    print(f"   - Resolution range: {df_props['width'].min()}x{df_props['height'].min()} to {df_props['width'].max()}x{df_props['height'].max()}")

    class_dist = df_props['class'].value_counts()
    print(f"\n🏷️ Class Distribution:")
    for class_name, count in class_dist.items():
        print(f"   - {class_name}: {count} images ({count/len(df_props)*100:.1f}%)")

if not df_features.empty:
    print(f"\n🔧 Advanced Features:")
    # filename, filepath and class are bookkeeping columns, not features
    print(f"   - Features extracted: {len(df_features.columns) - 3}")
    print(f"   - Samples analyzed: {len(df_features)}")

    # The statistical-analysis cell may have been skipped, so look the result
    # up in the notebook namespace instead of assuming the name exists.
    top_results = globals().get('significant_features')
    if top_results:
        print(f"   - Discriminative features found: {len(top_results)}")
        print(f"   - Most discriminative: {top_results[0]['feature']} (p={top_results[0]['p_value']:.4f})")

print("\n💡 Key Insights:")

if not df_props.empty:
    # Analyze potential issues
    insights = []

    if df_props['class'].value_counts().get('unknown', 0) > 0:
        insights.append("Some images couldn't be automatically classified - consider better file organization")

    if len(df_props['format'].unique()) > 2:
        insights.append("Multiple image formats detected - consider standardizing")

    # Coefficient of variation of the file sizes
    size_cv = df_props['file_size_kb'].std() / df_props['file_size_kb'].mean()
    if size_cv > 1.0:
        insights.append("High variation in file sizes - may indicate different compression levels")

    # Round before counting: exact float ratios differ for images that are a
    # single pixel apart, which would flag almost any real dataset.
    distinct_aspect_ratios = df_props['aspect_ratio'].round(2).nunique()
    if distinct_aspect_ratios > 5:
        insights.append("Many different aspect ratios - documents may need standardized preprocessing")

    if insights:
        for i, insight in enumerate(insights, 1):
            print(f"   {i}. {insight}")
    else:
        print("   - Dataset appears well-structured for analysis")

else:
    print("   - Add document images to data/raw/ directory to begin analysis")

print("\n🚀 Next Steps:")
print("   1. Ensure balanced dataset with enough samples per class")
print("   2. Standardize image preprocessing (size, format, etc.)")
print("   3. Extract features for all images in the dataset")
print("   4. Train models using the most discriminative features")
print("   5. Validate performance on held-out test data")