# 01 - Exploratory Data Analysis

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IvanNece/Detection-of-Anomalies-with-Localization/blob/main/notebooks/01_data_exploration.ipynb)

**Project**: Detection of Anomalies with Localization  
**Dataset**: MVTec AD (Hazelnut, Carpet, Zipper)  
**Phase**: Data Exploration

---

## Objectives

1. Analyze dataset structure and organization
2. Count images per class and split (train/test, normal/anomalous)
3. Visualize representative samples (normal and anomalous)
4. Compute image dimension statistics
5. Analyze defect type distributions
6. Visualize ground truth masks

In [None]:
# ============================================================
# SETUP - Mount Google Drive & Clone Repository
# ============================================================

from google.colab import drive
from pathlib import Path

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Done!\n")

# Clone repository
print("Cloning repository...")
!git clone https://github.com/IvanNece/Detection-of-Anomalies-with-Localization.git
print("Done!\n")

# Setup paths
PROJECT_ROOT = Path('/content/Detection-of-Anomalies-with-Localization')
DRIVE_DATASET = Path('/content/drive/MyDrive/mvtec_ad')
LOCAL_DATASET = Path('/content/mvtec_ad')
OUTPUT_DIR = Path('/content/drive/MyDrive/anomaly_detection_project/eda_outputs')

# Create output directory
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Copy dataset from Drive to local (faster access)
print("Copying dataset from Drive to local storage...")
if not LOCAL_DATASET.exists():
    !mkdir -p /content/mvtec_ad
    !cp -r /content/drive/MyDrive/mvtecad/hazelnut /content/mvtec_ad/
    !cp -r /content/drive/MyDrive/mvtecad/carpet /content/mvtec_ad/
    !cp -r /content/drive/MyDrive/mvtecad/zipper /content/mvtec_ad/
    print("Done!")
else:
    print("Dataset already in local storage")

print("\n" + "="*70)
print("SETUP COMPLETE")
print("="*70)
print(f"Project:  {PROJECT_ROOT}")
print(f"Dataset:  {LOCAL_DATASET}")
print(f"Outputs:  {OUTPUT_DIR}")
print("="*70)


In [None]:
# ============================================================
# IMPORTS
# ============================================================

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from collections import defaultdict

# Put the project root first on sys.path so the `src` package below can be imported
sys.path.insert(0, str(PROJECT_ROOT))

# Project helpers: seeding utility and configuration loader
from src.utils.reproducibility import set_seed
from src.utils.config import Config

# Plotting settings: matplotlib style sheet, seaborn colour palette, inline figures
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("All imports loaded!")


In [None]:
# ============================================================
# CONFIGURATION
# ============================================================

# Classes covered by this analysis
CLASSES = ['hazelnut', 'carpet', 'zipper']

# Read the experiment configuration shipped with the project and seed from it
config = Config.load(PROJECT_ROOT.joinpath('configs', 'experiment_config.yaml'))
set_seed(config.seed)

print("Configuration loaded")
print(f"Analyzing classes: {CLASSES}")
print(f"Random seed: {config.seed}")


## 1. Dataset Structure Analysis

In [None]:
def explore_class_structure(class_name: str, dataset_root: "Path | None" = None) -> dict:
    """
    Explore directory structure for a given class.

    Expected layout under ``<dataset_root>/<class_name>``:
    ``train/good``, ``test/good``, ``test/<defect_type>`` and
    ``ground_truth/<defect_type>``. Only ``*.png`` files are collected.
    Missing folders yield empty lists/dicts rather than errors.

    Args:
        class_name: Name of the class folder (e.g. 'hazelnut')
        dataset_root: Folder containing the class folders; defaults to the
            notebook-level LOCAL_DATASET when omitted

    Returns:
        Dictionary with keys 'class', 'train_good', 'test_good' (sorted lists
        of paths), 'test_defects' and 'ground_truth' (defect type -> sorted
        list of paths, with defect types in alphabetical order)
    """
    root = LOCAL_DATASET if dataset_root is None else Path(dataset_root)
    class_path = root / class_name
    test_path = class_path / 'test'

    def _pngs(folder):
        # Sorted so images and their masks line up by index across folders
        return sorted(folder.glob('*.png')) if folder.is_dir() else []

    def _subdirs(folder):
        # iterdir() order is filesystem-dependent; sort it so that the defect
        # order (and any "first N defect types" selection) is reproducible
        if not folder.is_dir():
            return []
        return sorted(entry for entry in folder.iterdir() if entry.is_dir())

    return {
        'class': class_name,
        # Train images (all normal)
        'train_good': _pngs(class_path / 'train' / 'good'),
        # Test normal images
        'test_good': _pngs(test_path / 'good'),
        # Test anomalous images (by defect type)
        'test_defects': {
            defect_dir.name: _pngs(defect_dir)
            for defect_dir in _subdirs(test_path)
            if defect_dir.name != 'good'
        },
        # Ground truth masks (by defect type)
        'ground_truth': {
            defect_dir.name: _pngs(defect_dir)
            for defect_dir in _subdirs(class_path / 'ground_truth')
        },
    }

# Scan every class once, keeping the result and reporting its counts
dataset_structure = {}
for class_name in CLASSES:
    print(f"Analyzing {class_name}...")
    structure = dataset_structure[class_name] = explore_class_structure(class_name)

    # Headline counts first, then one line per defect type
    report = [
        f"  Train (normal): {len(structure['train_good'])}",
        f"  Test (normal):  {len(structure['test_good'])}",
        f"  Defect types:   {len(structure['test_defects'])}",
    ]
    report.extend(
        f"    - {defect}: {len(imgs)}"
        for defect, imgs in structure['test_defects'].items()
    )
    print("\n".join(report))


## 2. Image Count Statistics

In [None]:
def create_count_dataframe(dataset_structure: dict) -> pd.DataFrame:
    """
    Create comprehensive dataframe with image counts.

    One row is produced per class for the normal train images, one for the
    normal test images, and one per defect type for the anomalous test images.

    Args:
        dataset_structure: Mapping of class name to a structure dictionary
            providing 'train_good', 'test_good' (sequences) and
            'test_defects' (defect type -> sequence)

    Returns:
        DataFrame with columns Class, Split, Type, Defect, Count. The columns
        are present even when the input is empty, so downstream filters such
        as ``df['Type'] == 'Normal'`` keep working.
    """
    columns = ['Class', 'Split', 'Type', 'Defect', 'Count']
    rows = []

    for class_name, structure in dataset_structure.items():
        # Normal images
        rows.append((class_name, 'Train', 'Normal', 'good', len(structure['train_good'])))
        rows.append((class_name, 'Test', 'Normal', 'good', len(structure['test_good'])))

        # Anomalous images by defect type
        rows.extend(
            (class_name, 'Test', 'Anomalous', defect, len(imgs))
            for defect, imgs in structure['test_defects'].items()
        )

    # Pin the dtype so an empty table still sums and formats as integers
    return pd.DataFrame(rows, columns=columns).astype({'Count': 'int64'})

# Build the count table for every class / split / defect type
df_counts = create_count_dataframe(dataset_structure)

section_rule = "=" * 60

# Full table
print(f"\n{section_rule}")
print("DATASET SUMMARY")
print(section_rule)
print(df_counts.to_string(index=False))

# Totals per class, separated by image type
print(f"\n{section_rule}")
print("SUMMARY BY CLASS")
print(section_rule)
summary = df_counts.groupby(['Class', 'Type'])['Count'].sum().reset_index()
print(summary.to_string(index=False))

# Grand totals across all classes
print(f"\n{section_rule}")
is_normal = df_counts['Type'] == 'Normal'
is_anomalous = df_counts['Type'] == 'Anomalous'
total_normal = df_counts.loc[is_normal, 'Count'].sum()
total_anomalous = df_counts.loc[is_anomalous, 'Count'].sum()
total_images = total_normal + total_anomalous
print(f"Total Normal Images:    {total_normal}")
print(f"Total Anomalous Images: {total_anomalous}")
print(f"Total Images:           {total_images}")
print(f"Anomaly Ratio:          {total_anomalous / total_images:.2%}")

### Visualization: Image Counts

In [None]:
# Plot 1: Normal vs Anomalous by Class
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Stacked bar chart.
# unstack() sorts the new columns alphabetically ('Anomalous' before 'Normal'),
# so the column order is pinned explicitly to keep green = Normal and
# red = Anomalous; a positional colour list alone would swap them.
type_colors = {'Normal': '#2ecc71', 'Anomalous': '#e74c3c'}
pivot_data = (
    df_counts.groupby(['Class', 'Type'])['Count'].sum()
    .unstack(fill_value=0)
    .reindex(columns=list(type_colors), fill_value=0)
)
pivot_data.plot(kind='bar', stacked=True, ax=axes[0], color=list(type_colors.values()))
axes[0].set_title('Image Distribution: Normal vs Anomalous', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Class', fontsize=12)
axes[0].set_ylabel('Number of Images', fontsize=12)
axes[0].legend(title='Type', loc='upper right')
axes[0].grid(axis='y', alpha=0.3)
plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)

# Train vs Test split (columns ordered Train, Test to match the title)
split_colors = {'Train': '#3498db', 'Test': '#9b59b6'}
split_data = (
    df_counts[df_counts['Type'] == 'Normal']
    .groupby(['Class', 'Split'])['Count'].sum()
    .unstack(fill_value=0)
    .reindex(columns=list(split_colors), fill_value=0)
)
split_data.plot(kind='bar', ax=axes[1], color=list(split_colors.values()))
axes[1].set_title('Normal Images: Train vs Test Split', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Class', fontsize=12)
axes[1].set_ylabel('Number of Images', fontsize=12)
axes[1].legend(title='Split', loc='upper right')
axes[1].grid(axis='y', alpha=0.3)
plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=0)

plt.tight_layout()
plt.show()

# Plot 2: Defect Types Distribution
# One panel per analysed class; squeeze=False keeps `axes` two-dimensional so
# the loop also works when CLASSES holds a single entry.
fig, axes = plt.subplots(1, len(CLASSES), figsize=(16, 5), squeeze=False)

for ax, class_name in zip(axes[0], CLASSES):
    class_data = df_counts[(df_counts['Class'] == class_name) & (df_counts['Type'] == 'Anomalous')]

    ax.barh(class_data['Defect'], class_data['Count'], color='#e67e22')
    ax.set_title(f'{class_name.capitalize()} - Defect Types', fontsize=12, fontweight='bold')
    ax.set_xlabel('Number of Images', fontsize=10)
    ax.grid(axis='x', alpha=0.3)

    # Add value labels just to the right of each bar
    for i, v in enumerate(class_data['Count']):
        ax.text(v + 0.5, i, str(v), va='center', fontsize=9)

plt.tight_layout()
plt.show()

## 3. Image Dimension Analysis

In [None]:
def analyze_image_dimensions(dataset_structure: dict, sample_size: int = 50) -> pd.DataFrame:
    """
    Analyze image dimensions across dataset.

    For every class the first ``sample_size`` entries of 'train_good' are
    opened and their pixel dimensions recorded. The selection is a prefix of
    the list, not a random sample.

    Args:
        dataset_structure: Mapping of class name to a structure dictionary
            providing 'train_good' (sequence of image paths)
        sample_size: Maximum number of train images inspected per class

    Returns:
        DataFrame with columns Class, Width, Height, Aspect_Ratio, Megapixels
        (one row per inspected image). The columns are present even when no
        image was inspected.
    """
    columns = ['Class', 'Width', 'Height', 'Aspect_Ratio', 'Megapixels']
    rows = []

    for class_name, structure in dataset_structure.items():
        print(f"Analyzing dimensions for {class_name}...")

        for img_path in structure['train_good'][:sample_size]:
            # Only the size is needed; the context manager releases the file
            # handle instead of leaving one open per inspected image.
            with Image.open(img_path) as img:
                width, height = img.size

            rows.append({
                'Class': class_name,
                'Width': width,
                'Height': height,
                'Aspect_Ratio': width / height,
                'Megapixels': (width * height) / 1e6
            })

    return pd.DataFrame(rows, columns=columns)

# Measure up to 50 train images per class
df_dimensions = analyze_image_dimensions(dataset_structure, sample_size=50)

# Per-class descriptive statistics, rounded for display
stats_rule = "=" * 60
print(f"\n{stats_rule}\nIMAGE DIMENSION STATISTICS\n{stats_rule}")
print(df_dimensions.groupby('Class').describe().round(2))

### Visualization: Image Dimensions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, box_ax = axes

# Left panel: width against height, one point cloud per class
for class_name in CLASSES:
    class_data = df_dimensions.loc[df_dimensions['Class'] == class_name]
    scatter_ax.scatter(
        class_data['Width'],
        class_data['Height'],
        label=class_name,
        alpha=0.6,
        s=100,
    )

scatter_ax.set_title('Image Resolution Distribution', fontsize=14, fontweight='bold')
scatter_ax.set_xlabel('Width (pixels)', fontsize=12)
scatter_ax.set_ylabel('Height (pixels)', fontsize=12)
scatter_ax.legend()
scatter_ax.grid(alpha=0.3)

# Right panel: aspect ratio box plot grouped by class
df_dimensions.boxplot(column='Aspect_Ratio', by='Class', ax=box_ax)
box_ax.set_title('Aspect Ratio Distribution', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Class', fontsize=12)
box_ax.set_ylabel('Aspect Ratio', fontsize=12)
plt.suptitle('')  # blank out the figure-level title

plt.tight_layout()
plt.show()

# Tabular summary of the same measurements
table_rule = "=" * 60
print(f"\n{table_rule}")
print("RESOLUTION SUMMARY")
print(table_rule)
pixel_stats = ['mean', 'std', 'min', 'max']
resolution_summary = (
    df_dimensions.groupby('Class')
    .agg({
        'Width': pixel_stats,
        'Height': pixel_stats,
        'Aspect_Ratio': ['mean', 'std'],
    })
    .round(2)
)
print(resolution_summary)

## 4. Visual Inspection: Sample Images

In [None]:
def _show_image_row(fig_title, labelled_paths):
    """Draw one row of images, each captioned with its label and pixel size."""
    # squeeze=False keeps `axes` two-dimensional even for a single image
    fig, axes = plt.subplots(1, len(labelled_paths), figsize=(15, 5), squeeze=False)
    fig.suptitle(fig_title, fontsize=14, fontweight='bold')

    for ax, (label, img_path) in zip(axes[0], labelled_paths):
        with Image.open(img_path) as img:
            width, height = img.size
            # Convert to RGB so single-channel images are drawn in grey
            # rather than through matplotlib's default colour map
            ax.imshow(img.convert('RGB'))
        ax.axis('off')
        ax.set_title(f"{label}\n{width}x{height}", fontsize=10)

    plt.tight_layout()
    plt.show()


def visualize_samples(dataset_structure: dict, n_samples: int = 3):
    """
    Visualize sample images from each class.

    Per class, two figures are shown: normal train images picked at evenly
    spaced positions in 'train_good', and the first image of each of the
    first ``n_samples`` defect types in 'test_defects'. A figure is skipped
    when there is nothing to show for it.

    Args:
        dataset_structure: Mapping of class name to a structure dictionary
            providing 'train_good' and 'test_defects'
        n_samples: Number of samples per category
    """
    for class_name, structure in dataset_structure.items():
        print(f"\n{'='*60}")
        print(f"CLASS: {class_name.upper()}")
        print(f"{'='*60}")

        # Normal images (train), spread evenly over the list; never more
        # panels than there are images
        train_images = structure['train_good']
        n_normal = min(n_samples, len(train_images))
        if n_normal > 0:
            normal_picks = [
                (f"Sample {idx + 1}", train_images[idx * len(train_images) // n_normal])
                for idx in range(n_normal)
            ]
            _show_image_row(f'{class_name.capitalize()} - Normal Images (Train)', normal_picks)

        # Anomalous images (first image of each selected defect type);
        # defect types without any image are left out
        defect_types = list(structure['test_defects'].keys())[:max(n_samples, 0)]
        defect_picks = [
            (f"Defect: {defect}", structure['test_defects'][defect][0])
            for defect in defect_types
            if structure['test_defects'][defect]
        ]
        if defect_picks:
            _show_image_row(f'{class_name.capitalize()} - Anomalous Images', defect_picks)

# Show normal train images and one image for each of up to three defect types, per class
visualize_samples(dataset_structure, n_samples=3)

## 5. Ground Truth Mask Visualization

In [None]:
def visualize_masks(dataset_structure: dict, n_samples: int = 2):
    """
    Visualize anomalous images with their ground truth masks.

    For each of the first ``n_samples`` defect types of a class, a
    three-panel figure is shown: the first anomalous image, the first ground
    truth mask, and the image with masked pixels painted red. Defect types
    without images or without masks are skipped.

    Args:
        dataset_structure: Mapping of class name to a structure dictionary
            providing 'test_defects' and 'ground_truth'
        n_samples: Number of defect types to show per class
    """
    for class_name, structure in dataset_structure.items():
        print(f"\n{'='*60}")
        print(f"MASKS: {class_name.upper()}")
        print(f"{'='*60}")

        defect_types = list(structure['test_defects'].keys())[:n_samples]

        for defect in defect_types:
            images = structure['test_defects'][defect]
            masks = structure['ground_truth'].get(defect, [])
            if not images or not masks:
                continue

            # NOTE(review): pairs the first image with the first mask, which
            # assumes both lists are sorted the same way - confirm for new data.
            with Image.open(images[0]) as img:
                # Force three channels: the red overlay below cannot be
                # assigned into a single-channel (greyscale) array
                img_array = np.array(img.convert('RGB'))
            with Image.open(masks[0]) as mask:
                # Force one channel so the boolean mask is two-dimensional
                mask_array = np.array(mask.convert('L'))

            fig, axes = plt.subplots(1, 3, figsize=(15, 5))
            fig.suptitle(f'{class_name.capitalize()} - Defect: {defect}',
                        fontsize=14, fontweight='bold')

            # Image
            axes[0].imshow(img_array)
            axes[0].axis('off')
            axes[0].set_title('Anomalous Image', fontsize=12)

            # Mask
            axes[1].imshow(mask_array, cmap='gray')
            axes[1].axis('off')
            axes[1].set_title('Ground Truth Mask', fontsize=12)

            # Overlay
            overlay = img_array.copy()
            overlay[mask_array > 0] = [255, 0, 0]  # Red overlay on defects
            axes[2].imshow(overlay)
            axes[2].axis('off')
            axes[2].set_title('Overlay (Defects in Red)', fontsize=12)

            plt.tight_layout()
            plt.show()

# Show image / mask / overlay panels for up to two defect types per class
visualize_masks(dataset_structure, n_samples=2)

## 6. Key Findings & Conclusions

In [None]:
heavy_rule = "=" * 80
light_rule = "-" * 40

print("\n" + heavy_rule)
print("KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS")
print(heavy_rule)

# 1. Class balance: normal vs anomalous totals and the anomaly share per class
print("\n1. CLASS DISTRIBUTION")
print(light_rule)
for class_name in CLASSES:
    class_data = df_counts.loc[df_counts['Class'] == class_name]
    n_normal = class_data.loc[class_data['Type'] == 'Normal', 'Count'].sum()
    n_anomalous = class_data.loc[class_data['Type'] == 'Anomalous', 'Count'].sum()
    ratio = n_anomalous / (n_normal + n_anomalous) * 100
    print(f"  {class_name.capitalize():10s}: {n_normal:3d} normal, {n_anomalous:3d} anomalous ({ratio:.1f}% anomaly rate)")

# 2. Image dimensions: mean width and height of the measured images
print("\n2. IMAGE DIMENSIONS")
print(light_rule)
for class_name in CLASSES:
    class_dims = df_dimensions.loc[df_dimensions['Class'] == class_name]
    avg_w = class_dims['Width'].mean()
    avg_h = class_dims['Height'].mean()
    print(f"  {class_name.capitalize():10s}: {avg_w:.0f} x {avg_h:.0f} pixels (avg)")

# 3. Defect diversity: number of defect types and the first five names
print("\n3. DEFECT TYPE DIVERSITY")
print(light_rule)
for class_name in CLASSES:
    defect_names = list(dataset_structure[class_name]['test_defects'].keys())
    n_defects = len(defect_names)
    print(f"  {class_name.capitalize():10s}: {n_defects} defect types - {', '.join(defect_names[:5])}")

# 4. Recommendations
print("\n4. RECOMMENDATIONS FOR PREPROCESSING")
print(light_rule)
for recommendation in (
    "  - Resize all images to 224x224 (ResNet standard)",
    "  - Maintain aspect ratio during resizing to avoid distortion",
    "  - Apply ImageNet normalization for pre-trained backbone",
    "  - Consider data augmentation for anomalous samples (limited quantity)",
    "  - Ensure ground truth masks are resized consistently with images",
):
    print(recommendation)

print("\n" + heavy_rule)
print("EDA COMPLETE - Ready for data preparation phase")
print(heavy_rule)

## Next Steps

Proceed to **Notebook 02: Data Preparation** to:
1. Implement data splitting logic (Train/Val/Test)
2. Create MVTecDataset class
3. Implement preprocessing transforms
4. Save split configurations for reproducibility

## Save Results to Google Drive

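Running the cell below writes the count table, the image-dimension table, a text report, and a metadata JSON file to the Google Drive output directory, so they persist after the Colab session ends.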

In [None]:
# ============================================================
# SAVE RESULTS TO GOOGLE DRIVE
# ============================================================

import json

print("Saving results to Google Drive...")

# Tabular outputs
df_counts.to_csv(OUTPUT_DIR / 'dataset_counts.csv', index=False)
df_dimensions.to_csv(OUTPUT_DIR / 'image_dimensions.csv', index=False)


def _type_total(frame, image_type):
    """Sum the 'Count' column over the rows whose 'Type' equals image_type."""
    return frame[frame['Type'] == image_type]['Count'].sum()


# Text report: assemble every line first, then write the file in one go
report_lines = [
    "="*80 + "\n",
    "EXPLORATORY DATA ANALYSIS REPORT\n",
    "="*80 + "\n\n",
    "1. CLASS DISTRIBUTION\n",
    "-" * 40 + "\n",
]
for class_name in CLASSES:
    class_data = df_counts[df_counts['Class'] == class_name]
    n_normal = _type_total(class_data, 'Normal')
    n_anomalous = _type_total(class_data, 'Anomalous')
    ratio = n_anomalous / (n_normal + n_anomalous) * 100
    report_lines.append(f"  {class_name.capitalize():10s}: {n_normal:3d} normal, {n_anomalous:3d} anomalous ({ratio:.1f}% anomaly rate)\n")

report_lines += ["\n2. IMAGE DIMENSIONS\n", "-" * 40 + "\n"]
for class_name in CLASSES:
    class_dims = df_dimensions[df_dimensions['Class'] == class_name]
    avg_w = class_dims['Width'].mean()
    avg_h = class_dims['Height'].mean()
    report_lines.append(f"  {class_name.capitalize():10s}: {avg_w:.0f} x {avg_h:.0f} pixels (avg)\n")

report_lines += ["\n3. DEFECT TYPE DIVERSITY\n", "-" * 40 + "\n"]
for class_name in CLASSES:
    defect_names = list(dataset_structure[class_name]['test_defects'].keys())
    n_defects = len(defect_names)
    report_lines.append(f"  {class_name.capitalize():10s}: {n_defects} defect types - {', '.join(defect_names[:5])}\n")

report_path = OUTPUT_DIR / 'eda_report.txt'
with open(report_path, 'w') as f:
    f.writelines(report_lines)


def _class_details(class_name):
    """Collect the per-class totals and defect type names for the metadata file."""
    class_rows = df_counts[df_counts['Class'] == class_name]
    return {
        'normal': int(_type_total(class_rows, 'Normal')),
        'anomalous': int(_type_total(class_rows, 'Anomalous')),
        'defect_types': list(dataset_structure[class_name]['test_defects'].keys())
    }


# Metadata: overall totals plus one entry per class
metadata = {
    'classes': CLASSES,
    'total_images': {
        'normal': int(_type_total(df_counts, 'Normal')),
        'anomalous': int(_type_total(df_counts, 'Anomalous'))
    },
    'class_details': {name: _class_details(name) for name in CLASSES}
}

with open(OUTPUT_DIR / 'dataset_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Results saved to: {OUTPUT_DIR}")
for line in (
    "\nFiles saved:",
    "  - dataset_counts.csv",
    "  - image_dimensions.csv",
    "  - eda_report.txt",
    "  - dataset_metadata.json",
):
    print(line)


## Download Results ZIP

Download all generated files to your local machine.

In [None]:
# ============================================================
# DOWNLOAD RESULTS ZIP
# ============================================================

from google.colab import files

print("Creating ZIP archive...")
output_zip = 'notebook_01_eda_outputs.zip'
!zip -r {output_zip} {OUTPUT_DIR}

print(f"Downloading: {output_zip}")
files.download(output_zip)

print("\nDone!")
print("Next steps:")
print("  1. Extract ZIP file")
print("  2. Copy files to local project")
print("  3. git add results/")
print("  4. git commit -m 'Add notebook 01 outputs'")
print("  5. git push origin main")
