# 01. Exploratory Data Analysis (EDA)

## Introduction
This notebook explores the MVTec AD dataset, visualizing samples and checking statistics. It is designed to be self-contained and does not rely on external project modules.

## Objectives
1. Visualize normal vs. anomalous images.
2. Check the distribution of categories.
3. Understand the directory structure.

In [None]:
import os
import glob
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

# Configuration: root folder that holds one sub-directory per category
DATA_DIR = "../data/raw"

# Report up front whether the dataset location is present on disk
if os.path.exists(DATA_DIR):
    print(f"Data directory found: {DATA_DIR}")
else:
    print(f"WARNING: Data directory {DATA_DIR} not found. Please download the dataset first.")

## 1. Dataset Overview
We list all available categories in the dataset.

In [None]:
# Collect the category folders (sub-directories of DATA_DIR) in alphabetical order.
# Hidden directories such as ".ipynb_checkpoints" are skipped because they are not
# dataset categories. If DATA_DIR does not exist, fall back to an empty list so this
# cell reports zero categories instead of raising FileNotFoundError from os.listdir.
if os.path.isdir(DATA_DIR):
    CATEGORIES = sorted(
        d for d in os.listdir(DATA_DIR)
        if not d.startswith('.') and os.path.isdir(os.path.join(DATA_DIR, d))
    )
else:
    CATEGORIES = []
print(f"Found {len(CATEGORIES)} categories:")
print(CATEGORIES)

## 2. Visualizing Samples
For a given category, we display normal training images alongside one example image per defect type from the test set.

In [None]:
def show_samples(category, num_samples=5):
    """Plot normal training images above example defect images for one category.

    The top row shows up to ``num_samples`` PNG images from ``train/good``.
    The bottom row shows one PNG image per defect type found under ``test``
    (every sub-directory except ``good``), also capped at ``num_samples``.

    Args:
        category: Name of the category folder inside ``DATA_DIR``.
        num_samples: Maximum number of images per row; also the number of
            columns of the subplot grid.

    Raises:
        FileNotFoundError: If the category has no ``test`` directory
            (raised by ``os.listdir``).
    """
    train_path = os.path.join(DATA_DIR, category, 'train', 'good')
    test_path = os.path.join(DATA_DIR, category, 'test')

    # Normal samples. glob returns files in arbitrary order, so sort to make the
    # displayed images the same on every run.
    train_imgs = sorted(glob.glob(os.path.join(train_path, '*.png')))[:num_samples]

    # Defect types are the sub-directories of test/ other than 'good'.
    # Plain files (e.g. stray metadata files) are ignored, and the names are
    # sorted so the selection is deterministic.
    defect_types = sorted(
        d for d in os.listdir(test_path)
        if d != 'good' and os.path.isdir(os.path.join(test_path, d))
    )

    # One example per defect type, skipping types that contain no PNG files.
    defect_examples = []
    for dt in defect_types:
        found = sorted(glob.glob(os.path.join(test_path, dt, '*.png')))
        if found:
            defect_examples.append((found[0], dt))

    # Limit to num_samples for display
    defect_examples = defect_examples[:num_samples]

    fig = plt.figure(figsize=(15, 6))
    fig.suptitle(f"Category: {category}", fontsize=16)

    def _draw(slot, img_path, title):
        # Draw one image at 1-based position `slot` of the 2 x num_samples grid.
        ax = fig.add_subplot(2, num_samples, slot)
        # The image is passed to imshow inside the with-block, and the file
        # handle is closed as soon as the block exits.
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(title)
        ax.axis('off')

    # Top row: normal training images
    for i, img_path in enumerate(train_imgs):
        _draw(i + 1, img_path, "Train: Normal")

    # Bottom row: one image per defect type
    for i, (img_path, label) in enumerate(defect_examples):
        _draw(num_samples + i + 1, img_path, f"Test: {label}")

    fig.tight_layout()
    plt.show()

In [None]:
# Preview only the first three categories to keep the output short
preview_categories = CATEGORIES[:3]
for category_name in preview_categories:
    show_samples(category_name)

## 3. Image Statistics
Let's inspect one training image per category and compare the image sizes and color modes across categories.

In [None]:
# Record (category, size, mode) for one normal training image per category.
sizes = []
for cat in CATEGORIES:
    # glob returns files in arbitrary order; sort so the inspected file is the
    # same on every run.
    train_pngs = sorted(glob.glob(os.path.join(DATA_DIR, cat, 'train', 'good', '*.png')))
    if not train_pngs:
        # Indexing [0] on an empty result would raise IndexError and abort the
        # whole cell; report the category and continue with the others.
        print(f"WARNING: no training images found for category '{cat}'; skipping.")
        continue
    # size and mode are read while the file is open; the with-block then closes it.
    with Image.open(train_pngs[0]) as img:
        sizes.append((cat, img.size, img.mode))

print("{:<20} {:<15} {:<10}".format("Category", "Size", "Mode"))
print("-"*45)
for cat, size, mode in sizes:
    print("{:<20} {:<15} {:<10}".format(cat, str(size), mode))