# 📊 Task 4: Advanced Data Visualization and EDA

## 🎯 Objective
Perform comprehensive exploratory data analysis on the YOLO-formatted dataset.

### ML Rules Applied:
- **Rule #2**: Design and implement metrics
- **Rule #26**: Look for patterns in measured errors
- **Rule #27**: Quantify observed undesirable behavior

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from tqdm.notebook import tqdm
import random

# Apply one matplotlib style sheet to every figure created after this point.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')
print("‚úÖ Libraries imported!")

In [None]:
# Dataset layout: every location below is derived from the project root.
PROJECT_ROOT = Path(r"D:\het\SELF\RP\YOLO-V11-PRO")
YOLO_DIR = PROJECT_ROOT.joinpath("data", "processed")
TRAIN_IMAGES, VAL_IMAGES = (
    YOLO_DIR.joinpath("images", split) for split in ("train", "val")
)
TRAIN_LABELS, VAL_LABELS = (
    YOLO_DIR.joinpath("labels", split) for split in ("train", "val")
)

# Class id -> display name, and class id -> plot colour.
CLASS_NAMES = dict(enumerate(('Organic', 'Recyclable')))
COLORS = dict(zip(CLASS_NAMES, ('#2ecc71', '#3498db')))

print("‚úÖ Paths configured")

In [None]:
# ============================================================
# LOAD ALL ANNOTATIONS
# ============================================================

def load_yolo_annotations(labels_dir, images_dir):
    """Load all YOLO annotations into DataFrame.

    Every ``*.txt`` file in ``labels_dir`` is paired with the image of the
    same stem in ``images_dir`` (``.jpg``, ``.jpeg`` or ``.png``, tried in
    that order). Label files without a matching image are skipped, as are
    lines with fewer than five whitespace-separated fields.

    Args:
        labels_dir: Directory containing the YOLO ``.txt`` label files.
        images_dir: Directory containing the corresponding images.

    Returns:
        A DataFrame with one row per bounding box and the columns
        ``image``, ``class_id``, ``class_name``, ``x_center``, ``y_center``,
        ``width``, ``height``, ``area``, ``aspect_ratio``, ``img_width`` and
        ``img_height``. The columns are present even when no annotation was
        found, so column lookups on an empty result do not raise.

    Raises:
        ValueError: If a label line has five or more fields that cannot be
            parsed as ``int`` (class id) and ``float`` (box values).
    """
    columns = [
        'image', 'class_id', 'class_name',
        'x_center', 'y_center', 'width', 'height',
        'area', 'aspect_ratio', 'img_width', 'img_height',
    ]
    labels_dir = Path(labels_dir)
    images_dir = Path(images_dir)
    rows = []

    # Sorted so the row order (and therefore ``head()``) is reproducible;
    # glob order is filesystem-dependent.
    for label_path in sorted(labels_dir.glob("*.txt")):
        img_name = label_path.stem
        candidates = (images_dir / f"{img_name}{ext}" for ext in ('.jpg', '.jpeg', '.png'))
        img_path = next((p for p in candidates if p.exists()), None)
        if img_path is None:
            continue  # label without an image: nothing to measure

        # Only the size is needed; close the file straight away.
        with Image.open(img_path) as img:
            img_w, img_h = img.size

        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 5:
                    continue  # blank or truncated line

                class_id = int(parts[0])
                xc, yc, w, h = map(float, parts[1:5])

                rows.append({
                    'image': img_name,
                    'class_id': class_id,
                    # An id outside CLASS_NAMES is kept under a placeholder
                    # name so it shows up in the EDA instead of aborting it.
                    'class_name': CLASS_NAMES.get(class_id, f'unknown_{class_id}'),
                    'x_center': xc,
                    'y_center': yc,
                    'width': w,
                    'height': h,
                    'area': w * h,
                    # Degenerate zero-height boxes get ratio 0, not a ZeroDivisionError.
                    'aspect_ratio': w / h if h > 0 else 0,
                    'img_width': img_w,
                    'img_height': img_h,
                })

    return pd.DataFrame(rows, columns=columns)

print("üìä Loading annotations...")
train_df = load_yolo_annotations(TRAIN_LABELS, TRAIN_IMAGES)
val_df = load_yolo_annotations(VAL_LABELS, VAL_IMAGES)

print(f"\n‚úÖ Training annotations: {len(train_df)}")
print(f"‚úÖ Validation annotations: {len(val_df)}")
train_df.head()

In [None]:
# ============================================================
# CLASS DISTRIBUTION ANALYSIS
# ============================================================

# Fixed class order (by class id) so that labels, counts and colours always
# line up. ``value_counts`` alone orders by frequency, which would hand the
# first colour to whichever class happens to be most common.
class_ids = sorted(CLASS_NAMES)
class_order = [CLASS_NAMES[cid] for cid in class_ids]
color_by_name = {CLASS_NAMES[cid]: COLORS[cid] for cid in class_ids}

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle('üìä Class Distribution Analysis', fontsize=14, fontweight='bold')

# Per-class counts in class order; a class with no samples is reported as 0.
train_counts = train_df['class_name'].value_counts().reindex(class_order, fill_value=0)
val_counts = val_df['class_name'].value_counts().reindex(class_order, fill_value=0)

# Training / validation pies
for ax, counts, split_title, n_samples in (
    (axes[0], train_counts, 'Training Set', len(train_df)),
    (axes[1], val_counts, 'Validation Set', len(val_df)),
):
    # Draw only classes that are present: an all-zero series cannot be
    # turned into a pie, and ``explode`` must match the wedge count.
    shown = counts[counts > 0]
    if not shown.empty:
        ax.pie(shown, labels=shown.index, autopct='%1.1f%%',
               colors=[color_by_name[name] for name in shown.index],
               explode=[0.02] * len(shown))
    ax.set_title(f'{split_title}\n({n_samples:,} samples)')

# Combined bar chart
x = np.arange(len(class_order))
width = 0.35
axes[2].bar(x - width/2, train_counts.to_numpy(), width, label='Train', color='#3498db')
axes[2].bar(x + width/2, val_counts.to_numpy(), width, label='Val', color='#e74c3c')
axes[2].set_xticks(x)
axes[2].set_xticklabels(class_order)
axes[2].legend()
axes[2].set_title('Train vs Val Distribution')
axes[2].set_ylabel('Count')

plt.tight_layout()
# savefig does not create missing directories.
assets_dir = PROJECT_ROOT / 'docs' / 'assets'
assets_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(assets_dir / 'class_distribution_yolo.png', dpi=150)
plt.show()

In [None]:
# ============================================================
# BOUNDING BOX STATISTICS
# ============================================================

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('üì¶ Bounding Box Statistics', fontsize=14, fontweight='bold')

# One entry per panel: (axis, train_df column, panel title, x-axis label).
panels = (
    (axes[0, 0], 'width', 'Box Width Distribution', 'Normalized Width'),
    (axes[0, 1], 'height', 'Box Height Distribution', 'Normalized Height'),
    (axes[1, 0], 'area', 'Box Area Distribution', 'Normalized Area (w √ó h)'),
    (axes[1, 1], 'aspect_ratio', 'Aspect Ratio Distribution', 'Aspect Ratio (w/h)'),
)

for ax, column, title, xlabel in panels:
    # Overlay one semi-transparent histogram per class.
    for class_id, class_name in CLASS_NAMES.items():
        values = train_df.loc[train_df['class_id'] == class_id, column]
        ax.hist(values, bins=30, alpha=0.6, label=class_name, color=COLORS[class_id])
    if column == 'aspect_ratio':
        # Reference line marking square boxes (w == h).
        ax.axvline(x=1.0, color='red', linestyle='--', label='Square')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.legend()

plt.tight_layout()
plt.savefig(PROJECT_ROOT / 'docs' / 'assets' / 'bbox_statistics.png', dpi=150)
plt.show()

In [None]:
# ============================================================
# BOX CENTER HEATMAP
# ============================================================

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle('üéØ Bounding Box Center Positions', fontsize=14, fontweight='bold')

# First panel pools every class; the remaining panels show one class each.
subsets = [('All Classes', train_df)]
subsets.extend(
    (class_name, train_df[train_df['class_id'] == class_id])
    for class_id, class_name in CLASS_NAMES.items()
)

for idx, (title, frame) in enumerate(subsets):
    ax = axes[idx]
    ax.hexbin(frame['x_center'], frame['y_center'], gridsize=20, cmap='YlOrRd')
    ax.set_title(title)
    ax.set_xlabel('X Center')
    if idx == 0:
        # Only the leftmost panel carries the y-axis label.
        ax.set_ylabel('Y Center')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    # Flip y so the origin sits top-left, as in image coordinates.
    ax.invert_yaxis()

plt.tight_layout()
plt.savefig(PROJECT_ROOT / 'docs' / 'assets' / 'bbox_heatmap.png', dpi=150)
plt.show()

In [None]:
# ============================================================
# IMAGE DIMENSION ANALYSIS
# ============================================================

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('üìê Image Dimension Analysis', fontsize=14, fontweight='bold')

# train_df holds one row per bounding box, so an image with several boxes
# appears several times. Image-level statistics must count each image once.
unique_images = train_df.drop_duplicates(subset='image')

# Scatter plot: each image is drawn once for every class it contains.
for class_id, class_name in CLASS_NAMES.items():
    data = train_df[train_df['class_id'] == class_id].drop_duplicates(subset='image')
    axes[0].scatter(data['img_width'], data['img_height'],
                    alpha=0.3, s=10, label=class_name, color=COLORS[class_id])
axes[0].set_title('Width vs Height')
axes[0].set_xlabel('Image Width (px)')
axes[0].set_ylabel('Image Height (px)')
axes[0].legend()

# Dimension histogram over distinct images
axes[1].hist(unique_images['img_width'], bins=30, alpha=0.6, label='Width')
axes[1].hist(unique_images['img_height'], bins=30, alpha=0.6, label='Height')
axes[1].set_title('Dimension Distribution')
axes[1].set_xlabel('Pixels')
axes[1].legend()

plt.tight_layout()
# savefig does not create missing directories.
assets_dir = PROJECT_ROOT / 'docs' / 'assets'
assets_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(assets_dir / 'image_dimensions.png', dpi=150)
plt.show()

In [None]:
# ============================================================
# SAMPLE GRID VISUALIZATION
# ============================================================

def show_sample_grid(images_dir, labels_dir, n_rows=3, n_cols=4):
    """Display sample images with bounding boxes.

    Randomly picks up to ``n_rows * n_cols`` images from ``images_dir``,
    draws the boxes found in the same-stem ``.txt`` file in ``labels_dir``
    on top of each, saves the grid to ``docs/assets/sample_grid.png`` under
    ``PROJECT_ROOT`` and shows it.

    Args:
        images_dir: Directory to sample images from. Only ``.jpg``,
            ``.jpeg`` and ``.png`` files (any letter case) are considered.
        labels_dir: Directory holding the YOLO label files.
        n_rows: Number of grid rows.
        n_cols: Number of grid columns.
    """
    import matplotlib.patches as patches

    # "*.*" also matches non-image files (thumbnail caches and the like),
    # so keep only the extensions this notebook treats as images.
    image_exts = {'.jpg', '.jpeg', '.png'}
    images = sorted(
        p for p in Path(images_dir).glob("*.*") if p.suffix.lower() in image_exts
    )
    samples = random.sample(images, min(n_rows * n_cols, len(images)))

    # squeeze=False keeps ``axes`` a 2-D array even for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 4*n_rows), squeeze=False)
    fig.suptitle('üñºÔ∏è Sample Images with Annotations', fontsize=14, fontweight='bold')

    # Hide every axis up front so cells left without an image stay blank.
    for ax in axes.flat:
        ax.axis('off')

    for ax, img_path in zip(axes.flat, samples):
        # Context manager closes the file handle; RGB conversion keeps
        # grayscale/palette images from being shown through a colormap.
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img.convert('RGB'))
        ax.imshow(img)

        label_path = Path(labels_dir) / f"{img_path.stem}.txt"
        if not label_path.exists():
            continue

        h, w = img.shape[:2]
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 5:
                    continue  # blank or truncated line

                class_id = int(parts[0])
                xc, yc, bw, bh = map(float, parts[1:5])

                # Box centre/size are fractions of the image; convert to the
                # pixel top-left corner and pixel extent Rectangle expects.
                x1 = int((xc - bw/2) * w)
                y1 = int((yc - bh/2) * h)
                box_w = int(bw * w)
                box_h = int(bh * h)

                # Ids outside the configured classes still get drawn.
                color = COLORS.get(class_id, '#e74c3c')
                name = CLASS_NAMES.get(class_id, f'unknown_{class_id}')

                rect = patches.Rectangle((x1, y1), box_w, box_h,
                                         linewidth=2, edgecolor=color, facecolor='none')
                ax.add_patch(rect)
                ax.text(x1, y1-5, name, fontsize=8, fontweight='bold',
                        color='white', bbox=dict(boxstyle='round', facecolor=color))

    plt.tight_layout()
    # savefig does not create missing directories.
    assets_dir = PROJECT_ROOT / 'docs' / 'assets'
    assets_dir.mkdir(parents=True, exist_ok=True)
    plt.savefig(assets_dir / 'sample_grid.png', dpi=150)
    plt.show()

# Render the default 3x4 grid from the training split.
show_sample_grid(images_dir=TRAIN_IMAGES, labels_dir=TRAIN_LABELS)

In [None]:
# ============================================================
# SUMMARY STATISTICS TABLE
# ============================================================

print("\nüìä Dataset Summary Statistics")
print("="*60)

summary = train_df.groupby('class_name').agg({
    'image': 'count',
    'width': ['mean', 'std'],
    'height': ['mean', 'std'],
    'area': ['mean', 'std'],
    'aspect_ratio': ['mean', 'std']
}).round(4)

print(summary)

# Class balance check
imbalance_ratio = train_counts.max() / train_counts.min()
print(f"\nüìà Class Imbalance Ratio: {imbalance_ratio:.2f}")
if imbalance_ratio < 1.5:
    print("   ‚úÖ Dataset is well-balanced!")
else:
    print("   ‚ö†Ô∏è Consider using class weights during training")

## 📝 Summary

### Key Insights:
1. **Class Distribution**: Check pie charts for balance
2. **Box Statistics**: Most boxes cover ~90% of image (full-frame objects)
3. **Image Dimensions**: Variable sizes - will resize to 640×640 for YOLO

### Visualizations Created:
- `class_distribution_yolo.png`
- `bbox_statistics.png`
- `bbox_heatmap.png`
- `image_dimensions.png`
- `sample_grid.png`

### Next: Task 5 - CNN Fundamentals with NumPy

In [None]:
# Closing banner: each entry is emitted on its own line, in order.
closing_lines = (
    "\n" + "="*60,
    "‚úÖ TASK 4 COMPLETE: Data Visualization and EDA",
    "="*60,
    "\nüìã What was accomplished:",
    "   ‚úì Class distribution analysis",
    "   ‚úì Bounding box statistics",
    "   ‚úì Center position heatmaps",
    "   ‚úì Image dimension analysis",
    "   ‚úì Sample visualization grid",
    "   ‚úì Summary statistics table",
    "\n‚û°Ô∏è Ready for Task 5: CNN Fundamentals (NumPy)",
)
print(*closing_lines, sep="\n")