In [None]:
# Cell 1: Setup and Installation
import numpy as np
import pandas as pd
import os

print("Exploring data directory structure...")
# Flatten the recursive walk into a single lazy stream of file paths,
# then print them one per line in walk order.
data_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('./data')
    for name in names
)
for file_path in data_file_paths:
    print(file_path)

In [None]:
# Cell 2: Install Required Libraries
!pip install -U ultralytics

# Import required libraries
import json
import os
from sklearn.model_selection import train_test_split
import shutil
from ultralytics import YOLO
import cv2
import matplotlib.pyplot as plt
import warnings
from collections import defaultdict, Counter
warnings.filterwarnings('ignore')

# Verify installation
import ultralytics
ultralytics.checks()

print("All libraries imported successfully!")

In [None]:
# Cell 3: Examine TACO Dataset
# Load and examine the TACO annotations
json_path = './data/annotations.json'

with open(json_path, 'r') as annotation_file:
    data = json.load(annotation_file)

# The three top-level COCO sections; later cells reuse these names.
images, annotations, categories = (
    data[section] for section in ('images', 'annotations', 'categories')
)

image_total = len(images)
annotation_total = len(annotations)

print("📊 TACO Dataset Overview:")
print(f"   Total images: {image_total}")
print(f"   Total annotations: {annotation_total}")
print(f"   Total categories: {len(categories)}")
print(f"   Average annotations per image: {annotation_total/image_total:.2f}")

# Show first few categories
print("\n🏷️  First 10 categories:")
for category in categories[:10]:
    print(f"   {category['id']}: {category['name']}")

# Show dataset distribution
print("\n📁 Image file structure (first 5):")
for image_record in images[:5]:
    print(f"   {image_record['file_name']} ({image_record['width']}x{image_record['height']})")

In [None]:
# Cell 3a: Count Available Images in Batch Folders
import os
from collections import defaultdict

def count_images_in_batches(data_dir='./data'):
    """Count the image files stored in each batch folder under ``data_dir``.

    A batch folder is any direct sub-directory of ``data_dir`` whose name
    contains "batch" (case-insensitive). A file counts as an image when its
    extension is .jpg, .jpeg or .png, compared case-insensitively so that
    mixed-case variants such as ``.Jpg`` are not missed.

    Args:
        data_dir: Directory that holds the batch folders.

    Returns:
        A ``(total_images, batch_counts)`` tuple, where ``batch_counts`` maps
        each batch folder name to its image count. ``(0, {})`` is returned
        when ``data_dir`` does not exist or contains no batch folders.
    """

    print("📁 Counting images in batch folders...")
    print("="*50)

    image_extensions = ('.jpg', '.jpeg', '.png')

    # A missing data directory is reported instead of raising, so the caller's
    # "total_available > 0" check handles it like an empty dataset.
    if not os.path.isdir(data_dir):
        print(f"⚠️  Data directory not found: {data_dir}")
        return 0, {}

    # Look for batch folders (sorted for consistent output)
    batch_folders = sorted(
        item for item in os.listdir(data_dir)
        if 'batch' in item.lower() and os.path.isdir(os.path.join(data_dir, item))
    )

    if not batch_folders:
        print(f"⚠️  No batch folders found in {data_dir}")
        return 0, {}

    batch_counts = {}
    total_images = 0

    # Count images in each batch folder
    for batch_folder in batch_folders:
        batch_path = os.path.join(data_dir, batch_folder)

        image_count = sum(
            1 for entry in os.listdir(batch_path)
            if os.path.splitext(entry)[1].lower() in image_extensions
        )

        batch_counts[batch_folder] = image_count
        total_images += image_count

        print(f"   📂 {batch_folder}: {image_count:,} images")

    print("-" * 50)
    print(f"📊 TOTAL IMAGES AVAILABLE: {total_images:,}")

    return total_images, batch_counts

def compare_with_annotations(total_available, annotations):
    """Report how the on-disk image count lines up with the annotation records.

    Args:
        total_available: Number of image files found on disk.
        annotations: Records to count; only entries that contain a
            ``'file_name'`` key are included.

    Returns:
        The availability rate as a percentage: the smaller of the two counts
        divided by the larger, or 100.0 when they are equal.
    """

    print("\n🔍 ANNOTATION vs AVAILABLE COMPARISON:")
    print("="*50)

    annotated_count = sum(1 for record in annotations if 'file_name' in record)

    print(f"📄 Images in annotations.json: {annotated_count:,}")
    print(f"📁 Images in batch folders:    {total_available:,}")

    # Positive surplus: more files than records; negative: files are missing.
    surplus = total_available - annotated_count
    if surplus > 0:
        print(f"📈 You have {surplus:,} extra images (not in annotations)")
        availability = (annotated_count / total_available) * 100
    elif surplus < 0:
        print(f"⚠️  {-surplus:,} annotated images are missing from folders")
        availability = (total_available / annotated_count) * 100
    else:
        print("✅ Perfect match! All annotated images are available")
        availability = 100.0

    print(f"📊 Availability rate: {availability:.1f}%")

    return availability

# Run the image counting
total_available, batch_counts = count_images_in_batches()

if total_available > 0:
    # Compare with annotations
    availability = compare_with_annotations(total_available, images)

    # Readiness verdicts, checked from the strictest threshold downwards;
    # the first threshold that the availability reaches wins.
    readiness_levels = (
        (95, "   🎯 Excellent! Ready for training"),
        (80, "   ✅ Good availability, should work well"),
        (60, "   ⚠️  Some images missing, but usable"),
    )

    # Summary
    print("\n💡 SUMMARY:")
    print(next(
        (message for threshold, message in readiness_levels if availability >= threshold),
        "   ❌ Many images missing, check data setup",
    ))

    print(f"   📊 Dataset size: {total_available:,} images across {len(batch_counts)} batches")

    # Show largest batches
    if len(batch_counts) > 1:
        ranked_batches = sorted(batch_counts.items(), key=lambda item: item[1], reverse=True)
        largest_batches = ranked_batches[:3]
        largest_summary = ', '.join(f'{name}({count})' for name, count in largest_batches)
        print(f"   📈 Largest batches: {largest_summary}")

In [None]:
# Cell 4: Define Class Mapping and Conversion Functions (FIXED VERSION)
def create_class_mapping():
    """Build the lookup from original TACO class names to consolidated categories.

    Returns:
        dict mapping each original class name to one of the consolidated
        category names (Bottle, Bottle cap, Can, Cigarette, Cup, Lid,
        Plastic bag + wrapper, Pop tab, Straw, Other).
    """

    # One entry per consolidated category, listing the original classes that
    # fold into it. Order is kept so the resulting dict has a stable layout.
    grouped_classes = (
        ('Bottle', (
            'Clear plastic bottle',
            'Glass bottle',
            'Other plastic bottle',
        )),
        ('Bottle cap', (
            'Plastic bottle cap',
            'Metal bottle cap',
        )),
        ('Can', (
            'Drink can',
            'Food Can',
        )),
        ('Cigarette', (
            'Cigarette',
        )),
        ('Cup', (
            'Paper cup',
            'Disposable plastic cup',
            'Foam cup',
            'Glass cup',
            'Other plastic cup',
        )),
        ('Lid', (
            'Plastic lid',
            'Metal lid',
        )),
        ('Plastic bag + wrapper', (
            'Garbage bag',
            'Single-use carrier bag',
            'Polypropylene bag',
            'Produce bag',
            'Cereal bag',
            'Bread bag',
            'Plastic film',
            'Crisp packet',
            'Other plastic wrapper',
            'Retort pouch',
            'Six pack rings',
        )),
        ('Pop tab', (
            'Pop tab',
        )),
        ('Straw', (
            'Plastic straw',
            'Paper straw',
        )),
        # Everything else
        ('Other', (
            'Aluminium foil',
            'Battery',
            'Aluminium blister pack',
            'Carded blister pack',
            'Broken glass',
            'Corrugated carton',
            'Drink carton',
            'Egg carton',
            'Meal carton',
            'Other carton',
            'Food waste',
            'Magazine paper',
            'Tissues',
            'Wrapping paper',
            'Normal paper',
            'Paper bag',
            'Plastified paper bag',
            'Pizza box',
            'Spread tub',
            'Tupperware',
            'Disposable food container',
            'Foam food container',
            'Other plastic container',
            'Plastic gloves',
            'Plastic glooves',  # Spelling as it appears in the data
            'Plastic utensils',
            'Rope & strings',
            'Scrap metal',
            'Shoe',
            'Squeezable tube',
            'Styrofoam piece',
            'Toilet tube',
            'Unlabeled litter',
            'Glass jar',
            'Other plastic',
            'Aerosol',  # Class found in the data that was previously unmapped
        )),
    )

    # Invert the grouping into the flat original -> consolidated lookup.
    return {
        original_name: consolidated_name
        for consolidated_name, original_names in grouped_classes
        for original_name in original_names
    }

def analyze_class_distribution(annotations, categories, class_mapping):
    """Print and return annotation counts per original and per mapped class.

    Args:
        annotations: Annotation records, each with a ``'category_id'``.
        categories: Category records, each with ``'id'`` and ``'name'``.
        class_mapping: Original class name -> consolidated category; names
            absent from the mapping are counted under ``'Other'``.

    Returns:
        ``(original_counts, mapped_counts)`` as two ``Counter`` objects.
    """

    print("📊 Class Distribution Analysis")
    print("="*50)

    name_by_category_id = {}
    for category in categories:
        name_by_category_id[category['id']] = category['name']

    # Tally both views of every annotation in a single pass.
    original_counts = Counter()
    mapped_counts = Counter()
    for annotation in annotations:
        original_name = name_by_category_id[annotation['category_id']]
        original_counts[original_name] += 1
        mapped_counts[class_mapping.get(original_name, 'Other')] += 1

    original_total = len(original_counts)
    mapped_total = len(mapped_counts)
    print(f"📈 Original classes: {original_total} classes")
    print(f"📈 Mapped classes: {mapped_total} classes")
    print(f"🔄 Reduction: {original_total - mapped_total} classes removed")

    print("\n🏷️  Mapped Class Distribution:")
    annotation_total = sum(mapped_counts.values())
    # most_common() yields the categories from most to least frequent.
    for category, count in mapped_counts.most_common():
        percentage = (count / annotation_total) * 100
        print(f"   {category:25}: {count:5} annotations ({percentage:5.1f}%)")

    return original_counts, mapped_counts

def _coco_bbox_to_yolo(bbox, img_width, img_height):
    """Convert a COCO ``[x, y, width, height]`` pixel box to normalized YOLO form.

    The box is clipped to the image rectangle first so that every returned
    value lies in the 0-1 range that YOLO label files require.

    Returns:
        ``(x_center, y_center, width, height)`` divided by the image size, or
        ``None`` when no part of the box lies inside the image.
    """
    x_min = min(max(bbox[0], 0), img_width)
    y_min = min(max(bbox[1], 0), img_height)
    x_max = min(max(bbox[0] + bbox[2], 0), img_width)
    y_max = min(max(bbox[1] + bbox[3], 0), img_height)

    if x_max <= x_min or y_max <= y_min:
        return None

    return (
        (x_min + x_max) / 2 / img_width,
        (y_min + y_max) / 2 / img_height,
        (x_max - x_min) / img_width,
        (y_max - y_min) / img_height,
    )


def convert_taco_to_yolo_with_mapping_fixed(json_path, image_root_dir, output_dir):
    """
    Convert the TACO dataset from COCO format to YOLO format with class mapping.

    Images listed in the annotation file are split 80/20 into train/val
    (fixed ``random_state=42``), copied into ``output_dir/images/<split>``
    under a flattened, collision-free name (``batch_1/000000.jpg`` becomes
    ``batch_1_000000.jpg``), and given a matching label file in
    ``output_dir/labels/<split>``.

    Args:
        json_path: Path to the COCO-style ``annotations.json``.
        image_root_dir: Directory that the ``file_name`` entries are relative to.
        output_dir: Root directory of the YOLO dataset to create.

    Returns:
        ``(mapped_category_to_yolo_id, mapped_categories, class_mapping,
        conversion_stats)`` where ``conversion_stats`` counts the written
        boxes per ``"original -> mapped"`` pair.
    """

    print("🔄 Loading TACO annotations...")
    with open(json_path, 'r') as f:
        data = json.load(f)

    images = data['images']
    annotations = data['annotations']
    categories = data['categories']

    print(f"Found {len(images)} images, {len(annotations)} annotations, {len(categories)} categories")

    # Create class mapping
    class_mapping = create_class_mapping()

    # Called for its printed report; the returned counters are not needed here.
    analyze_class_distribution(annotations, categories, class_mapping)

    # Create output directories
    print("\n📁 Creating output directories...")
    for split_name in ('train', 'val'):
        os.makedirs(os.path.join(output_dir, 'images', split_name), exist_ok=True)
        os.makedirs(os.path.join(output_dir, 'labels', split_name), exist_ok=True)

    # Split dataset into train/val
    image_ids = [img['id'] for img in images]
    train_ids, val_ids = train_test_split(image_ids, test_size=0.2, random_state=42)
    train_id_set = set(train_ids)  # constant-time membership test per image

    print(f"📊 Train set: {len(train_ids)} images")
    print(f"📊 Validation set: {len(val_ids)} images")

    # Create lookup dictionaries
    category_id_to_name = {cat['id']: cat['name'] for cat in categories}

    # Group annotations by image once instead of rescanning the full list
    # for every image.
    annotations_by_image = defaultdict(list)
    for ann in annotations:
        annotations_by_image[ann['image_id']].append(ann)

    # Create mapped categories and YOLO class mapping
    mapped_categories = sorted(set(class_mapping.values()))
    mapped_category_to_yolo_id = {cat: idx for idx, cat in enumerate(mapped_categories)}

    print(f"\n🏷️  Final mapped categories ({len(mapped_categories)}):")
    for i, cat in enumerate(mapped_categories):
        print(f"   {i}: {cat}")

    print("\n🔄 Converting images and annotations...")
    copied_count = 0
    missing_count = 0
    invalid_count = 0
    collision_count = 0
    skipped_box_count = 0
    conversion_stats = defaultdict(int)

    # (split, name-without-extension) pairs produced in THIS run. Tracking
    # them in memory keeps re-runs idempotent (stale files are overwritten
    # rather than renamed) and also guards the shared label name when two
    # images differ only by extension.
    used_stems = set()

    for i, img in enumerate(images):
        if i % 500 == 0:
            print(f"   Processed {i}/{len(images)} images...")

        img_id = img['id']
        filename = img['file_name']  # e.g., "batch_1/000000.jpg"
        img_width = img['width']
        img_height = img['height']

        # Construct full image path
        full_image_path = os.path.join(image_root_dir, filename)
        if not os.path.exists(full_image_path):
            if missing_count < 10:
                print(f"⚠️  Warning: Image {full_image_path} not found.")
            missing_count += 1
            continue

        # Boxes cannot be normalized without a positive image size.
        if img_width <= 0 or img_height <= 0:
            print(f"⚠️  Warning: Image {filename} has invalid size {img_width}x{img_height}; skipped.")
            invalid_count += 1
            continue

        # Determine train or val
        split_name = 'train' if img_id in train_id_set else 'val'
        image_dir = os.path.join(output_dir, 'images', split_name)
        label_dir = os.path.join(output_dir, 'labels', split_name)

        # Flatten the relative path into one file name so images from
        # different batches cannot overwrite each other:
        # batch_1/000000.jpg -> batch_1_000000.jpg. Every separator is
        # replaced, so deeper nesting cannot leave a "/" in the name.
        if '/' in filename:
            new_filename = filename.replace('/', '_')
        else:
            new_filename = f"root_{filename}"

        stem, ext = os.path.splitext(new_filename)
        if (split_name, stem) in used_stems:
            collision_count += 1
            # Add image ID to make it unique
            stem = f"{stem}_id{img_id}"
            new_filename = f"{stem}{ext}"
        used_stems.add((split_name, stem))

        # Copy image with new unique name
        shutil.copy(full_image_path, os.path.join(image_dir, new_filename))
        copied_count += 1

        # The label file shares the image's name with a .txt extension,
        # whatever the image extension or its letter case is.
        label_file = os.path.join(label_dir, f"{stem}.txt")

        with open(label_file, 'w') as lf:
            for ann in annotations_by_image.get(img_id, ()):
                # Get original category name
                original_category = category_id_to_name[ann['category_id']]

                # Map to new category
                mapped_category = class_mapping.get(original_category, 'Other')
                yolo_class_id = mapped_category_to_yolo_id[mapped_category]

                # bbox is [x, y, width, height] in COCO format
                yolo_box = _coco_bbox_to_yolo(ann['bbox'], img_width, img_height)
                if yolo_box is None:
                    skipped_box_count += 1
                    continue

                # Track conversion statistics
                conversion_stats[f"{original_category} -> {mapped_category}"] += 1

                x_center, y_center, width, height = yolo_box
                lf.write(f"{yolo_class_id} {x_center} {y_center} {width} {height}\n")

    print(f"\n✅ Successfully copied {copied_count} images")
    print(f"❌ Missing images: {missing_count}")
    if invalid_count > 0:
        print(f"⚠️  Images skipped for invalid size: {invalid_count}")
    if skipped_box_count > 0:
        print(f"⚠️  Boxes skipped (entirely outside the image): {skipped_box_count}")
    if collision_count > 0:
        print(f"🔄 Resolved filename collisions: {collision_count}")

    return mapped_category_to_yolo_id, mapped_categories, class_mapping, conversion_stats

def create_data_yaml_with_mapping_fixed(output_dir, mapped_categories, mapped_category_to_yolo_id):
    """Write ``data.yaml`` for YOLO training and return its path.

    Args:
        output_dir: Dataset root; the file is written directly inside it and
            the train/val entries point at its ``images/train`` and
            ``images/val`` sub-directories (as absolute paths).
        mapped_categories: Not read by this function.
        mapped_category_to_yolo_id: Category name -> YOLO class index; it
            determines the order of the ``names`` list.

    Returns:
        Path of the written ``data.yaml``.
    """

    # Place every category name at the position given by its YOLO index.
    class_names = [""] * len(mapped_category_to_yolo_id)
    for category in mapped_category_to_yolo_id:
        class_names[mapped_category_to_yolo_id[category]] = category

    train_dir = os.path.abspath(os.path.join(output_dir, 'images/train'))
    val_dir = os.path.abspath(os.path.join(output_dir, 'images/val'))

    yaml_lines = [
        "# TACO Dataset YOLO Configuration (Mapped Classes - Fixed Collisions)",
        f"train: {train_dir}",
        f"val: {val_dir}",
        "",
        f"nc: {len(class_names)}",
        f"names: {class_names}",
    ]
    yaml_content = "\n".join(yaml_lines) + "\n"

    yaml_path = os.path.join(output_dir, 'data.yaml')
    with open(yaml_path, 'w') as yaml_file:
        yaml_file.write(yaml_content)

    print(f"\n📄 Created {yaml_path}")
    print(f"📊 Number of mapped classes: {len(class_names)}")
    print("🏷️  Final class names:", class_names)

    return yaml_path

def save_mapping_info(output_dir, class_mapping, conversion_stats, mapped_counts):
    """Write the class mapping and its statistics to ``class_mapping_info.json``.

    Args:
        output_dir: Directory the JSON file is written into.
        class_mapping: Original class name -> consolidated category.
        conversion_stats: Mapping of conversion labels to counts.
        mapped_counts: Mapping of consolidated category to annotation count.
    """

    # Assemble the report section by section; key order is the file layout.
    mapping_info = {}
    mapping_info["class_mapping"] = class_mapping
    mapping_info["conversion_statistics"] = dict(conversion_stats)
    mapping_info["final_class_distribution"] = dict(mapped_counts)
    distinct_categories = set(class_mapping.values())
    mapping_info["total_mapped_classes"] = len(distinct_categories)

    mapping_path = os.path.join(output_dir, 'class_mapping_info.json')
    with open(mapping_path, 'w') as report_file:
        json.dump(mapping_info, report_file, indent=2)

    print(f"📋 Class mapping info saved to: {mapping_path}")

# Confirmation that the definitions above were executed.
print(
    "✅ FIXED class mapping and conversion functions defined!",
    "🔧 Filename collisions will be prevented by including batch names",
    sep="\n",
)

In [None]:
# Cell 5: Convert TACO to YOLO with Class Mapping (FIXED VERSION)
# Paths used by the conversion
import shutil

json_path = './data/annotations.json'
image_root_dir = './data'
output_dir = './output'

# Clean old output first so files from an earlier run cannot linger
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
    print("🧹 Cleaned old output directory")

print("🚀 Starting TACO to YOLO conversion with class mapping (FIXED)...")
print(f"📁 JSON path: {json_path}")
print(f"📁 Image root: {image_root_dir}")
print(f"📁 Output directory: {output_dir}")

# Convert dataset with mapping (FIXED VERSION)
mapped_category_to_yolo_id, mapped_categories, class_mapping, conversion_stats = convert_taco_to_yolo_with_mapping_fixed(
    json_path, image_root_dir, output_dir
)

# Create data.yaml with mapped classes
yaml_path = create_data_yaml_with_mapping_fixed(output_dir, mapped_categories, mapped_category_to_yolo_id)

# Load original data for saving mapping info
with open(json_path, 'r') as f:
    data = json.load(f)

# Take all three sections from the file that was just converted, so this
# cell does not depend on variables left behind by an earlier cell.
images = data['images']
annotations = data['annotations']
categories = data['categories']
category_id_to_name = {cat['id']: cat['name'] for cat in categories}

# Analyze final distribution
mapped_counts = Counter()
for ann in annotations:
    category_name = category_id_to_name[ann['category_id']]
    mapped_category = class_mapping.get(category_name, 'Other')
    mapped_counts[mapped_category] += 1

# Save mapping information
save_mapping_info(output_dir, class_mapping, conversion_stats, mapped_counts)

# Store train/val split for later use. Same ids, test_size and random_state
# as the call inside the converter.
image_ids = [img['id'] for img in images]
train_ids, val_ids = train_test_split(image_ids, test_size=0.2, random_state=42)

print("\n" + "="*60)
print("🎉 Conversion with class mapping complete! (COLLISION-FREE)")
print(f"📁 YOLO dataset created in: {output_dir}")
print(f"📄 Configuration file: {yaml_path}")
print(f"🔄 Classes reduced from {len(categories)} to {len(mapped_categories)}")
print("="*60)

# Display the class mapping summary
print("\n📋 CLASS MAPPING SUMMARY:")
print(f"   Original TACO classes: {len(categories)} classes")
print(f"   Mapped to: {len(mapped_categories)} categories")
print(f"   Categories: {', '.join(mapped_categories)}")

# Check actual dataset size created. Extensions are compared in lower case
# because the copied files keep their original extension (e.g. ".JPG").
image_suffixes = ('.jpg', '.jpeg', '.png')
train_images = len([
    name for name in os.listdir(os.path.join(output_dir, 'images/train'))
    if name.lower().endswith(image_suffixes)
])
val_images = len([
    name for name in os.listdir(os.path.join(output_dir, 'images/val'))
    if name.lower().endswith(image_suffixes)
])
print(f"   Images written: {train_images} train / {val_images} val")


In [None]:
# Cell 6: Visualize Sample Images with Mapped Classes
import random

def visualize_yolo_sample_mapped(output_dir, num_samples=4):
    """Plot random training images with their YOLO boxes and mapped class names.

    Args:
        output_dir: Root of the converted dataset; ``data.yaml``,
            ``images/train`` and ``labels/train`` are read from it.
        num_samples: Maximum number of images to draw. The subplot grid
            (two columns) grows to fit the number of sampled images.
    """
    import ast
    import math

    # Load class names
    yaml_path = os.path.join(output_dir, 'data.yaml')
    with open(yaml_path, 'r') as f:
        yaml_content = f.read()

    # Extract class names (simple parsing of the "names: [...]" line).
    # literal_eval only accepts Python literals, so text in the file is
    # never executed as code.
    names_line = [line for line in yaml_content.split('\n') if line.startswith('names:')][0]
    class_names = ast.literal_eval(names_line.split('names:', 1)[1].strip())

    # Get sample training images
    train_img_dir = os.path.join(output_dir, 'images/train')
    train_label_dir = os.path.join(output_dir, 'labels/train')

    # Compare extensions in lower case so ".JPG" files are included.
    image_files = sorted(
        name for name in os.listdir(train_img_dir)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    sample_files = random.sample(image_files, max(0, min(num_samples, len(image_files))))

    if not sample_files:
        print(f"⚠️  No training images to visualize in {train_img_dir}")
        return

    # Size the grid from the sample count; the default of 4 gives 2x2.
    n_cols = 2
    n_rows = math.ceil(len(sample_files) / n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    print(f"📊 Visualizing sample training data with {len(class_names)} mapped classes...")

    # Different colors for each class (reused cyclically)
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0),
              (255, 0, 255), (0, 255, 255), (128, 0, 128), (255, 165, 0),
              (0, 128, 0), (128, 128, 128)]

    for i, img_file in enumerate(sample_files):
        # Load image
        img_path = os.path.join(train_img_dir, img_file)
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            axes[i].set_title(f'Sample {i+1}: {img_file} (unreadable)')
            axes[i].axis('off')
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Load annotations: same file name with a .txt extension
        label_file = os.path.splitext(img_file)[0] + '.txt'
        label_path = os.path.join(train_label_dir, label_file)

        h, w = image.shape[:2]

        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                annotations = f.readlines()

            # Draw bounding boxes
            for ann in annotations:
                if not ann.strip():
                    continue
                class_id, x_center, y_center, width, height = map(float, ann.strip().split())
                class_id = int(class_id)

                # Convert from YOLO to pixel coordinates
                x1 = int((x_center - width/2) * w)
                y1 = int((y_center - height/2) * h)
                x2 = int((x_center + width/2) * w)
                y2 = int((y_center + height/2) * h)

                # Use different color for each class
                color = colors[class_id % len(colors)]

                # Draw rectangle
                cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)

                # Add label with class name
                label = class_names[class_id]
                cv2.putText(image, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

        axes[i].imshow(image)
        axes[i].set_title(f'Sample {i+1}: {img_file}')
        axes[i].axis('off')

    # Hide grid cells that received no image.
    for unused_ax in axes[len(sample_files):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

    print(f"🏷️  Mapped classes being detected: {class_names}")

# Visualize samples with mapped classes
visualize_yolo_sample_mapped('./output')

In [None]:
# Cell 7: Enhanced YOLO Training with Mapped Classes
print("🚀 Starting YOLO training with mapped classes...")

# Initialize YOLOv8 model with pretrained weights
model = YOLO('yolov8l.pt')  #large model

# Enhanced training configuration for mapped classes
results = model.train(
    data='./output/data.yaml',
    epochs=200,
    imgsz=800,
    batch=8,
    name='taco_detection_extended',  # New experiment name
    device=0,
)

print("🎉 Training complete!")
# Report the directory the trainer actually wrote to, rather than a
# hard-coded name that can drift from the `name=` argument above.
print(f"📁 Model saved in: {model.trainer.save_dir}")

In [None]:
# Cell 8: Enhanced Model Evaluation (Train + Validation)
import yaml

# Load the trained model
# The run folder matches the `name=` passed to model.train() in the training
# cell ('taco_detection_extended'); adjust it if a different run is wanted.
model_path = './runs/detect/taco_detection_extended/weights/best.pt'
model = YOLO(model_path)

print("📊 Comprehensive Model Evaluation (Train + Validation)")

# 1. Evaluate on VALIDATION set
print("\n" + "="*60)
print("🔍 VALIDATION SET PERFORMANCE")
print("="*60)

val_metrics = model.val(data='./output/data.yaml', split='val')

print(f"📈 Validation mAP50: {val_metrics.box.map50:.3f}")
print(f"📈 Validation mAP50-95: {val_metrics.box.map:.3f}")
print(f"🎯 Validation Precision: {val_metrics.box.mp:.3f}")
print(f"🎯 Validation Recall: {val_metrics.box.mr:.3f}")
print(f"⚡ Inference Speed: {val_metrics.speed['inference']:.1f}ms")

# 2. Evaluate on TRAINING set
print("\n" + "="*60)
print("🔍 TRAINING SET PERFORMANCE")
print("="*60)

# Create temporary data.yaml for training evaluation
with open('./output/data.yaml', 'r') as f:
    data_config = yaml.safe_load(f)

# Create modified config pointing to train set for both train and val
train_config = data_config.copy()
train_config['val'] = data_config['train']  # Point validation to train folder

# Save temporary config, and remove it again even if validation fails
temp_config_path = './output/data_train_eval.yaml'
try:
    with open(temp_config_path, 'w') as f:
        yaml.dump(train_config, f)

    # Evaluate on training set
    train_metrics = model.val(data=temp_config_path, split='val')  # Uses train data as "val"
finally:
    if os.path.exists(temp_config_path):
        os.remove(temp_config_path)

print(f"📈 Training mAP50: {train_metrics.box.map50:.3f}")
print(f"📈 Training mAP50-95: {train_metrics.box.map:.3f}")
print(f"🎯 Training Precision: {train_metrics.box.mp:.3f}")
print(f"🎯 Training Recall: {train_metrics.box.mr:.3f}")

# 3. Compare Train vs Validation Performance
print("\n" + "="*60)
print("📊 TRAIN vs VALIDATION COMPARISON")
print("="*60)

def performance_analysis(train_map50, val_map50, train_map, val_map):
    """Print the train/validation mAP gaps and a verdict based on the mAP50 gap.

    Args:
        train_map50: mAP50 measured on the training set.
        val_map50: mAP50 measured on the validation set.
        train_map: mAP50-95 measured on the training set.
        val_map: mAP50-95 measured on the validation set.

    Returns:
        ``(map50_diff, map_diff)``, each computed as train minus validation.
    """

    map50_diff = train_map50 - val_map50
    map_diff = train_map - val_map

    print(f"📈 mAP50 Difference (Train - Val): {map50_diff:+.3f}")
    print(f"📈 mAP50-95 Difference (Train - Val): {map_diff:+.3f}")

    # (exclusive lower bound for the mAP50 gap, lines to print), ordered from
    # the largest bound down; the first bound the gap exceeds is reported.
    verdicts = (
        (0.15, (
            "⚠️  SEVERE OVERFITTING detected!",
            "   → Model memorized training data",
            "   → Consider: more augmentation, dropout, early stopping",
        )),
        (0.08, (
            "🟡 MODERATE OVERFITTING detected",
            "   → Model performs much better on training data",
            "   → Consider: more regularization, data augmentation",
        )),
        (0.03, (
            "🟢 SLIGHT OVERFITTING (normal)",
            "   → Expected small gap between train and validation",
        )),
        (-0.02, (
            "✅ WELL-BALANCED model",
            "   → Good generalization, minimal overfitting",
        )),
    )
    # Reported when the gap exceeds none of the bounds above.
    fallback_verdict = (
        "🔴 UNDERFITTING detected",
        "   → Model performs better on validation than training",
        "   → Consider: longer training, less regularization",
    )

    chosen_lines = next(
        (lines for lower_bound, lines in verdicts if map50_diff > lower_bound),
        fallback_verdict,
    )
    for line in chosen_lines:
        print(line)

    return map50_diff, map_diff

# Perform analysis
map50_diff, map_diff = performance_analysis(
    train_metrics.box.map50, val_metrics.box.map50,
    train_metrics.box.map, val_metrics.box.map
)

# 4. Detailed Performance Breakdown
print("\n📋 DETAILED PERFORMANCE BREAKDOWN:")
print(f"{'Metric':<20} {'Training':<12} {'Validation':<12} {'Difference':<12}")
print("-" * 60)
# One table row per metric: (row label, attribute name on `.box`).
breakdown_rows = (
    ('mAP50', 'map50'),
    ('mAP50-95', 'map'),
    ('Precision', 'mp'),
    ('Recall', 'mr'),
)
for row_label, box_attr in breakdown_rows:
    train_value = getattr(train_metrics.box, box_attr)
    val_value = getattr(val_metrics.box, box_attr)
    print(f"{row_label:<20} {train_value:<12.3f} {val_value:<12.3f} {train_value-val_value:<+12.3f}")

# Enhanced performance interpretation for mapped classes
print("\n🎯 OVERALL ASSESSMENT:")
# (exclusive lower bound on validation mAP50, message), highest bound first.
assessment_levels = (
    (0.7, "🌟 Excellent performance! Dataset fix worked amazingly!"),
    (0.5, "✅ Good performance! Much better with full dataset!"),
    (0.3, "⚠️  Moderate performance - improvement from dataset fix visible"),
)
print(next(
    (message for lower_bound, message in assessment_levels if val_metrics.box.map50 > lower_bound),
    "❌ Still low performance - may need more training or larger model",
))

print("\n🎉 Comprehensive evaluation complete!")

In [None]:
# Cell 9: Display Training Plots
from IPython.display import Image, display
import glob

print("📊 Training Results Visualization")

# Find result images. The run folder matches the `name=` passed to
# model.train() in the training cell ('taco_detection_extended').
results_dir = './runs/detect/taco_detection_extended'
plot_files = sorted(
    glob.glob(os.path.join(results_dir, '*.png'))
    + glob.glob(os.path.join(results_dir, '*.jpg'))
)

# Display key plots. The validation prediction mosaic is written by
# ultralytics as a .jpg file, unlike the two .png summary plots.
key_plots = ['results.png', 'confusion_matrix.png', 'val_batch0_pred.jpg']

missing_plots = []
for plot_name in key_plots:
    plot_path = os.path.join(results_dir, plot_name)
    if os.path.exists(plot_path):
        print(f"\n📈 {plot_name.replace('_', ' ').title()}")
        display(Image(filename=plot_path))
    else:
        print(f"⚠️  {plot_name} not found")
        missing_plots.append(plot_name)

# When an expected plot is absent, list what the run folder does contain.
if missing_plots:
    available = ', '.join(os.path.basename(path) for path in plot_files) or 'none'
    print(f"\n🗂️  Image files present in {results_dir}: {available}")

print(f"\n📁 All training results saved in: {results_dir}")

In [None]:
# Cell 10: Test on Validation Samples
def test_on_validation_samples(model, output_dir, num_samples=6):
    """Run the model on random validation images, plot and save the results.

    Args:
        model: Loaded YOLO model; called as ``model(path, conf=0.25)``.
        output_dir: Dataset root; images are read from its ``images/val``.
        num_samples: Maximum number of images to test. The subplot grid
            (three columns) grows to fit the number of sampled images.

    Annotated copies of the tested images are written to
    ``./inference_results``.
    """
    import math

    val_images_dir = os.path.join(output_dir, 'images/val')
    results_dir = './inference_results'
    os.makedirs(results_dir, exist_ok=True)

    # Get sample validation images; extensions are compared in lower case
    # so ".JPG" files are included.
    val_images = sorted(
        name for name in os.listdir(val_images_dir)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    sample_images = random.sample(val_images, max(0, min(num_samples, len(val_images))))

    if not sample_images:
        print(f"⚠️  No validation images found in {val_images_dir}")
        return

    # Size the grid from the sample count; the default of 6 gives 2x3.
    n_cols = 3
    n_rows = math.ceil(len(sample_images) / n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    print(f"🔍 Testing model on {len(sample_images)} validation images...")

    for i, img_file in enumerate(sample_images):
        img_path = os.path.join(val_images_dir, img_file)

        # Run inference
        results = model(img_path, conf=0.25)  # Confidence threshold
        result = results[0]

        # Annotated image as returned by plot(); converted to RGB only for
        # matplotlib, while the unconverted array is what gets saved.
        annotated_bgr = result.plot()

        # Count detections
        num_detections = len(result.boxes) if result.boxes is not None else 0

        # Display
        axes[i].imshow(cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB))
        axes[i].set_title(f'{img_file}\nDetections: {num_detections}')
        axes[i].axis('off')

        # Save annotated image
        output_path = os.path.join(results_dir, f'annotated_{img_file}')
        cv2.imwrite(output_path, annotated_bgr)

        # Print detection details
        if num_detections > 0:
            print(f"📷 {img_file}: {num_detections} objects detected")
            for box in result.boxes:
                class_id = int(box.cls)
                confidence = float(box.conf)
                class_name = model.names[class_id]
                print(f"   └─ {class_name}: {confidence:.2f}")
        else:
            print(f"📷 {img_file}: No objects detected")

    # Hide grid cells that received no image.
    for unused_ax in axes[len(sample_images):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

    print(f"💾 Annotated images saved to: {results_dir}")

# Run validation test
test_on_validation_samples(model, './output')

In [None]:
# Cell 11: Test Custom Images Function


# Weights used for the custom-image test. Set the TACO_MODEL_PATH environment
# variable to use another file; the default below is a machine-specific path.
MODEL_PATH = os.environ.get(
    "TACO_MODEL_PATH",
    "/home/cupo-ubuntu/Schreibtisch/ML4B/wastenet-website/backend/taco_model.pt",
)
model = YOLO(MODEL_PATH)


def test_custom_image(model, image_path):
    """Run the model on one image and show the original beside the detections.

    Args:
        model: Loaded YOLO model; called as ``model(path, conf=0.25)``.
        image_path: Path of the image to test.

    Returns:
        The results returned by the model call, or ``None`` when the image
        does not exist or cannot be decoded.
    """

    if not os.path.exists(image_path):
        print(f"❌ Image not found: {image_path}")
        return None

    # Load the original image first: cv2.imread returns None (it does not
    # raise) when the file cannot be decoded, which would crash cvtColor.
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Could not read image: {image_path}")
        return None
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    print(f"🔍 Testing model on: {os.path.basename(image_path)}")

    # Run inference
    results = model(image_path, conf=0.25)

    # Get annotated image
    annotated_img = results[0].plot()
    annotated_img = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)

    # Display side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    ax1.imshow(image)
    ax1.set_title('Original Image')
    ax1.axis('off')

    ax2.imshow(annotated_img)
    ax2.set_title('YOLO Detection Results')
    ax2.axis('off')

    plt.tight_layout()
    plt.show()

    # Print detection details
    num_detections = len(results[0].boxes) if results[0].boxes is not None else 0
    print(f"🎯 Found {num_detections} objects:")

    if num_detections > 0:
        for i, box in enumerate(results[0].boxes):
            class_id = int(box.cls)
            confidence = float(box.conf)
            class_name = model.names[class_id]
            print(f"   {i+1}. {class_name} (confidence: {confidence:.2f})")

    return results

# Example usage: this call runs as written. Set the TACO_SAMPLE_IMAGE
# environment variable to test another file; the default below is a
# machine-specific path, and a missing file is reported rather than raised.
sample_image_path = os.environ.get(
    'TACO_SAMPLE_IMAGE',
    '/home/cupo-ubuntu/Schreibtisch/ML4B/wastenet-model/github/data/batch_1/000111.JPG',
)
test_custom_image(model, sample_image_path)

print("💡 To test custom images, use: test_custom_image(model, 'path/to/image.jpg')")

In [None]:
# Cell 12: Export Models for Deployment (SIMPLE FIX)




print("📦 Exporting model for deployment...")

# SIMPLE FIX: Move model to CPU first to avoid GPU export issues
model.to('cpu')

export_dir = './exported_models'
os.makedirs(export_dir, exist_ok=True)

import shutil

# Defined up front so both names exist even when an export fails.
onnx_path = None
tflite_path = None

# Export to ONNX (for React website)
try:
    onnx_path = model.export(format='onnx', imgsz=640, half=False, simplify=False)
    print(f"✅ ONNX model exported: {onnx_path}")
except Exception as e:
    print(f"❌ ONNX export failed: {e}")

# Export to TensorFlow Lite (for Android)
try:
    tflite_path = model.export(format='tflite', imgsz=640, half=False)
    print(f"✅ TensorFlow Lite model exported: {tflite_path}")
except Exception as e:
    print(f"❌ TFLite export failed: {e}")

# Gather the exported files in export_dir next to the PyTorch weights.
for exported_path in (onnx_path, tflite_path):
    if exported_path and os.path.isfile(str(exported_path)):
        shutil.copy(str(exported_path), export_dir)
        print(f"📁 Copied to export folder: {os.path.basename(str(exported_path))}")

# Keep PyTorch model (for server deployment). Prefer the checkpoint the
# exported `model` was loaded from, so the three artifacts describe the same
# network; fall back to `model_path` when the model does not expose it.
# NOTE(review): relies on the ultralytics model object's `ckpt_path`
# attribute; confirm it is set in the installed version.
weights_source = getattr(model, 'ckpt_path', None) or model_path
pt_dest = os.path.join(export_dir, 'taco_model.pt')
shutil.copy(weights_source, pt_dest)
print(f"✅ PyTorch model copied: {pt_dest}")

print("\n📋 Model Export Summary:")
print(f"🌐 React website: Use ONNX model")
print(f"📱 Android app: Use TFLite model")
print(f"🖥️  Server deployment: Use PyTorch model")

In [None]:
# Cell 13: Final Training Summary and Report
#
# Builds a JSON report of the run (dataset usage, class mapping, model settings,
# metrics, file locations), writes it to ./training_summary.json and prints a
# human-readable recap. Relies on variables produced by the earlier cells.
from datetime import datetime

# Image count the report compares against ("From 498 to N images"); named once so
# the JSON report and the console recap cannot drift apart.
BASELINE_IMAGE_COUNT = 498

# Figures that are reused several times below; computed once.
images_used_total = train_images + val_images
usage_rate_pct = (images_used_total / len(images)) * 100
class_reduction_pct = (len(categories) - len(mapped_categories)) / len(categories) * 100

# Collect training information
training_summary = {
    "timestamp": datetime.now().isoformat(),
    "dataset": {
        "name": "TACO",
        "total_images": len(images),
        "total_annotations": len(annotations),
        "original_classes": len(categories),
        "mapped_classes": len(mapped_categories),
        "train_images": train_images,
        "val_images": val_images,
        "images_used": images_used_total,
        "usage_rate": f"{usage_rate_pct:.1f}%"
    },
    "class_mapping": {
        "original_to_mapped": class_mapping,
        "mapped_categories": mapped_categories,
        "reduction_ratio": f"{len(categories)} → {len(mapped_categories)} ({class_reduction_pct:.1f}% reduction)"
    },
    # NOTE(review): these settings are literals, not read from the training run;
    # confirm they match the configuration used in the training cell.
    "model": {
        "architecture": "YOLOv8n",
        "pretrained": True,
        "epochs": 100,
        "image_size": 640,
        "batch_size": 16
    },
    "performance": {
        # float() turns the metric values into plain floats for json.dump.
        "val_mAP50": float(val_metrics.box.map50),
        "val_mAP50_95": float(val_metrics.box.map),
        "val_precision": float(val_metrics.box.mp),
        "val_recall": float(val_metrics.box.mr),
        "train_mAP50": float(train_metrics.box.map50),
        "train_mAP50_95": float(train_metrics.box.map),
        "overfitting_gap": float(map50_diff),
        "inference_speed_ms": float(val_metrics.speed['inference'])
    },
    "fixes_applied": {
        "filename_collision_fix": True,
        "class_mapping": True,
        "dataset_usage_improvement": f"From {BASELINE_IMAGE_COUNT} to {images_used_total} images"
    },
    "files": {
        "model_path": model_path,
        "config_path": './output/data.yaml',
        "results_dir": results_dir,
        "mapping_info": './output/class_mapping_info.json'
    }
}

# Save summary
summary_path = './training_summary.json'
with open(summary_path, 'w') as f:
    json.dump(training_summary, f, indent=2)

print("📊 TRAINING COMPLETED SUCCESSFULLY! 🎉")
print("="*70)
print("📋 FINAL SUMMARY:")
print(f"   🎯 Final Validation mAP50: {val_metrics.box.map50:.3f}")
print(f"   📈 Training mAP50: {train_metrics.box.map50:.3f}")
print(f"   🔄 Overfitting Gap: {map50_diff:+.3f}")
print(f"   ⚡ Inference Speed: {val_metrics.speed['inference']:.1f}ms")
print(f"   📉 Class Reduction: {len(categories)} → {len(mapped_categories)} classes")
print(f"   📊 Dataset Usage: {images_used_total:,}/{len(images):,} images ({usage_rate_pct:.1f}%)")
print(f"   📁 Model Location: {model_path}")
print(f"   📄 Summary Report: {summary_path}")
print("="*70)
print("🚀 Your COLLISION-FIXED TACO object detection model is ready!")

# Display final class distribution
print(f"\n🏷️  FINAL MAPPED CLASSES ({len(mapped_categories)} total):")
# The grand total does not change inside the loop, so compute it once.
total_mapped_samples = sum(mapped_counts.values())
for cat in mapped_categories:
    yolo_id = mapped_category_to_yolo_id[cat]
    count = mapped_counts[cat]
    # Guard against an all-zero count table instead of raising ZeroDivisionError.
    percentage = (count / total_mapped_samples) * 100 if total_mapped_samples else 0.0
    print(f"   {yolo_id}: {cat:25} - {count:4} samples ({percentage:5.1f}%)")

print(f"\n💡 FIXES APPLIED SUCCESSFULLY:")
print(f"   ✅ Filename collision prevention (batch names included)")
print(f"   ✅ Full dataset utilization ({images_used_total:,} images)")
print(f"   ✅ Class mapping and balance improvement")
print(f"   ✅ Enhanced training configuration")
print(f"   ✅ Comprehensive evaluation (train + validation)")

improvement_factor = images_used_total / BASELINE_IMAGE_COUNT
print(f"\n🚀 PERFORMANCE IMPROVEMENT EXPECTED:")
print(f"   📈 {improvement_factor:.1f}x more training data")
print(f"   🎯 Should see significant mAP improvement")
print(f"   ⚡ Better generalization to new images")
print(f"   🔧 Ready for real-world deployment!")