# 🚀 LEGO Bricks ML Vision - Training Pipeline

## Environment-Agnostic Training Pipeline for YOLOv8 LEGO Detection Models

This notebook provides a complete training pipeline that works seamlessly across:
- 💻 Local Windows development environment
- ☁️ Kaggle notebooks
- 🐧 Linux/Unix systems

### Key Features

- **🔄 Automatic Environment Detection**: Adapts to Windows/Kaggle/Linux
- **📂 Smart Repository Management**: Auto-clones if needed
- **🎯 Two-Stage Detection**: Trains both brick and stud detectors
- **📊 Enhanced Visualization**: Rich progress tracking and results display
- **💾 Robust Data Handling**: Comprehensive dataset preparation and validation

### Prerequisites
- Python 3.8+
- Git (for repository cloning)
- Required packages will be automatically installed

Let's begin by setting up our environment! 🚀

## 1. Environment Setup and Validation

First, we'll establish our execution environment and ensure all dependencies are in place.

In [None]:
import os
import sys
import logging
from pathlib import Path
import subprocess
import platform

# Configure rich logging with emojis
def setup_notebook_logging():
    """Return the shared ``notebook_logger``, configuring it on first use.

    A stream handler with a timestamped format is attached and the level is
    set to INFO only when the logger has no handlers yet, so calling this
    again does not stack duplicate handlers.
    """
    notebook_log = logging.getLogger("notebook_logger")
    if notebook_log.handlers:
        # Already configured by an earlier call.
        return notebook_log

    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    notebook_log.setLevel(logging.INFO)
    notebook_log.addHandler(stream)
    return notebook_log


logger = setup_notebook_logging()

def detect_environment():
    """Collect and log basic facts about the execution environment.

    Returns:
        dict: ``system`` (``platform.system()``), ``is_kaggle`` (whether the
        ``KAGGLE_KERNEL_RUN_TYPE`` environment variable is set),
        ``python_version`` and ``git_available`` (whether a ``git``
        executable is found on ``PATH``).
    """
    # Imported locally: shutil is not among this notebook's imports, so the
    # shutil.which() lookup below would otherwise raise NameError.
    import shutil

    env_info = {
        'system': platform.system(),
        'is_kaggle': 'KAGGLE_KERNEL_RUN_TYPE' in os.environ,
        'python_version': platform.python_version(),
        'git_available': shutil.which('git') is not None
    }

    logger.info("üîç Detected environment:")
    logger.info(f"   ‚Ä¢ System: {env_info['system']}")
    logger.info(f"   ‚Ä¢ Python: {env_info['python_version']}")
    logger.info(f"   ‚Ä¢ Kaggle: {'Yes' if env_info['is_kaggle'] else 'No'}")
    logger.info(f"   ‚Ä¢ Git: {'Available' if env_info['git_available'] else 'Not found'}")

    return env_info

def validate_repository_structure():
    """Locate the project repository, cloning it if needed, and cd into it.

    Search order:

    1. The current directory, its parent, then its grandparent. A directory
       counts as the repository when its name equals ``REPO_NAME``.
    2. A non-empty ``REPO_NAME`` directory inside the current directory
       (for example a clone left behind by a previous run).
    3. Otherwise the repository is cloned into the current directory.

    In every case the process working directory is changed to the
    repository root before returning.

    Returns:
        pathlib.Path: The repository root.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits non-zero.
        FileNotFoundError: If the ``git`` executable cannot be found.
    """
    REPO_NAME = "LEGO_Bricks_ML_Vision"
    REPO_URL = "https://github.com/MiguelDiLalla/LEGO_Bricks_ML_Vision.git"

    cwd = Path.cwd()
    parent = cwd.parent
    grandparent = parent.parent
    repo_path = cwd / REPO_NAME

    # Check all possible locations
    locations = [
        (cwd, "current directory"),
        (parent, "parent directory"),
        (grandparent, "grandparent directory")
    ]

    # git refuses to clone into an existing non-empty directory, so a clone
    # that is already sitting inside the current directory is reused instead
    # of attempting (and failing) to clone again.
    if repo_path.is_dir() and any(repo_path.iterdir()):
        locations.append((repo_path, "existing clone in current directory"))

    for path, desc in locations:
        if path.name == REPO_NAME:
            logger.info(f"‚úÖ Repository found in {desc}: {path}")
            os.chdir(path)  # Set CWD to repo root
            return path

    # Repository not found, need to clone
    logger.info("‚ö†Ô∏è Repository not found in directory tree. Cloning from GitHub...")
    try:
        # Clone to current directory
        subprocess.run(["git", "clone", REPO_URL],
                       check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"‚ùå Failed to clone repository: {e.stderr}")
        raise
    except FileNotFoundError:
        # subprocess raises this when the git executable itself is missing.
        logger.error("Failed to clone repository: 'git' executable not found on PATH")
        raise

    os.chdir(repo_path)  # Set CWD to new repo
    logger.info(f"‚úÖ Repository cloned successfully to: {repo_path}")
    return repo_path

# Execute environment setup: log the environment facts, then locate (or
# clone) the repository. validate_repository_structure() also changes the
# working directory to the repository root.
env_info = detect_environment()
repo_path = validate_repository_structure()

# Add repository root to Python path so later cells can import the project's
# `train` module; skipped when the entry is already present.
if str(repo_path) not in sys.path:
    sys.path.append(str(repo_path))
    logger.info(f"‚úÖ Added repository root to Python path")

logger.info("üéâ Environment setup complete!")

In [None]:
# Install required packages
def install_requirements():
    """Install the packages listed in the repository's ``requirements.txt``.

    pip is invoked as ``<current interpreter> -m pip`` so the packages are
    installed into the environment this kernel runs in, rather than into
    whichever ``pip`` happens to be first on ``PATH``.

    Returns:
        bool: True when installation succeeded; False when the requirements
        file is missing or pip exited with a non-zero status.
    """
    req_path = repo_path / "requirements.txt"

    if not req_path.exists():
        logger.error("‚ùå requirements.txt not found!")
        return False

    logger.info("üì¶ Installing required packages...")
    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-r", str(req_path), "--quiet"],
            check=True,
            capture_output=True,
            text=True,  # decode stderr so a failure is logged as text, not b'...'
        )
    except subprocess.CalledProcessError as e:
        logger.error(f"‚ùå Failed to install requirements: {e.stderr}")
        return False

    logger.info("‚úÖ Package installation complete!")
    return True

# Install requirements; the result gates every import below.
install_success = install_requirements()

if install_success:
    # Import project modules (resolved through the repository root that was
    # appended to sys.path during environment setup).
    from train import (
        setup_logging, get_repo_root, detect_hardware,
        unzip_dataset, validate_dataset, create_dataset_structure,
        dataset_split, augment_data, select_model, train_model,
        zip_and_download_results
    )
    
    # Import required third-party libraries used by the following cells.
    import torch
    import cv2
    import numpy as np
    import matplotlib.pyplot as plt
    from ultralytics import YOLO
    import albumentations as A
    from IPython.display import Image, display
    
    logger.info("‚úÖ All required modules imported successfully!")
else:
    # Nothing is imported on this path, so the names above stay undefined
    # for the cells that follow.
    logger.error("‚ùå Failed to set up environment. Please check the errors above.")

## 2. Hardware Detection and Training Setup

Now we'll detect available hardware and configure our training environment.

In [None]:
# Initialize training environment
# cleanup_training_sessions is called below but is not part of the import
# list in the setup cell, so it is brought into scope here to avoid a
# NameError.
# NOTE(review): assumes train.py exports cleanup_training_sessions - confirm.
from train import cleanup_training_sessions

setup_logging()  # Set up enhanced logging from train.py

# Detect optimal hardware
device = detect_hardware()
logger.info(f"üñ•Ô∏è Training will use device: {device}")

# Clean up any previous training sessions
cleanup_training_sessions(repo_path)
logger.info("üßπ Previous training sessions cleaned up")

## 3. Dataset Preparation

Let's prepare our datasets for both brick and stud detection models.

In [None]:
def prepare_datasets():
    """Extract and validate the 'bricks' and 'studs' datasets.

    Each mode is unpacked with ``unzip_dataset`` and checked with
    ``validate_dataset``; the number of ``*.jpg`` images and ``*.txt`` label
    files found is then logged.

    Returns:
        dict: Maps each mode to a dict with the keys ``'path'``,
        ``'images'`` and ``'labels'``.
    """
    banner = '=' * 20
    prepared = {}

    for mode in ('bricks', 'studs'):
        logger.info(f"\n{banner} Preparing {mode} dataset {banner}")

        # Unpack first, then validate the extracted structure.
        extracted_path = unzip_dataset(mode)
        images_path, labels_path = validate_dataset(mode)

        prepared[mode] = dict(
            path=extracted_path,
            images=images_path,
            labels=labels_path,
        )

        # Report how many image and label files were found.
        n_images = sum(1 for _ in Path(images_path).glob("*.jpg"))
        n_labels = sum(1 for _ in Path(labels_path).glob("*.txt"))
        logger.info(f"üìä Dataset Statistics for {mode}:")
        logger.info(f"   ‚Ä¢ Images: {n_images}")
        logger.info(f"   ‚Ä¢ Labels: {n_labels}")

    return prepared

# Prepare both datasets; the resulting dict (keyed by 'bricks' / 'studs') is
# consumed by the visualization and processing cells below.
prepared_datasets = prepare_datasets()

### 3.1 Dataset Visualization

Let's visualize some samples from our datasets to verify annotations.

In [None]:
def _yolo_line_to_pixel_box(line, width, height):
    """Parse one YOLO label line into ``(cls_id, x1, y1, x2, y2)`` in pixels.

    Returns None when the line is not exactly five numeric fields (blank
    lines, other label layouts, corrupt rows).
    """
    fields = line.split()
    if len(fields) != 5:
        return None
    try:
        cls_id, x, y, w, h = map(float, fields)
        # Convert normalized centre/size coordinates to pixel corners
        return (
            int(cls_id),
            int((x - w/2) * width),
            int((y - h/2) * height),
            int((x + w/2) * width),
            int((y + h/2) * height),
        )
    except (ValueError, OverflowError):
        # Non-numeric text, or nan/inf values that cannot become ints.
        return None


def visualize_dataset_samples(datasets, num_samples=3):
    """Display random sample images with their YOLO annotations drawn on top.

    One matplotlib figure is shown per dataset. A dataset with no ``*.jpg``
    images (or a non-positive ``num_samples``) is skipped with a warning, an
    image that cannot be read leaves its panel blank, and blank or malformed
    label lines are ignored.

    Args:
        datasets (dict): Maps a mode ('bricks' or 'studs') to a dict with
            'images' and 'labels' directory paths.
        num_samples (int): Maximum number of samples to display per dataset.
    """
    # Imported locally: random is not among this notebook's imports, so the
    # sampling below would otherwise raise NameError.
    import random

    # Define visualization colors (loop-invariant, so built once)
    colors = {
        'bricks': {'box': 'red', 'text': 'white', 'bg': 'darkred'},
        'studs': {'box': 'blue', 'text': 'white', 'bg': 'darkblue'}
    }

    for mode, paths in datasets.items():
        logger.info(f"\n{'='*20} {mode.capitalize()} Dataset Samples {'='*20}")

        # Get random samples
        image_files = list(Path(paths['images']).glob("*.jpg"))
        sample_count = max(0, min(num_samples, len(image_files)))
        if sample_count == 0:
            # plt.subplots(1, 0) raises, so there is nothing to draw here.
            logger.warning(f"No sample images to display for {mode}; skipping")
            continue
        samples = random.sample(image_files, sample_count)

        # Setup visualization
        _, axes = plt.subplots(1, sample_count, figsize=(5*sample_count, 5))
        if sample_count == 1:
            # A single subplot is returned bare rather than in an array.
            axes = [axes]

        for idx, (img_path, ax) in enumerate(zip(samples, axes)):
            ax.set_title(f"{mode.capitalize()} Sample {idx+1}")
            ax.axis('off')

            # Load and display image
            img = cv2.imread(str(img_path))
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                logger.warning(f"Could not read image, leaving panel blank: {img_path}")
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            ax.imshow(img)

            # Load and display annotations
            label_path = Path(paths['labels']) / f"{img_path.stem}.txt"
            if not label_path.exists():
                continue

            height, width = img.shape[:2]
            with open(label_path) as f:
                for line in f:
                    box = _yolo_line_to_pixel_box(line, width, height)
                    if box is None:
                        continue
                    cls_id, x1, y1, x2, y2 = box

                    # Draw bounding box
                    rect = plt.Rectangle(
                        (x1, y1), x2-x1, y2-y1,
                        fill=False,
                        edgecolor=colors[mode]['box'],
                        linewidth=2
                    )
                    ax.add_patch(rect)

                    # Add label
                    ax.text(
                        x1, y1-5,
                        f"{mode[:-1].capitalize()} {cls_id}",
                        color=colors[mode]['text'],
                        bbox=dict(
                            facecolor=colors[mode]['bg'],
                            alpha=0.8,
                            edgecolor='none',
                            pad=1
                        )
                    )

        plt.tight_layout()
        plt.show()

# Visualize samples from both datasets using the default sample count
# (num_samples=3 per dataset).
visualize_dataset_samples(prepared_datasets)

### 3.2 Dataset Splitting and Augmentation

Now we'll split our datasets into train/val/test sets and apply augmentation to increase model robustness.

In [None]:
def process_datasets(datasets):
    """Build, split and augment the YOLO dataset for every mode.

    Args:
        datasets (dict): Prepared datasets keyed by mode; only the keys are
            used here.

    Returns:
        dict: Maps each mode to the directory returned by
        ``create_dataset_structure``.
    """
    banner = '=' * 20
    dataset_dirs = {}

    for mode in datasets:
        logger.info(f"\n{banner} Processing {mode} dataset {banner}")

        # Create the YOLO directory structure, then split into it.
        yolo_dir = create_dataset_structure(mode, repo_path)
        dataset_split(mode, repo_path)  # return value is not needed here

        # Augment the training set
        logger.info(f"üîÑ Applying augmentation to {mode} training set...")
        augment_data(yolo_dir, augmentations=2)

        dataset_dirs[mode] = yolo_dir

    return dataset_dirs

# Process both datasets; the result maps each mode to its dataset directory
# and is passed to the training cell below.
processed_datasets = process_datasets(prepared_datasets)

## 4. Model Training

### 4.1 Training Configuration

Let's set up our training parameters and initialize our models.

In [None]:
def configure_training(mode):
    """Build and log the training configuration for one detector.

    Args:
        mode (str): Either 'bricks' or 'studs'

    Returns:
        dict: Keys ``epochs``, ``batch_size``, ``device``, ``use_pretrained``
        and ``model_path`` (the value returned by ``select_model``).
    """
    use_pretrained = True
    config = dict(
        epochs=50,
        batch_size=16,
        device=device,
        use_pretrained=use_pretrained,
    )

    # Resolve the model for this mode and record it with the other settings.
    config['model_path'] = select_model(mode, use_pretrained=use_pretrained)

    logger.info(f"‚öôÔ∏è Training configuration for {mode}:")
    for key in config:
        logger.info(f"   ‚Ä¢ {key}: {config[key]}")

    return config

# Configure training for both modes: one configuration dict per detector,
# keyed by mode name; read by the training cell below.
training_configs = {
    mode: configure_training(mode)
    for mode in ['bricks', 'studs']
}

### 4.2 Model Training

Now we'll train both the brick and stud detection models with progress tracking.

In [None]:
def train_models(processed_datasets, training_configs):
    """Train the brick detector and then the stud detector.

    Args:
        processed_datasets (dict): Dataset directory for each mode
        training_configs (dict): Training configuration for each mode

    Returns:
        dict: Maps each mode to the value returned by ``train_model``.
    """
    banner = '=' * 20
    results_by_mode = {}

    for mode in ('bricks', 'studs'):
        logger.info(f"\n{banner} Training {mode} detector {banner}")

        cfg = training_configs[mode]

        # Run training for this mode and keep its results location.
        results_by_mode[mode] = train_model(
            dataset_path=processed_datasets[mode],
            model_path=cfg['model_path'],
            device=cfg['device'],
            epochs=cfg['epochs'],
            batch_size=cfg['batch_size'],
            repo_root=repo_path,
        )
        logger.info(f"‚úÖ {mode} model training completed!")

    return results_by_mode

# Train both models; `training_results` maps each mode to its results
# location and is used by the export and analysis cells below.
training_results = train_models(processed_datasets, training_configs)

## 5. Results Export and Visualization

Finally, let's save our training results and visualize the model performance.

In [None]:
def export_training_results(training_results):
    """Export and archive training results for both models.

    Each results directory is passed to ``zip_and_download_results`` together
    with a timestamped archive name of the form
    ``<mode>_training_results_<YYYYmmdd_HHMMSS>.zip``.

    Args:
        training_results (dict): Maps each mode to its results directory
    """
    # Imported locally: datetime is not among this notebook's imports, so the
    # timestamp below would otherwise raise NameError.
    import datetime

    for mode, results_dir in training_results.items():
        logger.info(f"\n{'='*20} Exporting {mode} results {'='*20}")

        # Create timestamped filename
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f"{mode}_training_results_{timestamp}.zip"

        # Zip and provide download link
        zip_and_download_results(results_dir, output_filename)
        logger.info(f"‚úÖ {mode} results exported successfully!")

# Export results for both models (one timestamped zip archive per mode).
export_training_results(training_results)

# Final status message for the training and export stages.
logger.info("\nüéâ Training pipeline completed successfully! üéâ")

## 6. Training Results Analysis

Let's analyze the training results and visualize model performance metrics.

In [None]:
# Imported here rather than in the setup cell; the helper comes from the
# project's train module.
# NOTE(review): presumably renders metrics for a finished session - confirm
# against train.py.
from train import display_last_training_session

# Display results for both models, each under a printed banner.
for mode, results_dir in training_results.items():
    print(f"\n{'='*40}")
    print(f"Results Analysis for {mode.upper()} Detection Model")
    print(f"{'='*40}\n")
    display_last_training_session(results_dir)