In [None]:
# Include the needed libraries
import tensorflow as tf
import tensorflow_datasets as tfds
import time
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import layers, models, losses, optimizers, metrics
import os

# Check for accelerator (GPU or TPU)
# This cell leaves three module-level names behind for the rest of the notebook:
#   accelerator_found - True when a TPU or a GPU was set up
#   using_tpu         - True only when the TPU path succeeded
#   strategy          - the tf.distribute strategy to train under
#                       (a CPU OneDeviceStrategy when nothing else is available)
accelerator_found = False
using_tpu = False
strategy = None

# Check for TPU first
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # Will detect TPU in Colab
    print(f"TPU detected: {tpu.cluster_spec().as_dict()}")
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    using_tpu = True
    accelerator_found = True
    print(f"TPU initialized with {strategy.num_replicas_in_sync} replicas")
except (ValueError, AttributeError, ImportError):
    # TPU setup failed with one of the errors above: fall back to GPU, then CPU.
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        accelerator_found = True
        # Set memory growth to avoid OOM.
        # Memory growth has to be configured before the GPU runtime is
        # initialised, so do it BEFORE building the strategy (which may touch
        # the device). Re-running this cell after the GPU is already in use
        # raises RuntimeError; report it instead of aborting the notebook.
        for gpu in gpus:
            try:
                tf.config.experimental.set_memory_growth(gpu, True)
            except RuntimeError as err:
                print(f"Could not enable memory growth on {gpu.name}: {err}")
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        print(f"GPU detected: {gpus[0].name}")
    else:
        # Use CPU
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        print("No accelerator detected, using CPU")

if not accelerator_found:
    print("Warning: No accelerator (GPU or TPU) detected! Using CPU.")

# CREATE A CNN MODEL
def create_cnn_model(num_classes=10):
    """Build the small Keras CNN used throughout the benchmark.

    The network is two Conv2D(3x3, 'same', ReLU) + 2x2 max-pool stages
    (32 then 64 filters) on 32x32x3 inputs, followed by a 128-unit ReLU
    dense layer, dropout of 0.3 and a softmax output layer.

    Args:
        num_classes: Number of units in the final softmax layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Layers are instantiated in the same order they are stacked.
    first_conv_block = [
        layers.Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(32, 32, 3)),
        layers.MaxPooling2D((2, 2)),
    ]
    second_conv_block = [
        layers.Conv2D(64, (3, 3), padding='same', activation='relu'),
        layers.MaxPooling2D((2, 2)),
    ]
    classifier_head = [
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]

    return models.Sequential(first_conv_block + second_conv_block + classifier_head)

def load_and_prepare_cifar10(batch_size=32, train_fraction=0.1, val_fraction=0.1):
    """
    Load and prepare a small portion of CIFAR-10 dataset.

    The training split is randomly permuted once; the first
    ``train_fraction`` of it becomes the training subset and the next
    ``val_fraction`` the validation subset, so the two never overlap.
    The full test split is always used.

    Args:
        batch_size: Batch size for datasets
        train_fraction: Fraction of training data to use (0.0 to 1.0)
        val_fraction: Fraction of training data to use for validation (0.0 to 1.0)

    Returns:
        train_ds, val_ds, test_ds: tf.data.Dataset objects

    Raises:
        ValueError: If a fraction lies outside [0, 1] or the two fractions
            together ask for more samples than the training split holds.
    """
    # Reject bad fractions up front: a negative value would otherwise be turned
    # into a negative slice bound and silently select most of the data.
    for name, fraction in (('train_fraction', train_fraction), ('val_fraction', val_fraction)):
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(f"{name} must be between 0.0 and 1.0, got {fraction}")

    # Define preprocessing function
    def preprocess(image, label):
        # Convert image to float32 and normalize to [0,1]
        image = tf.cast(image, tf.float32) / 255.0
        # Normalize with CIFAR-10 mean and std
        mean = tf.constant([0.4914, 0.4822, 0.4465])
        std = tf.constant([0.2470, 0.2435, 0.2616])
        image = (image - mean) / std
        return image, label

    # Load full CIFAR-10 dataset
    train_ds_full, test_ds = tfds.load(
        'cifar10',
        split=['train', 'test'],
        as_supervised=True,
        batch_size=-1  # Load all data at once for splitting
    )

    # Convert to numpy for easier subset creation
    train_images, train_labels = tfds.as_numpy(train_ds_full)
    test_images, test_labels = tfds.as_numpy(test_ds)

    # Calculate sizes for training subset
    total_train = len(train_images)
    train_size = int(total_train * train_fraction)
    val_size = int(total_train * val_fraction)
    if train_size + val_size > total_train:
        # Without this check the validation slice would just come out shorter
        # than requested (possibly empty) with no warning.
        raise ValueError(
            "train_fraction + val_fraction must not exceed 1.0, "
            f"got {train_fraction} + {val_fraction}"
        )

    # Randomly shuffle indices, then carve out two disjoint index ranges
    indices = np.random.permutation(total_train)
    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]

    # Create subsets
    train_images_subset = train_images[train_indices]
    train_labels_subset = train_labels[train_indices]
    val_images = train_images[val_indices]
    val_labels = train_labels[val_indices]

    print(f"Training samples: {len(train_images_subset)}")
    print(f"Validation samples: {len(val_images)}")
    print(f"Test samples: {len(test_images)}")

    # Create tf.data.Dataset objects
    train_ds = tf.data.Dataset.from_tensor_slices((train_images_subset, train_labels_subset))
    val_ds = tf.data.Dataset.from_tensor_slices((val_images, val_labels))
    test_ds = tf.data.Dataset.from_tensor_slices((test_images, test_labels))

    # Apply preprocessing and batching
    train_ds = train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    val_ds = val_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    test_ds = test_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)

    # Batch and prefetch
    train_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    val_ds = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return train_ds, val_ds, test_ds

def train_with_strategy(strategy, train_ds, val_ds, num_epochs=5):
    """Train the model using the specified distribution strategy.

    A fresh CNN is built and compiled inside ``strategy.scope()`` and then
    trained one epoch at a time so that each epoch can be timed. Only the
    training pass is timed; validation runs afterwards, outside the timed
    region, so the reported samples/second reflects training throughput.

    Args:
        strategy: tf.distribute strategy whose scope the model is created in.
        train_ds: Batched training dataset passed to ``model.fit``.
        val_ds: Batched validation dataset passed to ``model.evaluate``.
        num_epochs: Number of epochs to train (must be at least 1).

    Returns:
        Tuple ``(model, total_training_time, avg_samples_per_second,
        final_val_acc)`` where the time is in seconds and the accuracy is the
        validation accuracy after the last epoch.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.
    """
    if num_epochs < 1:
        # With zero epochs there is no timing data and no final accuracy.
        raise ValueError(f"num_epochs must be at least 1, got {num_epochs}")

    print(f"Training with strategy: {strategy}")

    with strategy.scope():
        # Create model
        model = create_cnn_model(num_classes=10)

        # Compile model
        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.001),
            loss=losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=['accuracy']
        )

        # Print model summary
        model.summary()

    # The dataset does not change between epochs, so count its samples once
    # instead of re-running the whole input pipeline after every epoch.
    samples_per_epoch = int(
        train_ds.unbatch().reduce(tf.constant(0, tf.int64), lambda count, _: count + 1).numpy()
    )

    # Initialize timing variables
    total_training_time = 0
    total_samples_processed = 0
    epoch_metrics = []

    # Custom training loop for timing
    for epoch in range(num_epochs):
        # Time the training pass only
        start_time = time.perf_counter()
        history = model.fit(
            train_ds,
            epochs=1,
            verbose=0
        )
        epoch_time = time.perf_counter() - start_time

        # Validation is measured separately so it does not distort throughput.
        # evaluate() returns [loss, accuracy] for the metrics compiled above.
        val_loss, val_acc = model.evaluate(val_ds, verbose=0)

        train_loss = history.history['loss'][0]
        train_acc = history.history['accuracy'][0]

        # Update totals
        total_training_time += epoch_time
        total_samples_processed += samples_per_epoch

        # Calculate samples per second
        samples_per_second = samples_per_epoch / epoch_time

        # Store metrics
        epoch_metrics.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'epoch_time': epoch_time,
            'samples_per_second': samples_per_second
        })

        print(f'Epoch {epoch + 1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
        print(f'Epoch Time: {epoch_time:.2f}s, Samples/sec: {samples_per_second:.2f}')
        print('-' * 50)

    # Calculate overall metrics
    avg_samples_per_second = total_samples_processed / total_training_time
    final_val_acc = epoch_metrics[-1]['val_acc']

    print(f"\nOverall Training Summary:")
    print(f"Total Training Time: {total_training_time:.2f}s")
    print(f"Average Samples/second: {avg_samples_per_second:.2f}")
    print(f"Final Validation Accuracy: {final_val_acc:.4f}")

    return model, total_training_time, avg_samples_per_second, final_val_acc

def test_model(model, test_ds):
    """Evaluate ``model`` on ``test_ds`` and report loss, accuracy and wall time.

    Args:
        model: Object whose ``evaluate(test_ds, verbose=0)`` yields a
            ``(loss, accuracy)`` pair.
        test_ds: Dataset handed to ``model.evaluate``.

    Returns:
        Tuple ``(test_loss, test_acc)``.
    """
    began_at = time.time()
    test_loss, test_acc = model.evaluate(test_ds, verbose=0)
    elapsed = time.time() - began_at

    report_lines = (
        f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}',
        f'Test Time: {elapsed:.2f}s',
    )
    for line in report_lines:
        print(line)

    return test_loss, test_acc

def profile_inference(models_dict, test_ds, batch_sizes=(1, 32, 64), num_iterations=100):
    """
    Profile inference performance on CPU and accelerator for different batch sizes.

    One batch is taken from ``test_ds`` and reused (sliced or tiled to the
    requested size) for every measurement, so all models see the same inputs.
    Each measurement does 10 untimed warm-up calls followed by
    ``num_iterations`` timed ``model.predict`` calls.

    Args:
        models_dict: Dictionary with keys 'cpu' and 'accelerator' containing trained models
        test_ds: tf.data.Dataset for test dataset
        batch_sizes: Iterable of batch sizes to profile
        num_iterations: Number of inference iterations for accurate timing

    Returns:
        Dictionary with profiling results, keyed by model name and then by
        batch size. The 'cpu' and 'accelerator' keys are always present.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1 or ``test_ds``
            yields no batch.
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be at least 1, got {num_iterations}")

    # Materialise once: the sizes are iterated for every model and again for
    # the summary table.
    batch_sizes = list(batch_sizes)

    results = {
        'cpu': {},
        'accelerator': {}
    }

    # Get a fixed batch from test dataset
    first_batch = next(iter(test_ds.take(1)), None)
    if first_batch is None:
        raise ValueError("test_ds is empty; at least one batch is needed for profiling")
    fixed_inputs, _ = first_batch

    for device_name, model in models_dict.items():
        print(f"\nProfiling inference on {device_name.upper()}...")
        # Accept model names other than the two pre-seeded ones.
        device_results = results.setdefault(device_name, {})

        for batch_size in batch_sizes:
            # Prepare batch
            if batch_size <= fixed_inputs.shape[0]:
                inputs = fixed_inputs[:batch_size]
            else:
                # Repeat to create larger batch (ceil division for the repeat count)
                repeats = (batch_size + fixed_inputs.shape[0] - 1) // fixed_inputs.shape[0]
                inputs = tf.tile(fixed_inputs, [repeats, 1, 1, 1])[:batch_size]

            # Warm-up
            for _ in range(10):
                _ = model.predict(inputs, verbose=0)

            # Timed inference. predict() hands back host-side results, so no
            # extra synchronisation call is placed inside the timed region.
            start_time = time.perf_counter()
            for _ in range(num_iterations):
                _ = model.predict(inputs, verbose=0)
            total_time_seconds = time.perf_counter() - start_time
            total_time_ms = total_time_seconds * 1000

            # Calculate metrics
            avg_time_per_batch_ms = total_time_ms / num_iterations
            if total_time_seconds > 0:
                samples_per_second = (batch_size * num_iterations) / total_time_seconds
            else:
                samples_per_second = float('inf')

            device_results[batch_size] = {
                'avg_inference_time_ms': avg_time_per_batch_ms,
                'samples_per_second': samples_per_second,
                'total_time_seconds': total_time_seconds,
                'num_iterations': num_iterations
            }

            print(f"  Batch size {batch_size}:")
            print(f"    Avg inference time: {avg_time_per_batch_ms:.2f} ms")
            print(f"    Samples/second: {samples_per_second:.2f}")

    # Print comparison summary (needs both a CPU and an accelerator run)
    if results['cpu'] and results['accelerator']:
        print("\n" + "="*60)
        print("INFERENCE PROFILING SUMMARY")
        print("="*60)
        print(f"{'Batch Size':<12} {'Metric':<20} {'CPU':<15} {'Accelerator':<15} {'Speedup':<10}")
        print("-"*60)

        for batch_size in batch_sizes:
            cpu_time = results['cpu'][batch_size]['avg_inference_time_ms']
            acc_time = results['accelerator'][batch_size]['avg_inference_time_ms']
            speedup_time = cpu_time / acc_time if acc_time > 0 else float('inf')

            cpu_throughput = results['cpu'][batch_size]['samples_per_second']
            acc_throughput = results['accelerator'][batch_size]['samples_per_second']
            speedup_throughput = acc_throughput / cpu_throughput if cpu_throughput > 0 else float('inf')

            print(f"{batch_size:<12} {'Time (ms)':<20} {cpu_time:<15.2f} {acc_time:<15.2f} {speedup_time:<10.2f}x")
            print(f"{batch_size:<12} {'Samples/sec':<20} {cpu_throughput:<15.2f} {acc_throughput:<15.2f} {speedup_throughput:<10.2f}x")
            print("-"*60)

    return results

def plot_inference_results(profile_results):
    """
    Draw CPU-vs-accelerator bar charts and print the matching comparison table.

    The left panel shows the average inference time per batch, the right
    panel the throughput; both have one pair of bars per batch size.

    Args:
        profile_results: Dictionary returned by profile_inference() function
    """
    cpu_stats = profile_results['cpu']
    acc_stats = profile_results['accelerator']
    batch_sizes = list(cpu_stats.keys())

    def column(stats, field):
        # One value per batch size, in batch_sizes order.
        return [stats[bs][field] for bs in batch_sizes]

    # Extract data
    cpu_times = column(cpu_stats, 'avg_inference_time_ms')
    acc_times = column(acc_stats, 'avg_inference_time_ms')
    cpu_throughput = column(cpu_stats, 'samples_per_second')
    acc_throughput = column(acc_stats, 'samples_per_second')

    # Speedups: time ratio is CPU/accelerator, throughput ratio is the inverse
    time_speedups = [cpu / acc for cpu, acc in zip(cpu_times, acc_times)]
    throughput_speedups = [acc / cpu for cpu, acc in zip(cpu_throughput, acc_throughput)]

    # Create figure with subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    fig.suptitle('Inference Performance: CPU vs Accelerator', fontsize=16, fontweight='bold')

    x = np.arange(len(batch_sizes))
    width = 0.35
    tick_labels = [f'BS={bs}' for bs in batch_sizes]

    def draw_panel(ax, cpu_values, acc_values, ylabel, title, label_spec):
        # Grouped bars: CPU to the left of each tick, accelerator to the right.
        cpu_bars = ax.bar(x - width/2, cpu_values, width, label='CPU', color='skyblue', edgecolor='navy')
        acc_bars = ax.bar(x + width/2, acc_values, width, label='Accelerator', color='lightcoral', edgecolor='darkred')

        ax.set_xlabel('Batch Size', fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14)
        ax.set_xticks(x)
        ax.set_xticklabels(tick_labels)
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Add value labels on bars
        for bar in (*cpu_bars, *acc_bars):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    format(height, label_spec), ha='center', va='bottom', fontsize=9)

    # Plot 1: Inference Time (bar chart)
    draw_panel(ax1, cpu_times, acc_times,
               'Inference Time (ms)', 'Average Inference Time per Batch', '.1f')
    # Plot 2: Throughput (bar chart)
    draw_panel(ax2, cpu_throughput, acc_throughput,
               'Samples/Second', 'Throughput', '.0f')

    plt.tight_layout()
    plt.show()

    # Print detailed summary table
    rule = "="*70
    print("\n" + rule)
    print("DETAILED INFERENCE PERFORMANCE COMPARISON")
    print(rule)
    print(f"{'Batch Size':<12} {'Metric':<20} {'CPU':<15} {'Accelerator':<15} {'Speedup':<10}")
    print("-"*70)

    rows = zip(batch_sizes, cpu_times, acc_times, time_speedups,
               cpu_throughput, acc_throughput, throughput_speedups)
    for bs, cpu_ms, acc_ms, time_gain, cpu_rate, acc_rate, rate_gain in rows:
        print(f"{bs:<12} {'Time (ms)':<20} {cpu_ms:<15.2f} {acc_ms:<15.2f} {time_gain:<10.2f}x")
        print(f"{bs:<12} {'Samples/sec':<20} {cpu_rate:<15.0f} {acc_rate:<15.0f} {rate_gain:<10.2f}x")
        print("-"*70)

# Main execution
# End-to-end TensorFlow benchmark: load data, train/test one model on the CPU,
# train/test a second model under the module-level `strategy`, then profile and
# plot inference for both models.
print("Loading and preparing CIFAR-10 dataset...")
# 10% of the training split for training and another 10% for validation.
train_ds, val_ds, test_ds = load_and_prepare_cifar10(
    batch_size=32,
    train_fraction=0.1,
    val_fraction=0.1
)

# Print CIFAR-10 class names
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 
           'dog', 'frog', 'horse', 'ship', 'truck']
print(f"Classes: {classes}")

# Train on CPU
print("\n" + "="*60)
print("TRAINING ON CPU")
print("="*60)
# A dedicated CPU strategy, independent of whichever accelerator was detected.
cpu_strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
cpu_model, cpu_time, cpu_samples_per_sec, cpu_val_acc = train_with_strategy(
    cpu_strategy, train_ds, val_ds, num_epochs=5
)

# Test CPU model
print("\nTesting CPU model:")
cpu_test_loss, cpu_test_acc = test_model(cpu_model, test_ds)

# Train on accelerator (GPU or TPU)
print("\n" + "="*60)
print("TRAINING ON ACCELERATOR")
print("="*60)
# `strategy` comes from the accelerator-detection code at the top of the file.
# When no TPU/GPU was found it is a CPU OneDeviceStrategy, so this run then
# also executes on the CPU.
accelerator_model, accelerator_time, accelerator_samples_per_sec, accelerator_val_acc = train_with_strategy(
    strategy, train_ds, val_ds, num_epochs=5
)

# Test accelerator model
print("\nTesting Accelerator model:")
accelerator_test_loss, accelerator_test_acc = test_model(accelerator_model, test_ds)

# Profile inference
# The dictionary keys must be 'cpu' and 'accelerator': profile_inference and
# plot_inference_results look the results up under exactly these names.
models_for_profiling = {
    'cpu': cpu_model,
    'accelerator': accelerator_model
}
profile_results = profile_inference(models_for_profiling, test_ds, batch_sizes=[1, 32, 64], num_iterations=100)

# Plot results
plot_inference_results(profile_results)

# Lab 4a: Hardware Benchmarking (HW Accelerators Perspective)
## Hardware for Machine Learning Course

This notebook benchmarks different hardware platforms from an architectural perspective.
Part-1 covers:
1. Environment setup, Model and dataset preparation
2. CPU performance benchmarking
3. GPU performance benchmarking

Part-2 covers:
1. 

The lab will explore how a neural network model performs across different hardware platforms and how it can be optimized for specific deployment scenarios.

## PART-1: LATENCY BENCHMARKING
### PART 1-1: ENVIRONMENT SETUP

First, we'll set up our environment by importing necessary libraries and checking available hardware.

In [None]:
# Include the needed libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
import torchvision
import torchvision.transforms as transforms
import time
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Check for accelerator (GPU or TPU)
# Sets three module-level names used by later cells:
#   accelerator_found - True once a CUDA GPU or an XLA device has been selected
#   using_tpu         - True only on the XLA (TPU) path
#   cuda_device       - the selected torch device; despite its name it holds
#                       the XLA device when the TPU path is taken
accelerator_found = False
using_tpu = False

# Check for GPU
if torch.cuda.is_available():
    accelerator_found = True
    cuda_device = torch.device('cuda')
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
else:
    # No CUDA GPU: try torch_xla, which is only importable when it is installed.
    try:
        import torch_xla
        import torch_xla.core.xla_model as xm
        import torch_xla.distributed.parallel_loader as pl
        # Check for TPU
        # NOTE(review): only ImportError is handled below; an error raised by
        # xm.xla_device() itself would propagate - confirm that is intended.
        cuda_device = xm.xla_device()
        using_tpu = True
        accelerator_found = True
        print(f"TPU detected: {cuda_device}")
    except ImportError:
        # torch_xla is not installed; fall through to the check below.
        pass

# Unlike a CPU fallback, this notebook refuses to continue without an accelerator.
if not accelerator_found:
    raise RuntimeError("No accelerator (GPU or TPU) detected! Please choose a GPU or TPU runtime in Colab.")


### PART 1-2: MODEL, DATASET PREPARATION, AND HELPER FUNCTIONS
Now we'll create our model architectures and prepare the CIFAR-10 dataset for training and evaluation.

In [None]:
# CREATE A CNN MODEL
# PREPARE A SUBSET OF THE CIFAR-10 DATASET
class CNNModel(nn.Module):
    """Small convolutional classifier for 3-channel 32x32 images.

    Two conv(3x3, padding 1) + ReLU + 2x2 max-pool stages (32 then 64
    channels) feed a flatten / 128-unit / dropout(0.3) / ``num_classes``
    head. The forward pass returns raw logits; no softmax is applied
    because ``nn.CrossEntropyLoss`` handles that itself.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Feature extractor: each pooling halves the spatial size (32 -> 16 -> 8).
        feature_stages = [
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.conv_layers = nn.Sequential(*feature_stages)

        # Classifier head on the flattened 64 x 8 x 8 feature map.
        flattened_features = 64 * 8 * 8
        head_stages = [
            nn.Flatten(),
            nn.Linear(flattened_features, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.fc_layers = nn.Sequential(*head_stages)

    def forward(self, x):
        """Return class logits for the input batch ``x``."""
        return self.fc_layers(self.conv_layers(x))
    
def load_and_prepare_cifar10(batch_size=32, train_fraction=0.1, val_fraction=0.1):
    """
    Load and prepare a small portion of CIFAR-10 dataset.

    The training set is randomly permuted once; the first ``train_fraction``
    of it becomes the training subset and the next ``val_fraction`` the
    validation subset, so the two never overlap. The full test set is
    always used.

    Args:
        batch_size: Batch size for dataloaders
        train_fraction: Fraction of training data to use (0.0 to 1.0)
        val_fraction: Fraction of training data to use for validation (0.0 to 1.0)

    Returns:
        train_loader, val_loader, test_loader: DataLoaders for each dataset split

    Raises:
        ValueError: If a fraction lies outside [0, 1] or the two fractions
            together ask for more samples than the training set holds.
    """
    # Reject bad fractions before downloading anything: a negative value would
    # otherwise become a negative slice bound and silently select most of the data.
    for name, fraction in (('train_fraction', train_fraction), ('val_fraction', val_fraction)):
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(f"{name} must be between 0.0 and 1.0, got {fraction}")

    # Define transforms for data preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),  # Converts to [0, 1] range
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))  # CIFAR-10 mean and std
    ])

    # Download and load training dataset
    full_trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform
    )

    # Download and load test dataset
    testset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform
    )

    # Calculate sizes for training subset
    total_train = len(full_trainset)
    train_size = int(total_train * train_fraction)
    val_size = int(total_train * val_fraction)
    if train_size + val_size > total_train:
        # Without this check the validation slice would just come out shorter
        # than requested (possibly empty) with no warning.
        raise ValueError(
            "train_fraction + val_fraction must not exceed 1.0, "
            f"got {train_fraction} + {val_fraction}"
        )

    # Create indices for random subset; plain Python ints keep Subset's
    # per-item lookups free of tensor indexing.
    indices = torch.randperm(total_train).tolist()
    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]

    # Create subset datasets
    trainset = Subset(full_trainset, train_indices)
    valset = Subset(full_trainset, val_indices)

    # Create data loaders (only the training loader reshuffles every epoch)
    train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=2)
    test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

    print(f"Training samples: {len(trainset)}")
    print(f"Validation samples: {len(valset)}")
    print(f"Test samples: {len(testset)}")

    return train_loader, val_loader, test_loader


In [None]:

def train_one_epoch(model, train_loader, criterion, optimizer, device, using_tpu=False):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train (switched to training mode here).
        train_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to.
        using_tpu: When True the loader is wrapped in torch_xla's
            ``MpDeviceLoader`` and the step goes through ``xm.optimizer_step``.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and
        the fraction of correctly classified samples.
    """
    model.train()

    # On TPU the batches are fed through torch_xla's device loader.
    batches = train_loader
    if using_tpu:
        import torch_xla.distributed.parallel_loader as pl
        batches = pl.MpDeviceLoader(train_loader, device)

    loss_sum = 0.0
    hits = 0
    seen = 0

    for batch_inputs, batch_labels in batches:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Clear gradients, forward, backward
        optimizer.zero_grad()
        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()

        # The parameter update goes through torch_xla on TPU
        if using_tpu:
            import torch_xla.core.xla_model as xm
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()

        # Weight the batch loss by the batch size so the final division by
        # the sample count gives a per-sample mean.
        loss_sum += batch_loss.item() * batch_inputs.size(0)
        predictions = torch.max(logits, 1).indices
        seen += batch_labels.size(0)
        hits += (predictions == batch_labels).sum().item()

    return loss_sum / seen, hits / seen

def train(train_loader, val_loader, criterion, device, num_epochs=5):
    """Train a fresh ``CNNModel`` on ``device`` and time every epoch.

    Only the training pass of each epoch is timed (CUDA events on a GPU,
    wall-clock time otherwise); validation runs after the timer has stopped.

    Args:
        train_loader: DataLoader with the training samples.
        val_loader: DataLoader with the validation samples.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: torch device to train on.
        num_epochs: Number of epochs to run.

    Returns:
        Tuple ``(model, total_training_time, avg_samples_per_second,
        final_val_acc)`` with the time in seconds and the validation
        accuracy of the last epoch.
    """
    print(f"Using device: {device}")

    # Which timing / stepping path applies to this device
    using_tpu = device.type == 'xla'
    on_cuda = device.type == 'cuda'

    # Model and optimizer
    model = CNNModel(num_classes=10).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Running totals
    total_training_time = 0
    total_samples_processed = 0
    epoch_metrics = []

    for epoch in range(num_epochs):
        # Start the clock: CUDA events on GPU, Python time for CPU and TPU
        if on_cuda:
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record()
        else:
            start_time = time.time()

        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device, using_tpu)

        # Stop the clock
        if on_cuda:
            end_event.record()
            torch.cuda.synchronize()
            epoch_time = start_event.elapsed_time(end_event) / 1000  # ms -> seconds
        else:
            epoch_time = time.time() - start_time

        # Bookkeeping for the overall summary
        epoch_samples = len(train_loader.dataset)
        total_training_time += epoch_time
        total_samples_processed += epoch_samples
        samples_per_second = epoch_samples / epoch_time

        val_loss, val_acc = validate(model, val_loader, criterion, device, using_tpu)

        epoch_number = epoch + 1
        epoch_metrics.append({
            'epoch': epoch_number,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'epoch_time': epoch_time,
            'samples_per_second': samples_per_second
        })
        print(f'Epoch {epoch_number}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
        print(f'Epoch Time: {epoch_time:.2f}s, Samples/sec: {samples_per_second:.2f}')
        print('-' * 50)

    # Overall figures across all epochs
    avg_samples_per_second = total_samples_processed / total_training_time
    final_val_acc = epoch_metrics[-1]['val_acc']  # accuracy of the last epoch

    print(f"\nOverall Training Summary:")
    print(f"Total Training Time: {total_training_time:.2f}s")
    print(f"Average Samples/second: {avg_samples_per_second:.2f}")
    print(f"Final Validation Accuracy: {final_val_acc:.4f}")

    return model, total_training_time, avg_samples_per_second, final_val_acc

def validate(model, val_loader, criterion, device, using_tpu=False):
    """Measure loss and accuracy of ``model`` on ``val_loader`` without training.

    Args:
        model: Network to evaluate (switched to eval mode here).
        val_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device each batch is moved to.
        using_tpu: When True the loader is wrapped in torch_xla's
            ``MpDeviceLoader``.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and
        the fraction of correctly classified samples.
    """
    model.eval()

    # On TPU the batches are fed through torch_xla's device loader.
    batches = val_loader
    if using_tpu:
        import torch_xla.distributed.parallel_loader as pl
        batches = pl.MpDeviceLoader(val_loader, device)

    loss_sum = 0.0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in batches:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            batch_loss = criterion(logits, batch_labels)

            # Weight by batch size so the final mean is per sample.
            loss_sum += batch_loss.item() * batch_inputs.size(0)
            predictions = torch.max(logits, 1).indices
            seen += batch_labels.size(0)
            hits += (predictions == batch_labels).sum().item()

    return loss_sum / seen, hits / seen

def test(model, test_loader, criterion, device):
    """Evaluate ``model`` on the test set and print the result.

    Args:
        model: Trained network.
        test_loader: Iterable of ``(inputs, labels)`` test batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device to evaluate on; an ``'xla'`` device type selects the
            TPU code path of ``validate``.

    Returns:
        Tuple ``(test_loss, test_acc)``.
    """
    on_tpu = device.type == 'xla'

    model.eval()
    outcome = validate(model, test_loader, criterion, device, on_tpu)
    test_loss, test_acc = outcome

    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')
    return test_loss, test_acc


### PREPARING BENCHMARKING FOR BOTH THE CPU AND THE GPU

In [None]:

# Load data (using 20% of training data: 10% for training, 10% for validation)
# The three loaders are shared by the CPU and the accelerator runs below.
train_loader, val_loader, test_loader = load_and_prepare_cifar10(
    batch_size=32, 
    train_fraction=0.1,  # 10% of training data for training
    val_fraction=0.1      # 10% of training data for validation
)
# CNNModel returns raw logits, which is the input CrossEntropyLoss is applied to.
criterion = nn.CrossEntropyLoss()

# Print CIFAR-10 class names for reference
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 
            'dog', 'frog', 'horse', 'ship', 'truck']
print(f"Classes: {classes}")

### PART 1-3: BENCHMARKING THE CPU TRAINING
put some words

In [None]:
# BENCHMARK THE EVALUATION ON BS 1 AND 64
# THE METRICS ARE LATENCY, SAMPLES/S, EVALUATION ACCURACY 


In [None]:
# Train the cnn model using cpu
# train() builds a fresh CNNModel on the given device and returns the trained
# model together with total time, average samples/second and final val accuracy.
cpu_device = torch.device('cpu')
cpu_trained_model, cpu_time, cpu_samples_per_sec, cpu_val_acc = train(
    train_loader, val_loader, criterion, cpu_device
)

# Final test
cpu_test_loss, cpu_test_acc = test(cpu_trained_model, test_loader, criterion, cpu_device)

### PART 1-4: BENCHMARKING THE ACCELERATOR TRAINING 
put some words

In [None]:
# BENCHMARK  THE EVALUATION ON BS 1 AND 64
# THE METRICS ARE LATENCY, SAMPLES/S, EVALUATION ACCURACY 

In [None]:

# Train the cnn model using accelerator (GPU or TPU)
# `cuda_device` is set by the accelerator-detection cell; on the TPU path it
# holds the XLA device rather than a CUDA device.
accelerator_trained_model, accelerator_time, accelerator_samples_per_sec, accelerator_val_acc = train(
    train_loader, val_loader, criterion, cuda_device
)

# Final test
accelerator_test_loss, accelerator_test_acc = test(accelerator_trained_model, test_loader, criterion, cuda_device)


### PART 1-5: BENCHMARKING THE CPU AND THE ACCELERATOR INFERENCE
put some words

In [None]:
def profile_inference(models_dict, test_loader, batch_sizes=(1, 32, 64), num_iterations=100):
    """
    Profile inference performance on CPU and accelerator for different batch sizes.

    The first batch of ``test_loader`` is reused (sliced or repeated to the
    requested size) for every measurement, so all models see the same inputs.
    Each model is profiled on the device its parameters live on and is left
    in eval mode. Every measurement does 10 untimed warm-up passes followed
    by ``num_iterations`` timed forward passes under ``torch.no_grad()``.

    Args:
        models_dict: Dictionary with keys 'cpu' and 'accelerator' containing trained models
        test_loader: DataLoader for test dataset
        batch_sizes: Iterable of batch sizes to profile
        num_iterations: Number of inference iterations for accurate timing

    Returns:
        Dictionary with profiling results, keyed by model name and then by
        batch size. The 'cpu' and 'accelerator' keys are always present.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be at least 1, got {num_iterations}")

    # Materialise once: the sizes are iterated for every model and again for
    # the summary table.
    batch_sizes = list(batch_sizes)

    results = {
        'cpu': {},
        'accelerator': {}
    }

    # Get a fixed batch from test loader for consistent profiling
    fixed_inputs, _ = next(iter(test_loader))  # labels are not needed for inference
    available = len(fixed_inputs)

    def run_forward_passes(model, inputs, count, xm):
        # Run `count` forward passes. On XLA (xm is not None) every pass is
        # dispatched with mark_step() and the device is drained at the end,
        # so the caller's timer covers the actual execution and not just the
        # lazy graph construction.
        with torch.no_grad():
            for _ in range(count):
                _ = model(inputs)
                if xm is not None:
                    xm.mark_step()
        if xm is not None:
            xm.wait_device_ops()

    for device_name, model in models_dict.items():
        device = next(model.parameters()).device
        print(f"\nProfiling inference on {device_name.upper()} ({device})...")
        # Accept model names other than the two pre-seeded ones.
        device_results = results.setdefault(device_name, {})

        xm = None
        if device.type == 'xla':
            import torch_xla.core.xla_model as xm

        model.eval()

        for batch_size in batch_sizes:
            # Prepare batch
            if batch_size <= available:
                # Use subset of the fixed batch if possible
                inputs = fixed_inputs[:batch_size].to(device)
            else:
                # Need to create a larger batch by repeating (ceil division)
                repeats = (batch_size + available - 1) // available
                inputs = fixed_inputs.repeat(repeats, 1, 1, 1)[:batch_size].to(device)

            # Warm-up
            run_forward_passes(model, inputs, 10, xm)

            # Timed inference
            if device.type == 'cuda':
                # Use CUDA events for GPU timing
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)

                start_event.record()
                run_forward_passes(model, inputs, num_iterations, xm)
                end_event.record()

                # elapsed_time is only valid once both events have completed
                torch.cuda.synchronize()
                total_time_ms = start_event.elapsed_time(end_event)  # Returns time in milliseconds
                total_time_seconds = total_time_ms / 1000
            else:
                # Monotonic high-resolution clock for CPU and TPU
                start_time = time.perf_counter()
                run_forward_passes(model, inputs, num_iterations, xm)
                total_time_seconds = time.perf_counter() - start_time
                total_time_ms = total_time_seconds * 1000

            # Calculate metrics
            avg_time_per_batch_ms = total_time_ms / num_iterations
            if total_time_seconds > 0:
                samples_per_second = (batch_size * num_iterations) / total_time_seconds
            else:
                samples_per_second = float('inf')

            device_results[batch_size] = {
                'avg_inference_time_ms': avg_time_per_batch_ms,
                'samples_per_second': samples_per_second,
                'total_time_seconds': total_time_seconds,
                'num_iterations': num_iterations
            }

            print(f"  Batch size {batch_size}:")
            print(f"    Avg inference time: {avg_time_per_batch_ms:.2f} ms")
            print(f"    Samples/second: {samples_per_second:.2f}")

    # Print comparison summary (needs both a CPU and an accelerator run)
    if results['cpu'] and results['accelerator']:
        print("\n" + "="*60)
        print("INFERENCE PROFILING SUMMARY")
        print("="*60)
        print(f"{'Batch Size':<12} {'Metric':<20} {'CPU':<15} {'Accelerator':<15} {'Speedup':<10}")
        print("-"*60)

        for batch_size in batch_sizes:
            cpu_time = results['cpu'][batch_size]['avg_inference_time_ms']
            acc_time = results['accelerator'][batch_size]['avg_inference_time_ms']
            speedup_time = cpu_time / acc_time if acc_time > 0 else float('inf')

            cpu_throughput = results['cpu'][batch_size]['samples_per_second']
            acc_throughput = results['accelerator'][batch_size]['samples_per_second']
            speedup_throughput = acc_throughput / cpu_throughput if cpu_throughput > 0 else float('inf')

            print(f"{batch_size:<12} {'Time (ms)':<20} {cpu_time:<15.2f} {acc_time:<15.2f} {speedup_time:<10.2f}x")
            print(f"{batch_size:<12} {'Samples/sec':<20} {cpu_throughput:<15.2f} {acc_throughput:<15.2f} {speedup_throughput:<10.2f}x")
            print("-"*60)

    return results

# After training both models, you can call the profiling method like this:
# The keys must be 'cpu' and 'accelerator': profile_inference's summary table
# looks the results up under exactly these names.
models_for_profiling = {
    'cpu': cpu_trained_model,
    'accelerator': accelerator_trained_model
}
profile_results = profile_inference(models_for_profiling, test_loader, batch_sizes=[1, 32, 64], num_iterations=100)

In [None]:
def _speedup_ratio(numerator, denominator):
    """Return numerator / denominator, or infinity when the denominator is not positive.

    Uses the same guard as the summary table printed by profile_inference(),
    so a zero timing or throughput value does not raise ZeroDivisionError.
    """
    return numerator / denominator if denominator > 0 else float('inf')


def _draw_grouped_bars(ax, x, cpu_values, acc_values, tick_labels,
                       ylabel, title, value_format, width=0.35):
    """Draw paired CPU/Accelerator bars on ``ax`` and annotate each bar with its value.

    Args:
        ax: Matplotlib axes to draw on.
        x: Array of group positions, one per batch size.
        cpu_values: CPU measurements, one per batch size.
        acc_values: Accelerator measurements, one per batch size.
        tick_labels: X tick label text, one per batch size.
        ylabel: Y axis label.
        title: Subplot title.
        value_format: Format spec (e.g. '.1f') used for the label above each bar.
        width: Width of a single bar; the two bars of a group sit side by side.
    """
    bar_groups = [
        ax.bar(x - width/2, cpu_values, width, label='CPU', color='skyblue', edgecolor='navy'),
        ax.bar(x + width/2, acc_values, width, label='Accelerator', color='lightcoral', edgecolor='darkred'),
    ]

    ax.set_xlabel('Batch Size', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Write the numeric value centered just above the top edge of every bar
    for bars in bar_groups:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    format(height, value_format), ha='center', va='bottom', fontsize=9)


def plot_inference_results(profile_results):
    """
    Plot inference profiling results comparing CPU vs Accelerator.

    Shows two side-by-side bar charts (average inference time per batch and
    throughput), then prints a table with both metrics and their speedups.

    Args:
        profile_results: Dictionary returned by profile_inference() function.
            Must contain the keys 'cpu' and 'accelerator', each mapping a
            batch size to a dict with 'avg_inference_time_ms' and
            'samples_per_second'. Batch sizes are taken from the 'cpu' entry.

    Raises:
        KeyError: If a device entry, batch size, or metric key is missing.
    """
    cpu_results = profile_results['cpu']
    acc_results = profile_results['accelerator']
    batch_sizes = list(cpu_results.keys())

    # Extract data
    cpu_times = [cpu_results[bs]['avg_inference_time_ms'] for bs in batch_sizes]
    acc_times = [acc_results[bs]['avg_inference_time_ms'] for bs in batch_sizes]

    cpu_throughput = [cpu_results[bs]['samples_per_second'] for bs in batch_sizes]
    acc_throughput = [acc_results[bs]['samples_per_second'] for bs in batch_sizes]

    # Calculate speedups; a non-positive denominator yields inf instead of raising
    time_speedups = [_speedup_ratio(cpu, acc) for cpu, acc in zip(cpu_times, acc_times)]
    throughput_speedups = [_speedup_ratio(acc, cpu) for cpu, acc in zip(cpu_throughput, acc_throughput)]

    # One row, two columns: plt.subplots returns the axes as a 1D array
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    fig.suptitle('Inference Performance: CPU vs Accelerator', fontsize=16, fontweight='bold')

    x = np.arange(len(batch_sizes))
    tick_labels = [f'BS={bs}' for bs in batch_sizes]

    # Plot 1: Inference Time (bar chart)
    _draw_grouped_bars(axes[0], x, cpu_times, acc_times, tick_labels,
                       'Inference Time (ms)', 'Average Inference Time per Batch', '.1f')

    # Plot 2: Throughput (bar chart)
    _draw_grouped_bars(axes[1], x, cpu_throughput, acc_throughput, tick_labels,
                       'Samples/Second', 'Throughput', '.0f')

    plt.tight_layout()
    plt.show()

    # Print detailed summary table
    print("\n" + "="*70)
    print("DETAILED INFERENCE PERFORMANCE COMPARISON")
    print("="*70)
    print(f"{'Batch Size':<12} {'Metric':<20} {'CPU':<15} {'Accelerator':<15} {'Speedup':<10}")
    print("-"*70)

    for i, bs in enumerate(batch_sizes):
        print(f"{bs:<12} {'Time (ms)':<20} {cpu_times[i]:<15.2f} {acc_times[i]:<15.2f} {time_speedups[i]:<10.2f}x")
        print(f"{bs:<12} {'Samples/sec':<20} {cpu_throughput[i]:<15.0f} {acc_throughput[i]:<15.0f} {throughput_speedups[i]:<10.2f}x")
        print("-"*70)


# Visualize the profiling results. This requires `profile_results` to already
# exist, i.e. profile_inference() must have been run first, for example:
#   profile_results = profile_inference(models_for_profiling, test_loader, batch_sizes=[1, 32, 64], num_iterations=100)
plot_inference_results(profile_results)

## Part-2: MODEL QUANTIZATION AND PRUNING
In this part, we will quantize and prune a model, then benchmark its performance.

### PART 2-1: BUILDING HELPER FUNCTIONS


In [None]:
Build one helper function for INT8 quantization and another for pruning.
Decide whether you will retrain, and whether you will reuse the previously trained model or construct a new one.

### PART 2-2: BENCHMARKING THE GPU WITH INT8 QUANTIZATION

In [None]:
REPEAT THE ACCURACY AND INFERENCE BENCHMARKING.

### PART 2-3: BENCHMARKING THE GPU WITH STRUCTURAL PRUNING

In [None]:
REPEAT THE ACCURACY AND INFERENCE BENCHMARKING.

## Part-3: DEPLOYMENT FORMAT CONVERSION
In this section, we'll convert our models to different formats suitable for various deployment scenarios, such as ONNX for cross-platform compatibility, SavedModel for TensorFlow Serving, and TensorFlow.js for web deployment.


WILL WE USE TENSORFLOW OR PYTORCH FOR THE CONVERSION?