# Lab 4a: Hardware Benchmarking (HW Accelerators Perspective)
## Hardware for Machine Learning Course

This notebook benchmarks different hardware platforms from an architectural perspective.
Part-1 covers:
1. Environment setup, Model and dataset preparation
2. CPU performance benchmarking
3. GPU performance benchmarking

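Part-2 covers:
1. Helper functions for INT8 quantization and pruning
2. GPU benchmarking with INT8 quantization
3. GPU benchmarking with structural pruning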

The lab explores how a neural network model performs across different hardware platforms and how it can be optimized for specific deployment scenarios.

## PART-1: LATENCY BENCHMARKING
### PART 1-1: ENVIRONMENT SETUP

First, we'll set up our environment by importing necessary libraries and checking available hardware.

In [None]:
# Include the needed libraries
# assert if cuda was not chosen
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
import torchvision
import torchvision.transforms as transforms
import time

### PART 1-2: MODEL, DATASET PREPARATION, AND HELPER FUNCTIONS
Now we'll create our model architectures and prepare the CIFAR-10 dataset for training and evaluation.

In [None]:
# CREATE A CNN MODEL
# PREPARE A SUBSET OF THE CIFAR-10 DATASET
class CNNModel(nn.Module):
    """Two-stage convolutional classifier followed by a small MLP head.

    The fully connected head is sized for 3-channel 32x32 inputs: the 3x3
    convolutions use padding=1 and keep the spatial size, while each of the two
    2x2 max-pooling stages halves it (32 -> 16 -> 8), leaving a 64 x 8 x 8
    feature map to flatten.

    Args:
        num_classes: Number of output logits produced per sample.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Feature extractor: two (conv -> ReLU -> 2x2 max-pool) stages.
        feature_stages = [
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.conv_layers = nn.Sequential(*feature_stages)

        # Classifier head. It emits raw logits: no softmax is applied here
        # because CrossEntropyLoss already includes it.
        flattened_features = 64 * 8 * 8
        head = [
            nn.Flatten(),
            nn.Linear(flattened_features, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.fc_layers = nn.Sequential(*head)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        return self.fc_layers(self.conv_layers(x))
    
def load_and_prepare_cifar10(batch_size=32, train_fraction=0.1, val_fraction=0.1,
                             num_workers=2, seed=None):
    """
    Load CIFAR-10 and build dataloaders over a random subset of it.

    The training and validation subsets are disjoint slices of one random
    permutation of the official training split; the test loader always covers
    the full official test split.

    Args:
        batch_size: Batch size for dataloaders
        train_fraction: Fraction of the training split to train on (0.0 to 1.0)
        val_fraction: Fraction of the training split to validate on (0.0 to 1.0)
        num_workers: Number of worker processes used by each dataloader
        seed: Optional integer seed for the subset permutation. When None the
            global torch RNG is used, so the subset differs between runs.
            The seed does not affect the per-epoch shuffling of train_loader.

    Returns:
        train_loader, val_loader, test_loader: DataLoaders for each dataset split

    Raises:
        ValueError: If a fraction lies outside [0, 1] or the two fractions
            together exceed 1 (the subsets could not be disjoint).
    """
    # Validate before touching the dataset so bad arguments fail fast instead of
    # silently producing truncated or empty splits.
    for name, fraction in (("train_fraction", train_fraction),
                           ("val_fraction", val_fraction)):
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(f"{name} must be within [0, 1], got {fraction}")
    # Small tolerance so sums such as 0.9 + 0.1 are not rejected by float rounding.
    if train_fraction + val_fraction > 1.0 + 1e-9:
        raise ValueError(
            "train_fraction + val_fraction must not exceed 1, got "
            f"{train_fraction} + {val_fraction}"
        )

    # Define transforms for data preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),  # Converts to [0, 1] range
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))  # CIFAR-10 mean and std
    ])

    # Download and load training dataset
    full_trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform
    )

    # Download and load test dataset
    testset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform
    )

    # Calculate sizes for the training and validation subsets
    total_train = len(full_trainset)
    train_size = int(total_train * train_fraction)
    val_size = int(total_train * val_fraction)

    # Draw one permutation and slice it so the two subsets never overlap.
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
    # Subset expects a sequence of ints, so convert the index tensor to a list.
    indices = torch.randperm(total_train, generator=generator).tolist()
    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]

    # Create subset datasets
    trainset = Subset(full_trainset, train_indices)
    valset = Subset(full_trainset, val_indices)

    # Create data loaders (only the training loader is shuffled)
    train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    print(f"Training samples: {len(trainset)}")
    print(f"Validation samples: {len(valset)}")
    print(f"Test samples: {len(testset)}")

    return train_loader, val_loader, test_loader

In [None]:
def train_one_epoch(model, train_loader, criterion, optimizer, device):
    """Train the model for one pass over ``train_loader``.

    Args:
        model: Module to optimize; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (epoch_loss, epoch_acc): the sample-weighted mean loss and the fraction
        of correctly classified samples. Both are NaN when the loader yields
        no samples, instead of raising ZeroDivisionError.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Re-weight the loss by the batch size so a smaller final batch does
        # not skew the epoch average (assumes criterion returns a batch mean,
        # as the default nn.CrossEntropyLoss does).
        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    if total == 0:
        # No samples were seen, so the metrics are undefined.
        return float('nan'), float('nan')

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

def train(train_loader, val_loader, criterion, device, num_epochs=5):
    """Train a fresh ``CNNModel`` and report per-epoch timing and accuracy.

    Only the training pass of each epoch is timed; validation runs outside the
    timed region. CUDA devices are timed with CUDA events, every other device
    with a monotonic wall clock.

    Args:
        train_loader: DataLoader over the training samples. Its
            ``len(train_loader.dataset)`` is used as the per-epoch sample count.
        val_loader: DataLoader evaluated after every epoch.
        criterion: Loss callable passed to the training and validation passes.
        device: ``torch.device`` or device string (e.g. ``'cpu'``, ``'cuda'``).
        num_epochs: Number of epochs to train for; must be at least 1.

    Returns:
        (model, total_training_time, avg_samples_per_second, final_val_acc):
        the trained model, the summed training time in seconds, the overall
        training throughput, and the validation accuracy of the last epoch.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.
    """
    if num_epochs < 1:
        # Without at least one epoch there is neither a timing nor a validation
        # result to report.
        raise ValueError(f"num_epochs must be at least 1, got {num_epochs}")

    # Accept plain strings as well as torch.device objects.
    device = torch.device(device)
    use_cuda_timing = device.type == 'cuda'

    print(f"Using device: {device}")
    # Create the model and its optimizer
    model = CNNModel(num_classes=10).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Initialize timing variables
    samples_per_epoch = len(train_loader.dataset)
    total_training_time = 0
    total_samples_processed = 0
    epoch_metrics = []

    for epoch in range(num_epochs):
        # Start timing for this epoch
        if use_cuda_timing:
            # CUDA events for GPU timing
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record()
        else:
            # perf_counter is monotonic and high resolution, unlike time.time(),
            # which can jump when the system clock is adjusted.
            start_time = time.perf_counter()
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        # End timing for this epoch
        if use_cuda_timing:
            end_event.record()
            # Wait for the recorded work to finish before reading the events.
            torch.cuda.synchronize()
            epoch_time = start_event.elapsed_time(end_event) / 1000  # Convert ms to seconds
        else:
            epoch_time = time.perf_counter() - start_time
        # Update totals
        total_training_time += epoch_time
        total_samples_processed += samples_per_epoch
        # Calculate samples per second for this epoch
        samples_per_second = samples_per_epoch / epoch_time

        val_loss, val_acc = validate(model, val_loader, criterion, device)

        # Store epoch metrics
        epoch_metrics.append({
            'epoch': epoch+1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'epoch_time': epoch_time,
            'samples_per_second': samples_per_second
        })
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
        print(f'Epoch Time: {epoch_time:.2f}s, Samples/sec: {samples_per_second:.2f}')
        print('-' * 50)

    # Calculate overall metrics
    avg_samples_per_second = total_samples_processed / total_training_time
    final_val_acc = epoch_metrics[-1]['val_acc']  # Last epoch's validation accuracy

    print("\nOverall Training Summary:")
    print(f"Total Training Time: {total_training_time:.2f}s")
    print(f"Average Samples/second: {avg_samples_per_second:.2f}")
    print(f"Final Validation Accuracy: {final_val_acc:.4f}")

    return model, total_training_time, avg_samples_per_second, final_val_acc

def validate(model, val_loader, criterion, device):
    """Evaluate the model on ``val_loader`` without updating its parameters.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        val_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (epoch_loss, epoch_acc): the sample-weighted mean loss and the fraction
        of correctly classified samples. Both are NaN when the loader yields
        no samples (for example an empty validation split), instead of raising
        ZeroDivisionError.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Re-weight by the batch size so a smaller final batch does not
            # skew the average (assumes criterion returns a batch mean, as the
            # default nn.CrossEntropyLoss does).
            running_loss += loss.item() * inputs.size(0)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # No samples were seen, so the metrics are undefined.
        return float('nan'), float('nan')

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

def test(model, test_loader, criterion, device):
    """Evaluate the model on the test loader, print the metrics and return them.

    Returns:
        (test_loss, test_acc) as computed by ``validate``.
    """
    model.eval()
    loss_value, accuracy = validate(model, test_loader, criterion, device)
    summary = f'Test Loss: {loss_value:.4f}, Test Accuracy: {accuracy:.4f}'
    print(summary)
    return loss_value, accuracy

### PREPARING BENCHMARKING FOR BOTH THE CPU AND THE GPU

In [None]:
# The lab benchmarks both devices, so a CUDA runtime is mandatory. Raise
# explicitly rather than using `assert`, which is skipped when Python runs with -O.
if not torch.cuda.is_available():
    raise RuntimeError("This notebook should not be run with a runtime other than a GPU")
cuda_device = torch.device('cuda')
cpu_device = torch.device('cpu')

# Load data (using 20% of training data: 10% for training, 10% for validation).
# The same loaders and loss are shared by the CPU and GPU runs.
train_loader, val_loader, test_loader = load_and_prepare_cifar10(
    batch_size=32,
    train_fraction=0.1,  # 10% of training data for training
    val_fraction=0.1,    # 10% of training data for validation
)
criterion = nn.CrossEntropyLoss()

# Print CIFAR-10 class names for reference
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck']
print(f"Classes: {classes}")

### PART 1-3: BENCHMARKING THE CPU
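In this section we train the CNN on the CPU and record its training time, throughput (samples/s), and validation/test accuracy. These numbers serve as the baseline for the GPU comparison.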

In [None]:
# TODO: Benchmark the evaluation at batch sizes 1 and 64.
# Metrics: latency, samples/s, and evaluation accuracy.


In [None]:
# CPU run: train a fresh CNN on the CPU and keep its timing/throughput metrics.
cpu_trained_model, cpu_time, cpu_samples_per_sec, cpu_val_acc = train(
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    device=cpu_device,
)
# Score the CPU-trained model on the test loader.
cpu_test_loss, cpu_test_acc = test(
    model=cpu_trained_model,
    test_loader=test_loader,
    criterion=criterion,
    device=cpu_device,
)

### PART 1-4: BENCHMARKING THE GPU
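In this section we repeat the same training run on the GPU and record the same metrics so they can be compared directly with the CPU results.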

In [None]:
# TODO: Benchmark the evaluation at batch sizes 1 and 64.
# Metrics: latency, samples/s, and evaluation accuracy.

In [None]:
# GPU run: train a fresh CNN on the CUDA device and keep its timing/throughput metrics.
cuda_trained_model, cuda_time, cuda_samples_per_sec, cuda_val_acc = train(
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    device=cuda_device,
)
# Score the GPU-trained model on the test loader.
cuda_test_loss, cuda_test_acc = test(
    model=cuda_trained_model,
    test_loader=test_loader,
    criterion=criterion,
    device=cuda_device,
)

## Part-2: MODEL QUANTIZATION AND PRUNING
In this part we will quantize and prune a model then benchmark its performance

### PART 2-1: BUILDING HELPER FUNCTIONS


In [None]:
# TODO: Build one helper function for INT8 quantization and another for pruning.
# Open questions: Do we retrain? Do we reuse the already-trained model or construct a new one?

### PART 2-2: BENCHMARKING THE GPU WITH INT8 QUANTIZATION

In [None]:
# TODO: Repeat the accuracy and inference benchmarking.

### PART 2-3: BENCHMARKING THE GPU WITH STRUCTURAL PRUNING

In [None]:
# TODO: Repeat the accuracy and inference benchmarking.

## Part-3: DEPLOYMENT FORMAT CONVERSION
In this section, we'll convert our models to different formats suitable for various deployment scenarios, such as ONNX for cross-platform compatibility, SavedModel for TensorFlow Serving, and TensorFlow.js for web deployment.


WILL WE CONSIDER TENSORFLOW OR PYTORCH???