# 04: CNN Baseline for CIFAR-10 from Scratch

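A baseline convolutional network for CIFAR-10 image classification, implemented from scratch in PyTorch.

Contents:
1. Data Pipeline with Augmentations
2. CNN Architecture
3. Training Loop with Checkpointing
4. Evaluation on Test Set
5. Ablation: Dropout ON vs OFF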


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import time
import copy

# Seed both NumPy's and PyTorch's global random generators with the same
# fixed value.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


## 1. Data Pipeline with Augmentations

CIFAR-10 consists of 60,000 32x32 color images in 10 classes:
- airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck

We use standard augmentations:
- Random horizontal flip
- Random crop with padding
- Normalization with CIFAR-10 statistics

In [None]:
# Per-channel mean and standard deviation used to normalize CIFAR-10 images.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Training transforms: random 32x32 crop after 4-pixel padding, random
# horizontal flip, then tensor conversion and normalization.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Test transforms: no augmentation, only tensor conversion and normalization.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Load (and download on first use) the train and test splits into ./data.
train_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
test_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform
)

# Create dataloaders. Only the training loader shuffles.
# Pinned host memory only helps host-to-GPU copies, so it is requested only
# when CUDA is available; on a CPU-only machine pin_memory=True is unused and
# makes DataLoader emit a warning.
BATCH_SIZE = 128

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2,
    pin_memory=torch.cuda.is_available()
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2,
    pin_memory=torch.cuda.is_available()
)

# Class names, indexed by integer label.
CLASSES = ('airplane', 'automobile', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Training batches per epoch: {len(train_loader)}")

In [None]:
# Visualize some training samples
def imshow(img, title=None):
    """Draw a normalized image tensor on the current matplotlib axes.

    Args:
        img: Image tensor in channels-first layout (it is transposed to
            channels-last for matplotlib) that was normalized with
            CIFAR10_MEAN / CIFAR10_STD. It may live on any device and may
            be attached to an autograd graph.
        title: Optional title for the axes; skipped when falsy.
    """
    # detach() + cpu() let tensors that come straight from the GPU or from a
    # model output be plotted; a bare .numpy() raises for those.
    pixels = img.detach().cpu().numpy().transpose((1, 2, 0))
    # Undo the normalization (x * std + mean), then clamp to the displayable
    # [0, 1] range.
    pixels = np.array(CIFAR10_STD) * pixels + np.array(CIFAR10_MEAN)
    pixels = np.clip(pixels, 0, 1)
    plt.imshow(pixels)
    if title:
        plt.title(title)
    plt.axis('off')

# Pull a single mini-batch from the (augmenting) training loader.
images, labels = next(iter(train_loader))

# Lay the leading images of the batch out on a 2x8 grid, one per axes,
# each titled with its class name.
fig, axes = plt.subplots(2, 8, figsize=(14, 4))
for ax, image, label in zip(axes.flat, images, labels):
    plt.sca(ax)  # imshow() draws on the current axes
    imshow(image)
    ax.set_title(CLASSES[label], fontsize=9)
plt.suptitle('Sample Training Images (with augmentation)', fontsize=12)
plt.tight_layout()
plt.show()

## 2. CNN Architecture

A simple but effective CNN architecture:
- 4 convolutional blocks, each with Conv -> BatchNorm -> ReLU; the first three are followed by 2x2 MaxPool, the last one is not
- Increasing channels: 32 -> 64 -> 128 -> 256
- Global average pooling before the linear classifier
- Optional dropout for regularization

In [None]:
class ConvBlock(nn.Module):
    """3x3 convolution -> batch norm -> ReLU, optionally followed by 2x2 max pooling.

    The convolution uses padding=1, so it keeps the spatial size; only the
    optional pooling layer halves height and width. The convolution has no
    bias term and is directly followed by batch normalization.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        pool: Whether to append a 2x2, stride-2 max pooling layer.
    """

    def __init__(self, in_channels: int, out_channels: int, pool: bool = True):
        super().__init__()
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False)
        norm = nn.BatchNorm2d(out_channels)
        activation = nn.ReLU(inplace=True)
        downsample = [nn.MaxPool2d(2, 2)] if pool else []
        self.block = nn.Sequential(conv, norm, activation, *downsample)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the block to a batch of feature maps."""
        out = self.block(x)
        return out


class SimpleCNN(nn.Module):
    """Four-stage convolutional classifier with global average pooling.

    Args:
        num_classes: Size of the output logit vector.
        dropout: Dropout probability applied to the pooled features; any
            value that is not greater than 0 disables dropout.
    """

    def __init__(self, num_classes: int = 10, dropout: float = 0.0):
        super().__init__()
        # (in_channels, out_channels, pool); the trailing comments give the
        # output size of each stage for a 3x32x32 input.
        stage_specs = (
            (3, 32, True),      # 32x16x16
            (32, 64, True),     # 64x8x8
            (64, 128, True),    # 128x4x4
            (128, 256, False),  # 256x4x4
        )
        self.features = nn.Sequential(
            *(ConvBlock(c_in, c_out, pool=pool) for c_in, c_out, pool in stage_specs)
        )

        # Collapse each feature map to a single value per channel.
        self.pool = nn.AdaptiveAvgPool2d(1)
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()
        self.classifier = nn.Linear(256, num_classes)

        self._init_weights()

    def _init_weights(self):
        """Re-initialize conv, batch-norm and linear layers in module order."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, nn.BatchNorm2d):
                # Start batch norm as an identity scale/shift.
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0, std=0.01)
                nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits of shape (batch, num_classes)."""
        pooled = self.pool(self.features(x))
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(self.dropout(flat))


# Instantiate the network on the selected device.
model = SimpleCNN(num_classes=10, dropout=0.0).to(device)

# Tally all parameters and the subset that receives gradients.
total_params = 0
trainable_params = 0
for p in model.parameters():
    total_params += p.numel()
    if p.requires_grad:
        trainable_params += p.numel()
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Smoke-test the forward pass on two random 3x32x32 inputs.
test_input = torch.randn(2, 3, 32, 32).to(device)
test_output = model(test_input)
print(f"\nInput shape: {test_input.shape}")
print(f"Output shape: {test_output.shape}")

## 3. Training Loop with Checkpointing

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimization pass over ``train_loader``.

    Args:
        model: Network to train; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the sample-weighted average loss
        and the top-1 accuracy in percent over all samples seen.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        batch_loss = criterion(logits, batch_y)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so a smaller final batch does not
        # skew the average (assumes criterion returns a per-batch mean).
        loss_sum += batch_loss.item() * batch_x.size(0)
        n_seen += batch_y.size(0)
        batch_pred = logits.max(1)[1]
        n_correct += (batch_pred == batch_y).sum().item()

    return loss_sum / n_seen, 100. * n_correct / n_seen


def evaluate(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without updating any weights.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(test_loss, test_acc)``: the sample-weighted average loss
        and the top-1 accuracy in percent over all samples seen.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            # Re-weight by batch size (assumes criterion returns a batch mean).
            loss_sum += batch_loss.item() * batch_x.size(0)
            n_seen += batch_y.size(0)
            batch_pred = logits.max(1)[1]
            n_correct += (batch_pred == batch_y).sum().item()

    return loss_sum / n_seen, 100. * n_correct / n_seen


def train_model(model, train_loader, test_loader, epochs, lr, device, verbose=True):
    """Train ``model`` with SGD + cosine annealing and keep the best weights.

    Each epoch runs one training pass followed by one evaluation pass on
    ``test_loader``. A deep copy of the weights is kept in memory for the
    epoch with the highest test accuracy.

    NOTE: the best epoch is selected on ``test_loader`` itself, so the
    reported best accuracy is an optimistic estimate; pass a held-out
    validation loader here if an unbiased test score is needed.

    Args:
        model: Network to train (updated in place).
        train_loader: Loader of training batches.
        test_loader: Loader used for per-epoch evaluation and model selection.
        epochs: Number of epochs; also the cosine schedule length (T_max).
        lr: Initial learning rate for SGD.
        device: Device the batches are moved to.
        verbose: When true, print progress every 5 epochs and a summary.

    Returns:
        Tuple ``(history, best_model_state, best_acc)``. ``history`` maps
        'train_loss', 'train_acc', 'test_loss', 'test_acc' and 'lr' to
        per-epoch lists. ``best_model_state`` is always a state dict (never
        ``None``), so it can be passed to ``load_state_dict`` directly.
        ``best_acc`` is the highest test accuracy in percent (0.0 if no
        epoch ran).
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    history = {
        'train_loss': [], 'train_acc': [],
        'test_loss': [], 'test_acc': [],
        'lr': []
    }

    best_acc = 0.0
    best_model_state = None

    start_time = time.time()

    for epoch in range(epochs):
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)

        test_loss, test_acc = evaluate(model, test_loader, criterion, device)

        # Record the LR that was used for this epoch, before the scheduler
        # advances it for the next one.
        current_lr = optimizer.param_groups[0]['lr']
        scheduler.step()

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['test_loss'].append(test_loss)
        history['test_acc'].append(test_acc)
        history['lr'].append(current_lr)

        # Always snapshot the first epoch so a checkpoint exists even if the
        # accuracy never rises above the initial best_acc of 0.0.
        if best_model_state is None or test_acc > best_acc:
            best_acc = test_acc
            # Deep copy: state_dict() tensors alias the live parameters.
            best_model_state = copy.deepcopy(model.state_dict())

        if verbose and (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:3d}/{epochs} | "
                  f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.2f}% | "
                  f"Test Loss: {test_loss:.4f}, Acc: {test_acc:.2f}% | "
                  f"LR: {current_lr:.6f}")

    total_time = time.time() - start_time

    # With epochs == 0 the loop never runs; fall back to the current weights
    # so callers never receive None.
    if best_model_state is None:
        best_model_state = copy.deepcopy(model.state_dict())

    if verbose:
        print(f"\nTraining complete in {total_time:.1f}s")
        print(f"Best test accuracy: {best_acc:.2f}%")

    return history, best_model_state, best_acc


In [None]:
# Hyperparameters for the baseline run.
NUM_EPOCHS = 50
LEARNING_RATE = 0.1

print(f"Training SimpleCNN for {NUM_EPOCHS} epochs...")
print("=" * 70)

# Start from a freshly initialized network without dropout, then train it.
model = SimpleCNN(num_classes=10, dropout=0.0).to(device)
history, best_state, best_acc = train_model(
    model,
    train_loader,
    test_loader,
    epochs=NUM_EPOCHS,
    lr=LEARNING_RATE,
    device=device,
)

In [None]:
# Draw the recorded training history as three side-by-side panels.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# The first two panels each overlay the train and test series of one metric:
# (train history key, test history key, y-axis label, panel title).
metric_panels = [
    ('train_loss', 'test_loss', 'Loss', 'Loss Curves'),
    ('train_acc', 'test_acc', 'Accuracy (%)', 'Accuracy Curves'),
]
for ax, (train_key, test_key, y_label, panel_title) in zip(axes, metric_panels):
    ax.plot(history[train_key], label='Train')
    ax.plot(history[test_key], label='Test')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

# The third panel shows the learning rate recorded for each epoch.
lr_ax = axes[2]
lr_ax.plot(history['lr'])
lr_ax.set_xlabel('Epoch')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Learning Rate Schedule (Cosine Annealing)')
lr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Evaluation on Test Set

In [None]:
# Restore the best weights captured during training and score them again.
criterion = nn.CrossEntropyLoss()
model.load_state_dict(best_state)

test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print("Final Test Results (Best Model):")
print(f"  Loss: {test_loss:.4f}")
print(f"  Accuracy: {test_acc:.2f}%")

In [None]:
# Per-class accuracy
def get_class_accuracy(model, test_loader, device, num_classes=10):
    """Compute top-1 accuracy separately for each class.

    Counts are accumulated per batch with ``torch.bincount`` instead of a
    per-sample Python loop, which avoids one tensor-to-Python conversion per
    sample.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable yielding ``(inputs, targets)`` batches, where
            ``targets`` is a 1-D tensor of integer labels in
            ``[0, num_classes)``.
        device: Device the batches are moved to.
        num_classes: Number of classes to report; the names are taken from
            the first ``num_classes`` entries of ``CLASSES``.

    Returns:
        Dict mapping class name to accuracy in percent. A class with no
        samples in ``test_loader`` is reported as 0.0 rather than raising
        ZeroDivisionError.
    """
    model.eval()
    class_correct = torch.zeros(num_classes, dtype=torch.long, device=device)
    class_total = torch.zeros(num_classes, dtype=torch.long, device=device)

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)

            # Histogram of all labels, and of the labels that were predicted
            # correctly; minlength keeps both aligned with num_classes.
            hits = targets[predicted == targets]
            class_total += torch.bincount(targets, minlength=num_classes)
            class_correct += torch.bincount(hits, minlength=num_classes)

    correct_counts = class_correct.tolist()
    total_counts = class_total.tolist()
    return {
        CLASSES[i]: (100 * correct_counts[i] / total_counts[i]) if total_counts[i] else 0.0
        for i in range(num_classes)
    }

# Per-class accuracy of the current model, listed from best to worst class.
class_acc = get_class_accuracy(model, test_loader, device)

print("\nPer-class accuracy:")
print("-" * 30)
ranked_classes = sorted(class_acc.items(), key=lambda item: item[1], reverse=True)
for cls, acc in ranked_classes:
    print(f"  {cls:12s}: {acc:.1f}%")

In [None]:
# Predict the classes of the first test batch.
model.eval()
images, labels = next(iter(test_loader))
images = images.to(device)
labels = labels.to(device)

with torch.no_grad():
    outputs = model(images)
    preds = outputs.max(1)[1]

# Show the leading images on a 2x8 grid, titled with the predicted class;
# the title is green when the prediction matches the label, red otherwise.
fig, axes = plt.subplots(2, 8, figsize=(14, 4))
for ax, image, pred, label in zip(axes.flat, images, preds, labels):
    plt.sca(ax)  # imshow() draws on the current axes
    img = image.cpu()  # imshow() converts the tensor to NumPy
    imshow(img)
    if pred == label:
        color = 'green'
    else:
        color = 'red'
    ax.set_title(CLASSES[pred], fontsize=9, color=color)
plt.suptitle('Predictions (green=correct, red=wrong)', fontsize=12)
plt.tight_layout()
plt.show()

## 5. Ablation: Dropout ON vs OFF

Compare model performance with and without dropout regularization.

In [None]:
def run_ablation(dropout_rate, epochs=50, lr=0.1):
    """Train a fresh SimpleCNN with the given dropout rate and summarize it.

    Uses the module-level ``train_loader``, ``test_loader`` and ``device``.

    Args:
        dropout_rate: Dropout probability passed to ``SimpleCNN``.
        epochs: Number of training epochs.
        lr: Initial learning rate.

    Returns:
        Dict with keys 'dropout' (the rate), 'best_acc' (best test accuracy
        reported by ``train_model``), 'final_acc' (test accuracy after
        reloading the best weights) and 'history' (per-epoch metrics).
    """
    net = SimpleCNN(num_classes=10, dropout=dropout_rate).to(device)
    history, best_state, best_acc = train_model(
        net,
        train_loader,
        test_loader,
        epochs=epochs,
        lr=lr,
        device=device,
        verbose=False,
    )

    # Re-score the restored best checkpoint; only the accuracy is kept.
    net.load_state_dict(best_state)
    final_acc = evaluate(net, test_loader, nn.CrossEntropyLoss(), device)[1]

    summary = {}
    summary['dropout'] = dropout_rate
    summary['best_acc'] = best_acc
    summary['final_acc'] = final_acc
    summary['history'] = history
    return summary

print("Running ablation study: Dropout ON vs OFF")
print("=" * 50)

# Baseline run: dropout disabled.
print("\nTraining with dropout=0.0...")
results_no_dropout = run_ablation(0.0, epochs=50)
no_dropout_best = results_no_dropout['best_acc']
print(f"  Best accuracy: {no_dropout_best:.2f}%")

# Comparison run: dropout of 0.5 on the pooled features.
print("\nTraining with dropout=0.5...")
results_dropout = run_ablation(0.5, epochs=50)
dropout_best = results_dropout['best_acc']
print(f"  Best accuracy: {dropout_best:.2f}%")

In [None]:
# Ablation results table: best and final test accuracy for each configuration.
print("\n" + "="*60)
print("ABLATION STUDY: Dropout Comparison")
print("="*60)
print(f"{'Configuration':<20} {'Best Test Acc':<20} {'Final Test Acc':<20}")
print("-"*60)
print(f"{'Dropout=0.0 (OFF)':<20} {results_no_dropout['best_acc']:>15.2f}% {results_no_dropout['final_acc']:>15.2f}%")
print(f"{'Dropout=0.5 (ON)':<20} {results_dropout['best_acc']:>15.2f}% {results_dropout['final_acc']:>15.2f}%")
print("="*60)

# Positive diff means the dropout run reached the higher best accuracy.
diff = results_dropout['best_acc'] - results_no_dropout['best_acc']
if diff > 0:
    winner, verdict = "Dropout ON", "better"
elif diff < 0:
    winner, verdict = "Dropout OFF", "worse"
else:
    # An exact tie is neither "worse with dropout" nor a win for either run.
    winner, verdict = "Tie", "no different"
print(f"\nDifference: {abs(diff):.2f}% ({verdict} with dropout)")
print(f"Winner: {winner}")

In [None]:
# Compare the two ablation runs: test accuracy and train-test gap per epoch.
history_off = results_no_dropout['history']
history_on = results_dropout['history']

# Generalization gap per epoch: train accuracy minus test accuracy.
gap_no_dropout = [
    train - test
    for train, test in zip(history_off['train_acc'], history_off['test_acc'])
]
gap_dropout = [
    train - test
    for train, test in zip(history_on['train_acc'], history_on['test_acc'])
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (axes, series without dropout, series with dropout, y-axis label, title)
comparison_panels = (
    (axes[0], history_off['test_acc'], history_on['test_acc'],
     'Test Accuracy (%)', 'Test Accuracy: Dropout Comparison'),
    (axes[1], gap_no_dropout, gap_dropout,
     'Train - Test Accuracy (%)', 'Generalization Gap (lower is better)'),
)
for ax, series_off, series_on, y_label, panel_title in comparison_panels:
    ax.plot(series_off, label='Dropout=0.0', alpha=0.8)
    ax.plot(series_on, label='Dropout=0.5', alpha=0.8)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Gap at the last recorded epoch of each run.
print(f"\nFinal generalization gap:")
print(f"  Dropout=0.0: {gap_no_dropout[-1]:.2f}%")
print(f"  Dropout=0.5: {gap_dropout[-1]:.2f}%")