In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import random_split, Subset, DataLoader
from datetime import datetime
import numpy as np



# Getting the same results with train and train_manual_update
- Write torch.manual_seed(42) at the beginning of your notebook.
- Write torch.set_default_dtype(torch.double) at the beginning of your notebook to alleviate floating-point precision errors.

In [2]:
# Reproducibility: fixed RNG seed, and float64 as the default dtype so that
# train and train_manual_update are not thrown apart by precision errors
torch.manual_seed(42)
torch.set_default_dtype(torch.double)

# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
class TransformedSubset(torch.utils.data.Dataset):
    """
    Dataset wrapper that optionally remaps the target of every sample.

    Items are read from `subset` as (x, y) pairs; x is passed through
    untouched and y is replaced by target_transform(y) when a transform
    was supplied.
    """

    def __init__(self, subset, target_transform=None):
        self.subset, self.target_transform = subset, target_transform

    def __len__(self):
        # Same number of samples as the wrapped dataset
        return len(self.subset)

    def __getitem__(self, index):
        sample, label = self.subset[index]
        remap = self.target_transform
        return sample, (label if remap is None else remap(label))

# Tasks
Load, analyse and preprocess the CIFAR-10 dataset. Split it into 3
datasets: training, validation and test. Take a subset of each of these
datasets by keeping only 2 labels: cat and car.

In [4]:
def _car_cat_subset(dataset, car_label=1, cat_label=3):
    """
    Keep only the car and cat samples of a CIFAR-10 dataset and relabel them.

    car_label and cat_label are the original class ids to keep (CIFAR-10
    uses 1 for automobile and 3 for cat). The returned dataset yields
    target 0 for car and 1 for cat.
    """
    kept_indices = [i for i, label in enumerate(dataset.targets) if label in (car_label, cat_label)]
    return TransformedSubset(
        Subset(dataset, kept_indices),
        target_transform=lambda y: 0 if y == car_label else 1,
    )


def load_cifar(train_val_split=0.9, data_path='./data', preprocessor=None):
    """
    Build car-vs-cat training, validation and test sets from CIFAR-10.

    Args:
        train_val_split: Fraction (between 0 and 1) of the filtered training
            data assigned to the training set; the remainder is validation.
        data_path: Directory where CIFAR-10 is stored / downloaded to.
        preprocessor: Transform applied to every image. Defaults to resizing
            to 16x16, converting to a tensor and normalising every channel
            with mean 0.5 and std 0.5.

    Returns:
        (train_set, val_set, test_set). Labels are 0 for car and 1 for cat.

    Raises:
        ValueError: If train_val_split is outside [0, 1]. Such values could
            otherwise produce a negative split length.
    """
    if not 0 <= train_val_split <= 1:
        raise ValueError(f"train_val_split must be in [0, 1], got {train_val_split!r}")

    if preprocessor is None:
        preprocessor = transforms.Compose([
            transforms.Resize(16), # Resize images to 16x16
            transforms.ToTensor(), # Convert to tensor in [0, 1]
            transforms.Normalize(  # Normalize to [-1, 1]
                mean=[0.5, 0.5, 0.5],
                std=[0.5, 0.5, 0.5]
            )
        ])

    # Training data, restricted to the two classes of interest
    train_dataset = datasets.CIFAR10(data_path, train=True, download=True, transform=preprocessor)
    filtered_train = _car_cat_subset(train_dataset)

    # Split into training and validation. No generator is passed, so the
    # split is drawn from torch's global RNG (seed it first for a repeatable split).
    train_size = int(len(filtered_train) * train_val_split)
    val_size = len(filtered_train) - train_size
    train_set, val_set = random_split(filtered_train, [train_size, val_size])

    # Test data, filtered and relabelled exactly like the training data
    test_dataset = datasets.CIFAR10(data_path, train=False, download=True, transform=preprocessor)
    test_set = _car_cat_subset(test_dataset)

    return train_set, val_set, test_set

# Compute accuracy
def compute_accuracy(model, loader):
    """
    Return the fraction of samples in `loader` that `model` classifies correctly.

    The predicted class is the index of the largest output along dimension 1.
    Batches are moved to the device of the model's first parameter (and left
    where they are if the model has no parameters), so the function does not
    depend on a notebook-level `device` variable. The model is switched to
    eval mode and stays in eval mode afterwards.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        loader: Iterable of (inputs, labels) batches.

    Returns:
        correct / total as a float, or 0.0 if the loader yields no samples.
    """
    # Follow the model's own device rather than a global that may not match it
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            if model_device is not None:
                inputs, labels = inputs.to(model_device), labels.to(model_device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError
    if total == 0:
        return 0.0
    return correct / total

Write a MyMLP class that implements a MLP in PyTorch (so only fully
connected layers) such that:
    
    - The input dimension is 768 (= 16 × 16 × 3) and the output dimension is 2 (for the 2 classes).
    - The hidden layers have 128 and 32 hidden units, respectively.
    - All activation functions are ReLU. The last layer has no activation function, since the cross-entropy loss already includes a softmax activation function.

In [5]:
class MyNet(nn.Module):
    """
    Fully connected classifier: 768 -> 128 -> 32 -> 2.

    Inputs are flattened per sample. Both hidden layers are followed by a
    ReLU; the output layer returns raw scores with no activation.
    """
    def __init__(self):
        super().__init__()
        in_features = 16 * 16 * 3
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
        # Collapse everything except the batch dimension
        flat = x.view(x.shape[0], -1)
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return self.fc3(hidden)
    

Write a train(n_epochs, optimizer, model, loss_fn, train_loader) function that trains model for n_epochs epochs given an optimizer optimizer, a loss function loss_fn and a dataloader train_loader.

In [6]:
def train(n_epochs: int, optimizer: optim.Optimizer, model: nn.Module, loss_fn, train_loader: DataLoader):
    """
    Run n_epochs passes over train_loader, stepping `optimizer` once per batch.

    The model is first moved to the notebook-level `device` and put in
    training mode. No validation is performed.

    Returns:
        List with the loss value of every batch, in the order processed.
    """
    print(f"Trener {model.__class__.__name__} med optimizer")
    model.to(device)  # Model and batches must live on the same device
    model.train()
    batch_losses = []

    for _ in range(n_epochs):
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Standard step: reset gradients, forward, backward, update
            optimizer.zero_grad()
            batch_loss = loss_fn(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()
            batch_losses.append(batch_loss.item())

    return batch_losses

Write a similar function train_manual_update that has no optimizer parameter, but a learning rate lr parameter instead, and that manually updates each trainable parameter of model using equation (2). Do not forget to zero out all gradients after each iteration.

Train 2 instances of MyMLP, one using train and the other using train_manual_update (use the same parameter values for both models). Compare their respective training losses. To get exactly the same results with both functions, see section 3.3

In [None]:
def train_manual_update(n_epochs, model, loss_fn, train_loader, lr=1e-2, momentum_coeff=0., weight_decay=0.):
    """
    Train `model` with hand-written SGD steps instead of a torch optimizer.

    For every mini-batch, each parameter p that received a gradient g is
    updated as
        g <- g + weight_decay * p      (only when weight_decay != 0)
        v <- momentum_coeff * v + g    (v starts at zero)
        p <- p - lr * v
    Batches are moved to the device of the model's first parameter.

    Returns:
        List with the loss value of every batch, in the order processed.
    """
    model.train()
    batch_losses = []
    # One zero-initialised momentum buffer per trainable parameter
    momentum_buffers = {
        pname: torch.zeros_like(p.data)
        for pname, p in model.named_parameters()
        if p.requires_grad
    }
    # Batches are sent to wherever the model's parameters live
    device = next(model.parameters()).device

    for _ in range(n_epochs):
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Clear gradients left over from the previous iteration
            model.zero_grad()
            batch_loss = loss_fn(model(inputs), labels)
            batch_loss.backward()
            batch_losses.append(batch_loss.item())

            # Parameter update, kept out of the autograd graph
            with torch.no_grad():
                for pname, p in model.named_parameters():
                    if p.grad is None:
                        continue
                    step_grad = p.grad.data

                    # L2 regularisation enters through the gradient
                    if weight_decay != 0:
                        step_grad = step_grad + weight_decay * p.data

                    # Fold the gradient into the running velocity and store it
                    updated_buffer = momentum_coeff * momentum_buffers[pname] + step_grad
                    momentum_buffers[pname] = updated_buffer

                    p.data -= lr * updated_buffer
    return batch_losses

Compare the different results.

In [8]:
if __name__ == "__main__":
    # Data: batches are served in a fixed order (shuffle=False) so that both
    # training runs see exactly the same sequence
    train_set, val_set, test_set = load_cifar()
    train_loader = DataLoader(train_set, batch_size=64, shuffle=False, num_workers=0)

    # Settings shared by the two runs, written once so they cannot drift apart
    loss_fn = nn.CrossEntropyLoss()
    n_epochs = 10
    sgd_settings = {"lr": 0.01, "momentum": 0.9, "weight_decay": 0.0001}

    # Two models with identical initial weights (same seed before each construction)
    torch.manual_seed(42)
    model1 = MyNet().to(device)
    torch.manual_seed(42)
    model2 = MyNet().to(device)

    # Reference optimizer, used for model1 only
    optimizer = optim.SGD(model1.parameters(), **sgd_settings)

    # Sanity check: shapes of the first batch
    for data, target in train_loader:
        print("Batch data shape:", data.shape)
        print("Batch target shape:", target.shape)
        break

    # model1 is trained by torch's SGD, model2 by the manual update rule
    losses1 = train(n_epochs, optimizer, model1, loss_fn, train_loader)
    losses2 = train_manual_update(
        n_epochs,
        model2,
        loss_fn,
        train_loader,
        lr=sgd_settings["lr"],
        momentum_coeff=sgd_settings["momentum"],
        weight_decay=sgd_settings["weight_decay"],
    )

    # Show the first few per-batch losses of each run side by side
    print("Training Losses (SGD):", losses1[:5])
    print("Training Losses (Manual):", losses2[:5])

Files already downloaded and verified
Files already downloaded and verified
Batch data shape: torch.Size([64, 3, 16, 16])
Batch target shape: torch.Size([64])
Trener MyNet med optimizer
Training Losses (SGD): [0.6893719391283892, 0.6967328785339857, 0.6921914262284528, 0.6950202646453515, 0.6944208211988707]
Training Losses (Manual): [0.6893719391283892, 0.6967328785339857, 0.6921914262284528, 0.6950202646453515, 0.6944208211988707]


# Comparing
We got the same result for both, which is what we wanted:
Training Losses (SGD) = Training Losses (Manual)

# Hyperparameters
Experiment with a wider range of hyperparameters (e.g., learning rates, momentum values, weight decay) to find the best-performing model.

In [9]:
# Candidate values for each hyperparameter
learning_rates = [0.01, 0.001, 0.0001]
momentum_values = [0.9, 0.95, 0.99]
weight_decay_values = [0.0001, 0.001, 0.01]

# Best configuration seen so far, selected on validation accuracy
best_accuracy = 0
best_hyperparams = None
best_model = None

# Loaders: validation in fixed order, training reshuffled on every pass
val_loader = DataLoader(val_set, batch_size=64, shuffle=False, num_workers=0)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=0)

# The loss is the same for every configuration
loss_fn = nn.CrossEntropyLoss()

# Every (lr, momentum, weight_decay) combination, learning rate varying slowest
search_grid = [
    (lr_value, momentum_value, decay_value)
    for lr_value in learning_rates
    for momentum_value in momentum_values
    for decay_value in weight_decay_values
]

for lr, momentum, weight_decay in search_grid:
    print(f"Training with lr={lr}, momentum={momentum}, weight_decay={weight_decay}")

    # Fresh model, re-seeded so every configuration starts from the same weights
    torch.manual_seed(42)
    model = MyNet().to(device)

    train_manual_update(
        10,
        model,
        loss_fn,
        train_loader,
        lr=lr,
        momentum_coeff=momentum,
        weight_decay=weight_decay,
    )

    # Accuracy on the data the model was fitted to ...
    train_accuracy = compute_accuracy(model, train_loader)
    print(f"Training accuracy: {train_accuracy:.4f}")

    # ... and on held-out validation data, which drives model selection
    val_accuracy = compute_accuracy(model, val_loader)
    print(f"Validation accuracy: {val_accuracy:.4f}")

    if val_accuracy > best_accuracy:
        best_accuracy, best_hyperparams, best_model = val_accuracy, (lr, momentum, weight_decay), model

# Report the winning configuration
print("\nBest hyperparameters:")
best_lr, best_momentum, best_weight_decay = best_hyperparams
print(f"Best Learning rate: {best_lr}")
print(f"Best Momentum: {best_momentum}")
print(f"Best Weight decay: {best_weight_decay}")
print(f"Best validation accuracy: {best_accuracy:.4f}")

# Training accuracy of the winner, to compare against validation (overfitting check)
train_accuracy = compute_accuracy(best_model, train_loader)
print(f"Training accuracy of the best model: {train_accuracy:.4f}")

# The test set is only touched once, for the selected model
test_loader = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=0)
test_accuracy = compute_accuracy(best_model, test_loader)
print(f"Test accuracy of the best model: {test_accuracy:.4f}")

Training with lr=0.01, momentum=0.9, weight_decay=0.0001
Training accuracy: 0.9656
Validation accuracy: 0.9020
Training with lr=0.01, momentum=0.9, weight_decay=0.001
Training accuracy: 0.9628
Validation accuracy: 0.9100
Training with lr=0.01, momentum=0.9, weight_decay=0.01
Training accuracy: 0.9441
Validation accuracy: 0.9030
Training with lr=0.01, momentum=0.95, weight_decay=0.0001
Training accuracy: 0.9634
Validation accuracy: 0.9090
Training with lr=0.01, momentum=0.95, weight_decay=0.001
Training accuracy: 0.9599
Validation accuracy: 0.9060
Training with lr=0.01, momentum=0.95, weight_decay=0.01
Training accuracy: 0.9252
Validation accuracy: 0.8950
Training with lr=0.01, momentum=0.99, weight_decay=0.0001
Training accuracy: 0.9374
Validation accuracy: 0.9040
Training with lr=0.01, momentum=0.99, weight_decay=0.001
Training accuracy: 0.9377
Validation accuracy: 0.9010
Training with lr=0.01, momentum=0.99, weight_decay=0.01
Training accuracy: 0.8620
Validation accuracy: 0.8780
Trai