In [4]:
### TASK 1
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Three conv/pool stages followed by a three-layer classifier head.

    Shape trace for a (N, 1, 28, 28) input batch:

        conv1 (3x3, pad 1) -> (N, 32, 28, 28)
        pool1 (2x2, s 2)   -> (N, 32, 14, 14)
        conv2 (3x3, pad 2) -> (N, 64, 16, 16)   # pad 2 GROWS the map: 14 + 2*2 - 3 + 1 = 16
        pool2 (2x2, s 2)   -> (N, 64, 8, 8)
        conv3 (3x3, pad 1) -> (N, 64, 8, 8)
        pool3 (2x2, s 2)   -> (N, 64, 4, 4)
        flatten            -> (N, 1024)
        fc1 / fc2 / fc3    -> (N, 256) / (N, 128) / (N, 10)

    ``forward`` returns raw logits; no softmax is applied inside the model.
    """

    def __init__(self):
        super().__init__()
        # NOTE: layers are created in the same order and under the same attribute
        # names as before, so state_dict keys and seeded initialisation are unchanged.

        # Stage 1: 1 -> 32 channels. A 3x3 kernel with stride 1 and padding 1
        # preserves the spatial size: (32, 28, 28) for a 28x28 input.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)

        # 2x2 max pooling with stride 2 halves each spatial dimension: (32, 14, 14).
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: 32 -> 64 channels. With a 3x3 kernel, padding 2 does NOT preserve
        # the size; it enlarges it by 2 pixels per dimension: 14 -> 16, i.e. (64, 16, 16).
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=2)

        # Halves 16 -> 8: (64, 8, 8).
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 3: 64 -> 64 channels, size-preserving (3x3, padding 1): (64, 8, 8).
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)

        # Halves 8 -> 4: (64, 4, 4).
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Collapse (64, 4, 4) into a vector of 64 * 4 * 4 = 1024 features per sample.
        self.flatten = nn.Flatten()

        # Classifier head: 1024 -> 256 -> 128 -> 10 (one logit per class).
        self.fc1 = nn.Linear(64 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: input batch of shape (N, 1, 28, 28); other spatial sizes would not
               match the 1024 input features expected by ``fc1``.

        Returns:
            Tensor of shape (N, 10) holding raw (un-normalised) logits.
        """
        x = self.pool1(F.relu(self.conv1(x)))  # -> (N, 32, 14, 14)
        x = self.pool2(F.relu(self.conv2(x)))  # -> (N, 64, 8, 8)
        x = self.pool3(F.relu(self.conv3(x)))  # -> (N, 64, 4, 4)
        x = self.flatten(x)                    # -> (N, 1024)
        x = F.relu(self.fc1(x))                # -> (N, 256)
        x = F.relu(self.fc2(x))                # -> (N, 128)
        # No softmax here: CrossEntropyLoss expects raw logits.
        return self.fc3(x)                     # -> (N, 10)

In [5]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

class FashionMNISTLoader:
    """Build train / validation / test DataLoaders for FashionMNIST.

    Args:
        batch_size: batch size used for all three loaders.
        val_split: fraction of the training set held out for validation; must
            satisfy ``0 <= val_split < 1``. ``0`` means no validation loader.
        data_dir: directory the dataset is downloaded to / read from.
        seed: optional integer seed for the train/validation split. When
            ``None`` (the default) the split uses the global RNG, exactly as
            before, and therefore differs between runs.

    Raises:
        ValueError: if ``val_split`` is outside ``[0, 1)``.
    """

    def __init__(self, batch_size=64, val_split=0.0, data_dir='./data', seed=None):
        # Validate first: a fraction >= 1 would leave no training samples and a
        # negative one would silently disable the split.
        if not 0.0 <= val_split < 1.0:
            raise ValueError(f"val_split must be in [0, 1), got {val_split}")

        self.batch_size = batch_size
        self.val_split = val_split
        self.data_dir = data_dir
        self.seed = seed

        # Convert images to tensors, then normalise the single channel with
        # mean 0.5 and std 0.5.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ])

    def load_datasets(self):
        """Download (if needed) the datasets and (re)create the loaders.

        Sets ``self.train_loader``, ``self.val_loader`` (``None`` when
        ``val_split`` is 0) and ``self.test_loader``.
        """
        train_dataset = datasets.FashionMNIST(root=self.data_dir, train=True, download=True, transform=self.transform)
        test_dataset = datasets.FashionMNIST(root=self.data_dir, train=False, download=True, transform=self.transform)

        if self.val_split > 0:
            val_size = int(self.val_split * len(train_dataset))
            train_size = len(train_dataset) - val_size

            split_kwargs = {}
            if self.seed is not None:
                # Local import: this cell does not import torch at the top level.
                import torch
                split_kwargs['generator'] = torch.Generator().manual_seed(self.seed)

            train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size], **split_kwargs)
            self.val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
        else:
            self.val_loader = None

        # Only the training data is shuffled; evaluation order stays fixed.
        self.train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        self.test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)

    def get_loaders(self):
        """Load the datasets and return ``(train_loader, val_loader, test_loader)``.

        Every call rebuilds the loaders (and, without a seed, draws a new split).
        """
        self.load_datasets()
        return self.train_loader, self.val_loader, self.test_loader


In [None]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score  # Importing roc_auc_score for AUC calculation
import torch.nn.functional as F # Importing functional module for softmax

class Trainer:
    """Train a classifier and evaluate it (accuracy or one-vs-rest AUC).

    Args:
        model: the ``nn.Module`` to train; it is moved to ``device``.
        train_loader: loader yielding ``(inputs, labels)`` training batches.
        test_loader: default loader used by the evaluation methods.
        val_loader: optional validation loader; stored but not used internally.
        optimizer: optimizer over the model parameters. Defaults to SGD with
            a learning rate of 0.01.
        loss_fn: loss callable taking ``(outputs, labels)``. Defaults to
            ``nn.CrossEntropyLoss()``.
        device: device that the model and every batch are moved to.
    """

    def __init__(self, model, train_loader, test_loader, val_loader=None, optimizer=None, loss_fn=None, device='cpu'):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.val_loader = val_loader
        self.device = device

        # Use explicit None checks: "x or default" depends on the truthiness of
        # the supplied object, which is not a reliable "was it provided" test.
        if optimizer is None:
            optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)
        self.optimizer = optimizer

        if loss_fn is None:
            loss_fn = nn.CrossEntropyLoss()
        self.loss_fn = loss_fn

    def train(self, epochs=10):
        """ Train the model on the training set.

        Prints the average per-batch loss after every epoch.

        Args:
            epochs (int): Number of epochs to train the model.
        Raises:
            ValueError: if the training loader yields no batches.
        """
        self.model.train()  # Set the model to training mode

        for epoch in range(epochs):
            running_loss = 0.0  # Sum of batch losses for this epoch
            num_batches = 0     # Counted explicitly so an empty loader is detected

            for inputs, labels in self.train_loader:
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)

                self.optimizer.zero_grad()            # Clear gradients from the previous step
                outputs = self.model(inputs)          # Forward pass
                loss = self.loss_fn(outputs, labels)  # Compute the loss
                loss.backward()                       # Backward pass
                self.optimizer.step()                 # Update the parameters

                running_loss += loss.item()
                num_batches += 1

            if num_batches == 0:
                raise ValueError("train_loader yielded no batches; cannot train on an empty dataset")

            avg_loss = running_loss / num_batches  # Average loss for the epoch
            print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss}')

    def evaluate(self, loader=None):
        """ Evaluate the model on the test set or validation set.
        Args:
            loader (DataLoader): DataLoader for the test or validation set. If None, uses the test_loader.
        Returns:
            float: Accuracy of the model on the given loader.
        """
        self.model.eval()  # Set the model to evaluation mode

        # "is None" rather than "or": an empty DataLoader is falsy and would be
        # silently replaced by the test loader.
        if loader is None:
            loader = self.test_loader

        y_true, y_pred = [], []  # True and predicted labels across all batches

        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(self.device), labels.to(self.device)

                outputs = self.model(images)   # Forward pass
                preds = outputs.argmax(dim=1)  # Predicted class = index of the largest logit

                y_true.extend(labels.cpu().numpy())
                y_pred.extend(preds.cpu().numpy())

        accuracy = accuracy_score(y_true, y_pred)  # Compute accuracy

        return accuracy

    def evaluate_auc(self, loader=None, pos_class=None):
        """ Compute the one-vs-rest ROC AUC for a single class.

        Args:
            loader (DataLoader): Loader to evaluate on. If None, uses the test_loader.
            pos_class (int): Label treated as the positive class; every other
                label is negative. If None, class 1 is used. Class 0 is a valid
                choice.
        Returns:
            float: Value returned by ``roc_auc_score`` for the binary labels and
            the softmax probability of ``pos_class``.
        """
        self.model.eval()  # Set the model to evaluation mode

        if loader is None:
            loader = self.test_loader
        # Must be an "is None" test: "pos_class or 1" would turn class 0 into class 1.
        if pos_class is None:
            pos_class = 1

        y_true, y_scores = [], []

        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(self.device), labels.to(self.device)
                probs = F.softmax(self.model(images), dim=1)  # Class probabilities per sample
                y_true.extend((labels == pos_class).cpu().numpy())   # Binary target: positive class vs rest
                y_scores.extend(probs[:, pos_class].cpu().numpy())   # Score = probability of the positive class

        auc = roc_auc_score(y_true, y_scores)  # Compute AUC score

        return auc


'\n$ python Training-Testing.py \nCNN(\n  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))\n  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n  (flatten): Flatten(start_dim=1, end_dim=-1)\n  (fc1): Linear(in_features=1024, out_features=256, bias=True)\n  (fc2): Linear(in_features=256, out_features=128, bias=True)\n  (fc3): Linear(in_features=128, out_features=10, bias=True)\n)\n\nTraining with Validation Split: 0%\n100.0%\n100.0%\n100.0%\n100.0%\nEpoch 1/10, Loss: 1.843058825873617\nEpoch 2/10, Loss: 0.7006924451350658\nEpoch 3/10, Loss: 0.5781377986359444\nEpoch 4/10, Loss: 0.5174538178611666\nEpoch 

CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)

# Task 1: CNN Architecture Implementation

The CNN model was implemented in PyTorch as specified:

- **Input:** 28x28 grayscale image
- **Conv1:** 32 filters, 3×3, stride 1, padding 1
- **MaxPool:** 2×2, stride 2
- **Conv2:** 64 filters, 3×3, stride 1, padding 2
- **MaxPool:** 2×2, stride 2
- **Conv3:** 64 filters, 3×3, stride 1, padding 1
- **MaxPool:** 2×2, stride 2
- **Fully Connected Layers:** 256 → 128 → 10
- **Output:** Logits (softmax will be applied during evaluation)

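The printed model summary confirms the layer configuration. It does not show output shapes; these follow from the layer settings: because Conv2 uses padding 2 with a 3×3 kernel, the feature maps go 28 → 14 → 16 → 8 → 8 → 4, so the flattened size is 64 × 4 × 4 = 1024, matching `fc1`'s `in_features`.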


In [None]:
from CNN import CNN
from FashionMNISTLoader import FashionMNISTLoader
from TrainValTestSplit import Trainer
import torch


###################################################################################################

# Task 1: instantiate an untrained network and log its layer structure.
model = CNN()
architecture_report = f"Model Architecture: {model}"
print(architecture_report)


# Task 2: Training and Testing with Varying Validation Splits

We trained the CNN model on FashionMNIST using the following validation split percentages: 0%, 10%, 20%, 30%, 40%.

All experiments used:
- 10 training epochs
- Batch size: 64
- Optimizer: SGD
- Learning rate: 0.01 (the `Trainer` default, since no optimizer is passed in this task)
- Loss function: CrossEntropyLoss

**Findings:**
- Without a validation set (0%), performance was decent but lacked early stopping or tuning.
- With increasing validation size, test accuracy first improved, then dropped.
- Larger validation splits reduce training data, potentially underfitting the model.

| Validation Split | Test Accuracy |
|------------------|---------------|
| 0%               | 0.8562        |
| 10%              | 0.8591        |
| 20%              | 0.8589        |
| 30%              | 0.8315        |
| 40%              | 0.8175        |


In [None]:
# Task 2: train one fresh model per validation-split fraction and compare test accuracy.
print("=== Running Task 2 Experiments ===")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fractions of the training set held out for validation.
splits = [0.0, 0.1, 0.2, 0.3, 0.4]
results = []

for split in splits:
    print(f"\nTraining with Validation Split: {int(split*100)}%")

    # Build the loaders for this split.
    loader = FashionMNISTLoader(val_split=split, batch_size=64)
    train_loader, val_loader, test_loader = loader.get_loaders()
    print(f"DataLoaders initialized with batch size 64 and val_split {split}")

    # New model each iteration so runs do not share weights; the Trainer
    # falls back to its default optimizer and loss.
    model = CNN()
    trainer = Trainer(model, train_loader, test_loader, val_loader, device=device)

    trainer.train(epochs=10)
    print(f"Training completed for {int(split*100)}% validation split.")

    # evaluate() without arguments scores the test loader.
    acc = trainer.evaluate()
    print(f"Test Accuracy with {int(split*100)}% val split: {acc:.4f}")

    results.append((split, acc))


print("\n=== Summary of Results ===")
for split, acc in results:
    print(f"Val Split: {int(split*100)}% | Test Accuracy: {acc:.4f}")

# Task 3 : Learning Rate Experiments

We tested the CNN performance using the best train/validation split ratio (10%) across different learning rates:  
`[0.001, 0.01, 0.1, 1, 10]`.

**Findings:**
- Very small learning rates (0.001) led to slow convergence, underfitting the data.
- Moderate rates (0.01 and 0.1) performed best.
- Large rates (1 and 10) caused instability or divergence.

| Learning Rate | Test Accuracy |
|---------------|---------------|
| 0.001         | 0.5643        |
| 0.01          | 0.8504        |
| 0.1           | 0.9033        |
| 1             | 0.1000        |
| 10            | 0.1000        |


In [None]:
# Task 3: sweep the SGD learning rate at a fixed validation split.
print("\n=== Running Task 3 Experiments ===")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Set device to GPU if available, else CPU

best_val_split = 0.1 # Assuming the best validation split is 0.1 based on previous results
learning_rates = [0.001, 0.01, 0.1, 1, 10] # Learning rates to test
lr_results = []

for lr in learning_rates:
    print(f"\nTraining with Learning Rate: {lr}")

    loader = FashionMNISTLoader(batch_size=64, val_split=best_val_split) # Initialize the data loader with the best split

    train_loader, val_loader, test_loader = loader.get_loaders() # Get the data loaders
    print(f"DataLoaders initialized with batch size 64 and val_split {best_val_split}")

    model = CNN() # Fresh model for every learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr) # Initialize the optimizer with the current learning rate
    trainer = Trainer(model, train_loader, test_loader, val_loader, optimizer, device=device) # Initialize the trainer

    trainer.train(epochs=10) # Train the model
    print(f"Training completed for learning rate {lr}.")

    # NOTE(review): evaluate() with no argument scores the test loader, so the
    # learning rates are compared on test data; val_loader is currently unused.
    acc = trainer.evaluate()

    print(f"Test Accuracy with learning rate {lr}: {acc:.4f}")
    lr_results.append((lr, acc))

print("\n=== Summary of Learning Rate Results ===")
for lr, acc in lr_results:
    print(f"Learning Rate: {lr} | Test Accuracy: {acc:.4f}")

# Task 4: Optimizer Comparison – Adam vs SGD

Using the best learning rate (`0.1`) and validation split (`10%`) from previous experiments, we replaced the SGD optimizer with the Adam optimizer and retrained the model using the same training setup.

**Findings:**
- Adam generally shows faster convergence and slightly better generalization compared to SGD across a range of learning rates.
- However, for this dataset and CNN architecture, Adam performed significantly worse than SGD at this learning rate.
- Adam adapts the step size per parameter and is normally used with a much smaller learning rate (the PyTorch default is 0.001). Starting at 0.1 was too aggressive: the loss spiked in the first epoch and then settled near 2.31 ≈ ln(10), which corresponds to chance-level predictions over 10 classes (10% accuracy).

| Optimizer | Learning Rate | Test Accuracy |
|----------|----------------|---------------|
| SGD      | 0.1            | 0.9033        |
| Adam     | 0.1            | 0.1000        |


In [7]:
print("\n=== Running Task 4 Experiments ===")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Settings carried over from the earlier experiments.
best_val_split = 0.1
best_learning_rate = 0.1

print(f"\nTraining with Learning Rate: {best_learning_rate}")

# Build the loaders with the chosen validation split.
loader = FashionMNISTLoader(val_split=best_val_split, batch_size=64)
train_loader, val_loader, test_loader = loader.get_loaders()
print(f"DataLoaders initialized with batch size 64 and val_split {best_val_split}")

# Same architecture as before; only the optimizer changes (Adam instead of SGD).
model = CNN()
optimizer = torch.optim.Adam(model.parameters(), lr=best_learning_rate)
trainer = Trainer(
    model,
    train_loader,
    test_loader,
    val_loader,
    optimizer,
    device=device,
)

trainer.train(epochs=10)

# evaluate() without arguments scores the test loader.
adam_accuracy = trainer.evaluate()
print(f"Test Accuracy with Adam optimizer: {adam_accuracy:.4f}")

###################################################################################################



=== Running Task 4 Experiments ===

Training with Learning Rate: 0.1
DataLoaders initialized with batch size 64 and val_split 0.1
Epoch 1/10, Loss: 12.177495105995385
Epoch 2/10, Loss: 1.770245125932151
Epoch 3/10, Loss: 2.1743160456560235
Epoch 4/10, Loss: 2.312556399553308
Epoch 5/10, Loss: 2.3117938807225342
Epoch 6/10, Loss: 2.3114005962819286
Epoch 7/10, Loss: 2.312406127486749
Epoch 8/10, Loss: 2.3118031894991184
Epoch 9/10, Loss: 2.311489098727421
Epoch 10/10, Loss: 2.3124969590331705
Test Accuracy with Adam optimizer: 0.1000


# Task 5: One-Class AUC Evaluation

We evaluated the best-performing CNN model in a one-vs-all setting using label `2` (Pullover) as the positive class, and all other labels as the negative class. We calculated the Area Under the ROC Curve (AUC) as our evaluation metric.

This binary setup helps assess how well the model separates one specific class from others, beyond multi-class accuracy.

**Result:**
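- AUC Score for Label 2: `0.9901` (the value printed by the run below; it varies slightly between runs because no random seed is fixed)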

An AUC near 1.0 indicates excellent separation; values closer to 0.5 suggest poor class distinction.


In [8]:
# Task 5: one-vs-all AUC with class 2 as the positive class.
print("\n=== Running Task 5: One-vs-All AUC Evaluation ===")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Settings carried over from the earlier experiments.
best_val_split = 0.1
best_learning_rate = 0.1

# Data
loader = FashionMNISTLoader(val_split=best_val_split, batch_size=64)
train_loader, val_loader, test_loader = loader.get_loaders()
print(f"DataLoaders initialized with batch size 64 and val_split {best_val_split} and learning rate {best_learning_rate}")

# Model, optimizer and trainer
model = CNN()
optimizer = torch.optim.SGD(model.parameters(), lr=best_learning_rate)
trainer = Trainer(
    model,
    train_loader,
    test_loader,
    val_loader,
    optimizer=optimizer,
    device=device,
)

# Fit for ten epochs.
trainer.train(epochs=10)

# Score on the test loader: label 2 is positive, every other label negative.
auc_score = trainer.evaluate_auc(pos_class=2)
print(f"AUC Score (class 2 vs all): {auc_score:.4f}")


=== Running Task 5: One-vs-All AUC Evaluation ===
DataLoaders initialized with batch size 64 and val_split 0.1 and learning rate 0.1
Epoch 1/10, Loss: 0.8410891521527869
Epoch 2/10, Loss: 0.3872901620210912
Epoch 3/10, Loss: 0.3127447700666449
Epoch 4/10, Loss: 0.2740465437825681
Epoch 5/10, Loss: 0.24535594770234626
Epoch 6/10, Loss: 0.22276802713678207
Epoch 7/10, Loss: 0.20679125978977758
Epoch 8/10, Loss: 0.18940740286580052
Epoch 9/10, Loss: 0.17740572119016923
Epoch 10/10, Loss: 0.16434979045507608
AUC Score (class 2 vs all): 0.9901
