# CIFAR 10 image recognition

# Imports


In [1]:
import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

# Getting and preparing data

In [None]:
# Separate transforms for training (with augmentation) and test/validation (no augmentation)
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),                                   # Random crop with padding (32x32 images)
    transforms.RandomHorizontalFlip(),                                      # 50% chance of horizontal flip
    transforms.ToTensor(),                                                  # scaling to [0, 1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))                  # scaling to [-1, 1]
])

test_transform = transforms.Compose([
    transforms.ToTensor(),                                                  # scaling to [0, 1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))                  # scaling to [-1, 1]
])

In [None]:
# Downloads data if not already present, and loads it as a PyTorch dataset
full_trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=None)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=None)

# Split training set into train and validation (80-20) randomly (every class should be represented in both sets due to large amount of data)
train_size = int(0.8 * len(full_trainset))
val_size = len(full_trainset) - train_size
trainset, valset = torch.utils.data.random_split(full_trainset, [train_size, val_size])

print(f"Training set size: {train_size}")
print(f"Validation set size: {val_size}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 40000
Validation set size: 10000
Test set size: 10000


In [None]:
# Augment train and validation sets with the appropriate transforms (train gets augmentation, val gets no augmentation)
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=train_transform)
val_dataset = torchvision.datasets.CIFAR10(root='./data',train=True,download=False, transform=test_transform)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=False, transform=test_transform)

trainset = torch.utils.data.Subset(train_dataset, trainset.indices)
valset = torch.utils.data.Subset(val_dataset, valset.indices)

In [5]:
classes = train_dataset.classes
img, label = train_dataset[0]
print("Classes:", classes)
print()
print(img.shape)
print()
print(torch.min(img), torch.max(img))
print()
print(img)

Classes: ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

torch.Size([3, 32, 32])

tensor(-1.) tensor(1.)

tensor([[[-0.0588, -0.0745,  0.0196,  ..., -0.8745, -0.8039, -1.0000],
         [-0.0510, -0.0196,  0.0196,  ..., -0.7020, -0.7412, -1.0000],
         [-0.4118, -0.3176, -0.1922,  ..., -0.5373, -0.6078, -1.0000],
         ...,
         [ 0.1843,  0.6941,  0.3333,  ...,  0.3176,  0.3882, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]],

        [[-0.3412, -0.3412, -0.2627,  ..., -0.9451, -0.8118, -1.0000],
         [-0.3333, -0.3176, -0.2941,  ..., -0.8431, -0.8039, -1.0000],
         [-0.6314, -0.5529, -0.4745,  ..., -0.7490, -0.7490, -1.0000],
         ...,
         [-0.0745,  0.4431,  0.0980,  ...,  0.0118,  0.1294, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000,  .

## Definition of the model

In [6]:
# Definition of a convolutional neural network for image classification
# The network must have 3 input neurons, corresponding to the 3 color channels (RGB) of the input images
# The network must have 10 output neurons, corresponding to the 10 classes in the CIFAR-10 dataset

class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(24, 32, kernel_size=3, padding=1)

        self.pool = nn.MaxPool2d(2, 2)

        self.fc1 = nn.Linear(32 * 8 * 8, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        # conv1 -> relu -> pool -> conv2 -> relu -> pool -> conv3 -> relu -> flatten -> fc1 -> relu -> fc2
        x = self.pool(F.relu(self.conv1(x)))   # (3, 32, 32) -> (16, 16, 16)
        x = self.pool(F.relu(self.conv2(x)))   # (16, 16, 16) -> (24, 8, 8)
        x = F.relu(self.conv3(x))              # (24, 8, 8) -> (32, 8, 8)
        x = torch.flatten(x, 1)                # (batch, 2048)
        x = F.relu(self.fc1(x))                # (batch, 2048) -> (batch, 256)
        x = self.fc2(x)                        # (batch, 256) -> (batch, 10)

        return x

## Training

In [7]:
model = SimpleCNN()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 16
batch_size = 32
learning_rate = 0.001
patience = 2                # number of epochs with validation loss smaller than best_val_loss required for early stopping
 

loss_fn = nn.CrossEntropyLoss()                                         # best for multi-class classification problems
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
best_val_loss = float('inf')
counter = 0

for epoch in range(epochs):

    model.train() 
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in trainloader:
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(trainloader)
    epoch_acc = correct / total

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for images, labels in valloader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
    
    val_epoch_loss = val_loss / len(valloader)
    val_epoch_acc = val_correct / val_total

    print(f"Epoch [{epoch+1}/{epochs}]")
    print(f"Train Loss: {epoch_loss:.4f}, Train Accuracy: {epoch_acc:.4f}")
    print(f"Val Loss:   {val_epoch_loss:.4f}, Val Accuracy:   {val_epoch_acc:.4f}")
    print("-" * 50)

    if val_epoch_loss < best_val_loss:
        best_val_loss = val_epoch_loss
        counter = 0
        torch.save(model.state_dict(), "best_model.pth")
    else:
        counter += 1
    
    if counter >= patience:
        print("Early stopping triggered")
        break

Epoch [1/16]
Train Loss: 1.6253, Train Accuracy: 0.4037
Val Loss:   1.2784, Val Accuracy:   0.5316
--------------------------------------------------
Epoch [2/16]
Train Loss: 1.3056, Train Accuracy: 0.5256
Val Loss:   1.1196, Val Accuracy:   0.5949
--------------------------------------------------
Epoch [3/16]
Train Loss: 1.1672, Train Accuracy: 0.5806
Val Loss:   1.0064, Val Accuracy:   0.6360
--------------------------------------------------
Epoch [4/16]
Train Loss: 1.0682, Train Accuracy: 0.6187
Val Loss:   0.9496, Val Accuracy:   0.6638
--------------------------------------------------
Epoch [5/16]
Train Loss: 0.9962, Train Accuracy: 0.6454
Val Loss:   0.9179, Val Accuracy:   0.6738
--------------------------------------------------
Epoch [6/16]
Train Loss: 0.9509, Train Accuracy: 0.6615
Val Loss:   0.8612, Val Accuracy:   0.7024
--------------------------------------------------
Epoch [7/16]
Train Loss: 0.9084, Train Accuracy: 0.6797
Val Loss:   0.8402, Val Accuracy:   0.7072
-

## Testing on Test Set

In [11]:
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.eval()

correct = 0
total = 0
test_loss = 0.0

all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in testloader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        loss = loss_fn(outputs, labels)
        test_loss += loss.item()

        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_accuracy = correct / total
test_loss = test_loss / len(testloader)

print("TEST RESULTS")
print()
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f} ({correct}/{total})")
print()

cm = confusion_matrix(all_labels, all_preds)
cm_df = pd.DataFrame(cm, index=classes, columns=classes)

print("Confusion Matrix:")
print(cm_df)
print()

TEST RESULTS

Test Loss: 0.7445
Test Accuracy: 0.7411 (7411/10000)

Confusion Matrix:
            airplane  automobile  bird  cat  deer  dog  frog  horse  ship  \
airplane         789          39    60   10    12    0    10      5    39   
automobile        13         891     6    3     1    1     5      1     6   
bird              60          11   665   31    76   51    63     25     6   
cat               24          19    85  544    65  113    88     34     8   
deer              25           4    69   29   745   19    58     47     3   
dog               15           9    80  178    62  553    41     48     7   
frog               3           6    53   42    30   11   843      4     5   
horse             19           4    42   23    53   41     7    793     2   
ship              93          63    18    9     6    5     7      3   764   
truck             35          95     8    5     4    3     9      7    10   

            truck  
airplane       36  
automobile     73  
bird  

## Per-Class Accuracy

In [None]:
# Per-class accuracy
class_correct = {str(i): 0 for i in range(10)}
class_total = {str(i): 0 for i in range(10)}

model.eval()
with torch.no_grad():
    for images, labels in testloader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        
        for i in range(len(labels)):
            label = str(labels[i].item())
            class_total[label] += 1
            if predicted[i] == labels[i]:
                class_correct[label] += 1

print("Per-Class Accuracy:")
print("-" * 35)
for i in range(10):
    if class_total[str(i)] > 0:
        acc = 100 * class_correct[str(i)] / class_total[str(i)]
        print(f"{classes[i]:15s}: {acc:6.2f}% ({class_correct[str(i)]}/{class_total[str(i)]})")


Per-Class Accuracy:
-----------------------------------
airplane       :  78.90% (789/1000)
automobile     :  89.10% (891/1000)
bird           :  66.50% (665/1000)
cat            :  54.40% (544/1000)
deer           :  74.50% (745/1000)
dog            :  55.30% (553/1000)
frog           :  84.30% (843/1000)
horse          :  79.30% (793/1000)
ship           :  76.40% (764/1000)
truck          :  82.40% (824/1000)
