In [4]:
"""
We will be implementing a CNN (convolutional neural network) and doing image classification based
on the CIFAR-10 dataset (a dataset with 10 different classes like plane, bird, dog, truck, etc.)
This dataset is also available directly in PyTorch
"""

"""
CNNs are similar to normal neural networks:
    > They are made up of neurons that have learnable weights and biases
    > The main difference is that CNNs mainly work on image data and apply the so called
      "convolutional" filters
      
A typical CNN architecture has:
    > Image
    > Different convolutional layers and activation functions followed by "pooling" layers
        - These pooling layers are used to automatically learn some features from the images
    > Then at the end we have one or more "FC" (fully connected) layers for the actual classification
      task
      
Convolutional filters work by applying filter kernel to our image (remember the "What is convolution" 
video):
    > Put the filter at first position
    > Compute output value by multiplying and summing up all the values
    > Write value into output image
    > Slide our filter to next position
    > Then do same thing with same filter operations, just keep sliding over whole image
    
With this transform, our resulting image may have a smaller size, because the filter does not fit
at the borders of the image, unless we use a technique called padding (which we will not cover in
this lecture)
    > So getting the correct size is an important step
"""

"""
- Pooling layers -

In this case we are talking about max pooling
Max pooling is used to downsample an image by applying a "max" filter to sub-regions
This reduces the computational cost by reducing the size of the image
    > reducing the number of parameters our model has to learn
    > Also helps to avoid "overfitting" by providing an abstracted form of the input

NOTE: Downsample = to make smaller by making smaller sampling rate (look at image at 3:17)
"""

'\n- Pooling layers -\n\nIn this case we are talking about max pooling\nMax pooling is used to downsample an image by applying a "max" filter to sub-regions\nThis reduces the computational cost by reducing the size of the image\n    > reducing the number of parameters our model has to learn\n    > Also helps to avoid "overfitting" by providing an abstracted form of the input\n\nNOTE: Downsample = to make smaller by making smaller sampling rate (look at image at 3:17)\n'

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Device configuration: run on the GPU when CUDA is usable, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


In [3]:
# Hyper-parameters (everything a reader is likely to tune lives here)

learning_rate = 0.001  # step size used by the optimizer
batch_size = 4         # number of images per mini-batch
num_epochs = 4         # number of full passes over the training set

In [4]:
# The dataset yields PIL images (PIL = Pillow, a Python imaging library) in range [0, 1].
# They are converted to tensors and normalized per channel with mean 0.5 and std 0.5,
# which maps the values to the range [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR-10 ships with torchvision; it is downloaded into ./data on first use
train_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)

test_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)

# Data loaders take care of batching (and shuffling) for batch training
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True)

# The test data does not need to be shuffled
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Class names, hardcoded; position i is the name of label i
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Implement convolutional neural network
# The important part for this tutorial

"""
The layout:
    > Input
    > Feature learning:
        - Convolution layer + ReLU activation function
        - Max pooling
        - Repeat the above 2 (x times?), 2 times in this example
    > Classification:
        - Flatten
        - Fully connected
        - Softmax (included in cross-entropy)
"""

class ConvNet(nn.Module):
    """Small CNN for 3-channel 32x32 images such as CIFAR-10.

    Layout: (convolution -> ReLU -> max pooling) twice, followed by three
    fully connected layers. The network returns raw scores (logits); no
    softmax is applied because it is already included in the cross-entropy
    loss used as criterion.

    Args:
        num_classes: Size of the final output layer. Defaults to 10, the
            number of classes in CIFAR-10.
    """

    def __init__(self, num_classes: int = 10):
        # Must be called first in every nn.Module subclass (not only conv nets)
        super().__init__()

        # Define layers.
        # The activation functions are applied in forward(), not stored as layers.

        # Conv2d parameters are (input channel size, output channel size, kernel size):
        #   3 input channels, as our images have 3 color channels
        #   6 output channels
        #   5x5 kernel
        self.conv1 = nn.Conv2d(3, 6, 5)

        # MaxPool2d parameters are (kernel size, stride):
        #   a 2x2 pooling window that is shifted by 2 pixels at a time
        self.pool = nn.MaxPool2d(2, 2)

        # Input channel size must equal the previous layer's output channel size (6);
        # 16 output channels, kernel size is still 5
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Fully connected layers.
        # The output width of a conv/pool layer is
        #   ((input width - filter size + 2 * padding) / stride) + 1
        # For a 32x32 input: 32 -> conv1 -> 28 -> pool -> 14 -> conv2 -> 10 -> pool -> 5,
        # so each sample is 16 x 5 x 5 after the second pooling step and
        # flattening it gives 16*5*5 input features for the first linear layer.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        # One output per class
        self.fc3 = nn.Linear(84, num_classes)

        # The 120 and the 84 are intermediate sizes and can be played with.
        # The 16*5*5 is fixed by the layers above (for 32x32 inputs) and the
        # output size is fixed by the number of classes.

    def forward(self, x):
        """Compute class scores for a batch of images.

        Args:
            x: Image batch of shape (batch, 3, 32, 32).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.

        Raises:
            RuntimeError: If the spatial size of ``x`` does not produce
                16*5*5 features per sample.
        """
        # First convolutional and pooling layer (ReLU does not change the size)
        x = self.pool(F.relu(self.conv1(x)))

        # Second convolutional and pooling layer
        x = self.pool(F.relu(self.conv2(x)))

        # Flatten everything except the batch dimension. Unlike view(-1, 16*5*5),
        # this can never fold extra features of a wrongly sized input into the
        # batch dimension; such inputs fail loudly in fc1 instead.
        x = torch.flatten(x, 1)

        # Fully connected layers with ReLU in between
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        # No activation and no softmax at the end: it is included in the loss
        return self.fc3(x)
    
# Build the network, then move its parameters to the selected device
model = ConvNet()
model = model.to(device)

In [6]:
# Loss function and optimizer

# Criterion: cross-entropy loss, used because this is a multi-class classification problem
criterion = nn.CrossEntropyLoss()

# Optimizer: stochastic gradient descent over the model parameters with the
# configured learning rate
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)


In [7]:
# Training loop

n_total_steps = len(train_loader)  # number of batches per epoch (12_500)

for epoch in range(num_epochs):  # Loop over epochs
    for i, (images, labels) in enumerate(train_loader):  # Loop over batches
        # Original image batch shape = [4, 3, 32, 32]:
        #   4 samples in each batch
        #   3 color channels
        #   32 x 32 pixels (32 * 32 = 1024)
        # The first conv layer has 3 input channels, 6 output channels and kernel size 5

        # Pass images and labels to device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass: empty the gradients first, then backpropagate
        optimizer.zero_grad()
        loss.backward()

        # Optimizer step: update the parameters
        optimizer.step()

        # Report progress every 2000 batches
        # (the closing "]" after the step counter was missing before)
        if (i + 1) % 2000 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

print('Finished Training')

Epoch [1/4], Step [2000/12500, Loss: 2.2472
Epoch [1/4], Step [4000/12500, Loss: 2.2879
Epoch [1/4], Step [6000/12500, Loss: 2.3134
Epoch [1/4], Step [8000/12500, Loss: 2.2661
Epoch [1/4], Step [10000/12500, Loss: 2.4327
Epoch [1/4], Step [12000/12500, Loss: 1.7628
Epoch [2/4], Step [2000/12500, Loss: 2.6596
Epoch [2/4], Step [4000/12500, Loss: 2.0702
Epoch [2/4], Step [6000/12500, Loss: 2.3234
Epoch [2/4], Step [8000/12500, Loss: 1.6211
Epoch [2/4], Step [10000/12500, Loss: 1.9212
Epoch [2/4], Step [12000/12500, Loss: 1.9476
Epoch [3/4], Step [2000/12500, Loss: 1.7850
Epoch [3/4], Step [4000/12500, Loss: 2.8360
Epoch [3/4], Step [6000/12500, Loss: 2.4755
Epoch [3/4], Step [8000/12500, Loss: 1.9289
Epoch [3/4], Step [10000/12500, Loss: 1.8415
Epoch [3/4], Step [12000/12500, Loss: 0.6593
Epoch [4/4], Step [2000/12500, Loss: 1.5006
Epoch [4/4], Step [4000/12500, Loss: 1.5051
Epoch [4/4], Step [6000/12500, Loss: 1.5913
Epoch [4/4], Step [8000/12500, Loss: 1.2349
Epoch [4/4], Step [10000/1

In [11]:
# Evaluate model: overall and per-class accuracy on the test set

# Gradients are not needed for evaluation
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0] * len(classes)
    n_class_samples = [0] * len(classes)

    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)

        # torch.max returns (value, index); the index is the predicted class
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        # Iterate over the samples actually present in this batch: the last
        # batch can be smaller than batch_size, so range(batch_size) could
        # index past its end. tolist() yields plain ints for list indexing.
        for label, pred in zip(labels.tolist(), predicted.tolist()):
            if label == pred:
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    # Computed once after the loop; NaN when there were no samples at all
    accuracy = 100.0 * (n_correct / n_samples) if n_samples else float('nan')
    print(f'Accuracy of the network: {accuracy} %')

    for i in range(len(classes)):
        # A class without any test samples has no defined accuracy
        if n_class_samples[i]:
            accuracy = 100.0 * (n_class_correct[i] / n_class_samples[i])
        else:
            accuracy = float('nan')
        print(f'Accuracy of {classes[i]}: {accuracy} %')

Accuracy of the network: 46.23 %
Accuracy of plane: 53.6 %
Accuracy of car: 50.4 %
Accuracy of bird: 32.9 %
Accuracy of cat: 18.3 %
Accuracy of deer: 32.1 %
Accuracy of dog: 44.1 %
Accuracy of frog: 60.4 %
Accuracy of horse: 67.0 %
Accuracy of ship: 57.49999999999999 %
Accuracy of truck: 46.0 %


In [None]:
"""
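The network performs poorly (about 46 % test accuracy), most likely because it was trained for too
few epochs; try more epochs or maybe change other hyperparameters (e.g. learning rate, batch size)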
"""