# Image Classification with Convolutional Neural Networks

## 0. Imports

In [None]:
import torch
import torchvision #has various utils functions, including loading datasets
import torchvision.transforms as transforms #common image transformations

import torch.nn as nn #creating neural network
import torch.nn.functional as F #functional api for layers,...
import torch.optim as optim #for optimization algorithm

import matplotlib.pyplot as plt #for visualization
import numpy as np #for basic array operations

from torch.utils.data import DataLoader

## 1. Load Data - CIFAR10

In [None]:
# Preprocessing pipeline: convert each image to a tensor, then apply
# (x - 0.5) / 0.5 to each of the three channels.
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Number of samples the loaders put in each mini-batch.
batch_size = 4

# Training split of CIFAR10 (downloaded into ./data when missing), preprocessed
# with `transform` and served in shuffled mini-batches by two worker processes.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

In [None]:
# Test split of CIFAR10 with the same preprocessing as the training split;
# kept in dataset order (no shuffling).
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

# Human-readable names of the 10 CIFAR10 classes, indexed by integer label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
# Report the shape of the raw image array held by each split.
for heading, dataset in (("Training set: ", trainset), ("Test set: ", testset)):
    print(heading, dataset.data.shape)

#### Plot images

In [None]:
# functions to show an image

def imshow(img):
    """Display a normalized, channel-first image tensor with matplotlib.

    Args:
        img: image tensor laid out as (channels, height, width) whose values
            were normalized with mean 0.5 and standard deviation 0.5, as done
            by the loading transform in this notebook.
    """
    # Detach from autograd and move to host memory first: Tensor.numpy()
    # raises for tensors that require grad or live on a GPU.
    img = img.detach().cpu()
    img = img / 2 + 0.5  # undo the (x - 0.5) / 0.5 normalization for display
    npimg = img.numpy()
    # matplotlib expects (height, width, channels), so move channels last.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()




In [None]:
# Pull a single mini-batch from the (shuffled) training loader.
dataiter = iter(trainloader)
images, labels = next(dataiter)

# One right-aligned class name per image, in batch order.
label_names = [classes[labels[j]] for j in range(batch_size)]
print(' '.join(f'{name:>5s}' for name in label_names))

# Tile the batch into one grid image and display it.
imshow(torchvision.utils.make_grid(images))

## 2. Create and Train a Convolutional Neural Network (CNN)

In [None]:
# Pick the compute device: the first CUDA GPU when one is available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Prints "cuda:0" on a CUDA machine and "cpu" everywhere else.
print(device)

#### Create a CNN

Convolution and pooling are fundamental operations for building CNN models. There are a number of parameters and if their definitions are not clear, it could lead to great confusion.
- Parameters for convolution (pooling) layers
  - **stride:** how many pixels the filter moves at each step
  - **kernel size:** how large the kernel (filter) is
  - **number of filters (channels):** designates the "depth" of the data. Most image inputs have three channels (RGB)
  - **padding:** how many zeros are added around the border of the input
- How to calculate output size of convolution/pooling operation
  <br> 
*floor((W - F + 2P)/S) + 1* <br>
  - *W*: input size
  - *F*: kernel size
  - *P*: padding 
  - *S*: stride


> The 2D convolution is a fairly simple operation at heart: you start with a kernel, which is simply a small matrix of weights. This kernel “slides” over the 2D input data, performing an elementwise multiplication with the part of the input it is currently on, and then summing up the results into a single output pixel. - [Source](https://towardsdatascience.com/intuitively-understanding-convolutions-for-deep-learning-1f6f42faee1)

<img src="https://miro.medium.com/max/1070/1*Zx-ZMLKab7VOCQTxdZ1OAw.gif" style="max-width:400px;">

The Max Pooling 2D   
![alt text](https://user-images.githubusercontent.com/22738317/34081046-c3a97518-e347-11e7-98fe-929f602ee857.png)

Let's test the pooling operations on some examples

In [None]:
# Toy input with two channels, each a 4x3 grid: shape (2, 4, 3).
t = torch.tensor([[[0, 0, 1],
                   [1, 2, 3],
                   [7, 5, 3],
                   [5, 3, 6]],

                  [[9, 11, 10],
                   [10, 11, 12],
                   [31, 55, 32],
                   [17, 29, 32]]], dtype=torch.float64, requires_grad=True)

print(t.shape)
print(t)

# Pooling layer under test. To compare with average pooling, swap in
# nn.AvgPool2d(kernel_size=2, stride=1, padding=1).
t2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)

# Apply the pooling layer so the demo actually shows its effect. With
# kernel 2, stride 2 and padding 1 each 4x3 channel shrinks to 3x2
# (floor((W - F + 2P)/S) + 1 along each spatial axis).
pooled = t2(t)
print()
print(pooled.shape)
print(pooled)

# Flatten every dimension after the first: (2, 4, 3) -> (2, 12).
t3 = torch.flatten(t, 1)
print()
print(t3.shape)
print(t3)

In [None]:
# Convolutional neural network (two convolutional layers + three fully connected layers)
"""nn.Conv2d(in_channels, out_channels, kernel_size)"""
"""nn.MaxPool2d(kernel_size, stride)"""
"""nn.Linear(in_features, out_features)"""

class ConvNet(nn.Module):
    """Small CNN classifier for 3-channel 32x32 images.

    Two convolution + ReLU + max-pool blocks feed a three-layer fully
    connected head. The first linear layer takes 16 * 5 * 5 features, which
    is what the two blocks produce for a 3x32x32 input.

    Args:
        num_classes: number of output scores (one per class). Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # First convolutional block: 3x32x32 -> 6x28x28 -> 6x14x14
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        # Second convolutional block: 6x14x14 -> 16x10x10 -> 16x5x5
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        # Fully connected head: two hidden layers + output layer with one
        # score per class. The ReLUs between the linear layers are required;
        # without them the three layers collapse into a single linear map.
        # NOTE: the ReLU entries shift the Sequential indices of the linear
        # layers, so checkpoints saved from an activation-free head will not
        # load into this one.
        self.layer3 = nn.Sequential(
            nn.Linear(16 * 5 * 5, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        """Return the raw class scores, shape (batch, num_classes), for batch x."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = torch.flatten(out, 1)  # flatten all dimensions except batch
        out = self.layer3(out)
        return out

# Build the 10-class network, then move its parameters to the selected
# device (the GPU when one is available).
convnet = ConvNet(num_classes=10)
convnet = convnet.to(device)

## Optimizers - [source](https://medium.com/@Biboswan98/optim-adam-vs-optim-sgd-lets-dive-in-8dbf1890fbdc)
* **S**tochastic **G**radient **D**escent 
* **Ada**ptive **M**oment estimation (Adam)


In [None]:
# Loss applied to the network outputs and the integer class labels.
criterion = nn.CrossEntropyLoss()

# Adam over all network parameters.
# Alternative to try: optim.SGD(convnet.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(params=convnet.parameters(), lr=1e-3)

#### Train

In [None]:
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):  # one full pass over the training set per epoch
    print(f'Epoch {epoch+1}:')
    running_loss = 0.0
    # Count mini-batches from 1 so the progress report can use the counter directly.
    for step, batch in enumerate(trainloader, start=1):
        # Each batch is [inputs, labels]; move both to the training device.
        inputs = batch[0].to(device)
        labels = batch[1].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, then the weight update.
        outputs = convnet(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the mean loss over every window of 2000 mini-batches.
        running_loss += loss.item()
        if step % 2000 == 0:
            print(f'Iteration {step}, loss = {(running_loss / 2000):.3f}')
            running_loss = 0.0

print('Finished Training')

#### Save trained model

In [None]:
# Checkpoint file for the trained weights.
PATH = './cifar_net.pth'

# Persist only the parameters (state dict), not the whole module object.
trained_weights = convnet.state_dict()
torch.save(trained_weights, PATH)

#### Load saved model and predict some images

In [None]:
# Take the first mini-batch of the test set.
dataiter = iter(testloader)
data = next(dataiter)
images = data[0]
labels = data[1]

# Show the images with their true labels. Iterate over the labels themselves
# rather than a hard-coded count so this works for any batch size.
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[label] for label in labels))

# Rebuild the network on the CPU and restore the saved weights.
# map_location='cpu' lets a checkpoint written during GPU training be loaded
# on a machine without CUDA; the images above are CPU tensors as well.
convnet = ConvNet()
convnet.load_state_dict(torch.load(PATH, map_location='cpu'))
convnet.eval()  # inference mode for any layers that behave differently in training

# No gradients are needed for prediction.
with torch.no_grad():
    outputs = convnet(images)

# The predicted class is the index of the highest score in each row.
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' '.join('%5s' % classes[index] for index in predicted))

#### Accuracy on Test set

In [None]:
# Running counts of correctly classified and total test images.
correct = 0
total = 0

# Evaluation only, so gradient tracking is switched off.
with torch.no_grad():
    for images, labels in testloader:
        # Scores for every class; the index of the largest one is the prediction.
        outputs = convnet(images)
        predicted = outputs.max(dim=1).indices
        matches = predicted == labels
        total += labels.size(0)
        correct += matches.sum().item()

accuracy_percent = 100 * correct / total
print('Accuracy of the network on the 10000 test images: %d %%' % accuracy_percent)

#### Accuracy on test set per class

In [None]:
# Per-class tallies, keyed by class name and starting at zero.
correct_pred = dict.fromkeys(classes, 0)
total_pred = dict.fromkeys(classes, 0)

# Evaluation only, so gradient tracking is switched off.
with torch.no_grad():
    for images, labels in testloader:
        outputs = convnet(images)
        predictions = outputs.max(dim=1).indices
        # Compare plain Python ints, one (label, prediction) pair per image.
        for label, prediction in zip(labels.tolist(), predictions.tolist()):
            classname = classes[label]
            total_pred[classname] += 1
            if label == prediction:
                correct_pred[classname] += 1


# Report the percentage of correctly classified images for each class.
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f"Accuracy for class {classname:5s} is: {accuracy:.1f} %")

# Your turn: choose a different dataset, modify the architecture, run experiments on it, and compare the results.