### This notebook used the course's material up to lecture 5

##### Batch processing
##### Data normalization
##### Optimizers (SGD, "vanilla" SGD, Adam)
##### Cross-entropy loss
##### Softmax and Relu activation functions
##### Two different architectures (one or two convolutional layers)
##### Convolutions
##### Pooling

In [105]:
import torch
from torch import Tensor
from torch import optim
from torch import nn
from torch.nn import functional as F
import dlc_practical_prologue as prologue

In [106]:
### Variables
# Size of data
size = 1000
# Batch size
batch_size = 100
# Number of hidden layers
n_hidden_layers = 50
# Criterion
criterion = torch.nn.CrossEntropyLoss()
# Learning rate (step size)
eta = 1e-1
# Number of epochs
nb_epochs = 500
# Parameter for loss computation
lambda_ = 1

In [107]:
# Load the data
train_input, train_target, train_classes, test_input, test_target, test_classes = \
prologue.generate_pair_sets(size)

In [108]:
# Function to one hot encode the class (not needed for now)
def classes_to_one_hot(class_):
    res = torch.zeros(10)#,dtype=int)
    d1 = class_
    res[d1.item()] = 1
    return res

In [109]:
# Fully connected neural network
class CNN(torch.nn.Module):
    def __init__(self, n_hidden, two_conv_layers = False):
        super(CNN, self).__init__()
        self.two_conv_layers = two_conv_layers
        
        # Case 1 convolutional layer
        if (not two_conv_layers):
            
            # Convolutional layer 1: 1*14*14 -> 10*12*12
            self.conv1 = nn.Conv2d(1, 10, kernel_size = 3)
            
            # Fully connected layer 1: 360 (10*6*6) -> number of hidden nodes
            self.fc1 = nn.Linear(360, n_hidden)
            
            # Fully connected layer 2: number of hidden nodes -> 10 outputs to recognize the digits
            self.fc2 = nn.Linear(n_hidden, 10)
            
            # Last fully connected layer, which predicts if the first number is smaller or equal to the second
            self.fc3 = nn.Linear(20,2)
        
        # Case 2 convolutional layers
        else :
            
            # Convolutional layer 1: 1*14*14 -> 10*12*12
            self.conv1 = nn.Conv2d(1, 10, kernel_size = 3)
            
            # Convolutional layer 2: 10*6*6 -> 10*4*4
            self.conv2 = nn.Conv2d(10, 50, kernel_size = 3)
            
            # Fully connected layer 1: 200 (50*2*2) -> number of hidden nodes
            self.fc1 = nn.Linear(200, n_hidden)
            
            # Fully connected layer 2: number of hidden nodes -> 10 outputs to recognize the digits
            self.fc2 = nn.Linear(n_hidden, 10)
            
            # Last fully connected layer, which predicts if the first number is smaller or equal to the second
            self.fc3 = nn.Linear(20,2)
        

    def forward(self, x1, x2):
        # We separate the two images to feed them through the network separately
        
        # For the network to train, it needs to learn :
            # Whether each of the predicted digits are right
            # Whether its predicted ordering is right
            
        # Thus, we need to return through this forward function 
            # the two digits prediction as well as the ordering prediction
            
        # Conventions :
            # For the digit prediction, the index of the maximum response corresponds to the digit
            # For the ordering prediction, if the first number is smaller or equal to the second, predict 1 (true)
        
        
        # Case 1 hidden layer
        if (not self.two_conv_layers):
            
            ### Forward-pass
            
            ## Image 1
            # Max-pooling with kernel_size and stride of 2 : 12*12 -> 6*6 & Relu
            x1 = F.relu(F.max_pool2d(self.conv1(x1), kernel_size=2, stride=2))
            # Relu for non linearity
            x1 = F.relu(self.fc1(x1.view(-1,360)))
            # Softmax for the digit classification
            x1 = F.softmax(self.fc2(x1))
            
            ## Image 2
            # Max-pooling with kernel_size and stride of 2 : 12*12 -> 6*6 & Relu
            x2 = F.relu(F.max_pool2d(self.conv1(x2), kernel_size=2, stride=2))
            # Relu for non linearity
            x2 = F.relu(self.fc1(x2.view(-1,360)))
            # Softmax for the digit classification
            x2 = F.softmax(self.fc2(x2))  

            # Concat them before moving on to the last layer
            x1x2 = torch.cat((x1,x2),1)                             
            y = F.softmax(self.fc3(x1x2))
        
        #Case 2 hidden layers
        else :

            ### Forward-pass
            
            ## Image 1
            # Max-pooling with kernel_size and stride of 2 : 12*12 -> 6*6 & Relu
            x1 = F.relu(F.max_pool2d(self.conv1(x1), kernel_size=2, stride=2))
            # Max-pooling with kernel_size and stride of 2 : 4*4 -> 2*2 & Relu
            x1 = F.relu(F.max_pool2d(self.conv2(x1), kernel_size=2, stride=2))
            # Relu for non linearity
            x1 = F.relu(self.fc1(x1.view(-1,200)))
            # Softmax for the digit classification
            x1 = F.softmax(self.fc2(x1))
            
            ## Image 2
            # Max-pooling with kernel_size and stride of 2 : 12*12 -> 6*6 & Relu
            x2 = F.relu(F.max_pool2d(self.conv1(x2), kernel_size=2, stride=2))
            # Max-pooling with kernel_size and stride of 2 : 4*4 -> 2*2 & Relu
            x2 = F.relu(F.max_pool2d(self.conv2(x2), kernel_size=2, stride=2))
            # Relu for non linearity
            x2 = F.relu(self.fc1(x2.view(-1,200)))
            # Softmax for the digit classification
            x2 = F.softmax(self.fc2(x2))  

            # Concat them before moving on to the last layer
            x1x2 = torch.cat((x1,x2),1)                             
            y = F.softmax(self.fc3(x1x2))
        
        return x1, x2, y

In [110]:
def train_model(model, train_input, train_target, train_classes, lambda_, nb_epochs, optimizer):
    # param : lamba_ is the coefficient used for giving more or less importance to the digit loss compared to the ordering loss
    
    for e in range(nb_epochs):
        for b in range(0, train_input.size(0), batch_size):
            
            # Data normalization
            mu, std = train_input.mean(), train_input.std()
            train_input.sub_(mu).div_(std)
            
            # Retrieving the corresponding train batch
            batch_train = train_input.narrow(0,b,batch_size)
            # Batch of pixels of the first images
            train_1 = batch_train.narrow(1,0,1)    
            # Batch of pixels of the second images
            train_2 = batch_train.narrow(1,1,1)
            
            # Retrieving the corresponding class batch
            batch_classes = train_classes.narrow(0,b,batch_size)
            # Batch of the classes of the first images
            classes_1 = batch_classes.narrow(1,0,1).flatten()
            # Batch of the classes of the second images
            classes_2 = batch_classes.narrow(1,1,1).flatten()
                        
            ### Predictions & Loss
            d1,d2,pred = model(train_1, train_2)
            # Compute the loss for the first digit prediction
            d1_loss = criterion(d1, classes_1)
            # Compute the loss for the second digit prediction
            d2_loss = criterion(d2, classes_2)
            # Compute the loss for the ordering prediction
            pred_loss = criterion(pred, train_target.narrow(0,b,batch_size))
            
            # Reinitialize to 0 the gradients
            model.zero_grad()
            
            # Backward pass
            loss = (d1_loss + d2_loss)/2 + pred_loss*lambda_
            loss.backward()
            
            # Update
            optimizer.step()
            
        # Display the loss
        print("epoch =", e, ", loss = ",loss.item())    
        
        
def compute_nb_errors(model, test_input, test_target, test_classes):
    nb_errors_img1 = 0
    nb_errors_img2 = 0
    nb_errors_pred = 0
    data_size = 0
    
    for b in range(0, train_input.size(0), batch_size):

        # Retrieving the corresponding test batch
        batch_test = test_input.narrow(0,b,batch_size)
        # Batch of pixels of the first images
        test_1 = batch_test.narrow(1,0,1)     
        # Batch of pixels of the second images
        test_2 = batch_test.narrow(1,1,1)

        # Retrieving the corresponding class batch
        batch_classes = test_classes.narrow(0,b,batch_size)
        # Batch of the classes of the first images
        classes_1 = batch_classes.narrow(1,0,1).flatten()
        # Batch of the classes of the second images
        classes_2 = batch_classes.narrow(1,1,1).flatten()

        # Prediction
        d1,d2,pred = model(test_1, test_2)
        
        # Translate the predictions values to predicted classes
        d1_classes = d1.max(1).indices
        d2_classes = d2.max(1).indices
        pred_classes = pred.max(1).indices
        
        # Compute the number of errors
        for i in range(b, b + batch_size):
            data_size += 1
            if (d1_classes[i-b] != classes_1[i-b]):
                nb_errors_img1 += 1
            if (d2_classes[i-b] != classes_2[i-b]):
                nb_errors_img2 += 1
            if (pred_classes[i-b] != test_target[i]):
                nb_errors_pred += 1
        
    print("Number of total errors on image 1 classification : ", nb_errors_img1)
    print("Accuracy on image 1 classification : ", 100-nb_errors_img1/data_size*100, "%")
    print("Number of total errors on image 2 classification : ", nb_errors_img2)
    print("Accuracy on image 2 classification : ", 100-nb_errors_img2/data_size*100, "%")
    print("Number of total errors on ordering prediction : ", nb_errors_pred)
    print("Accuracy on ordering prediction : ", 100-nb_errors_pred/data_size*100, "%")
        

In [111]:
###Tests

# Model
model = CNN(n_hidden_layers, two_conv_layers=False)

### Optimizers
# optimizer = optim.Adam(model.parameters(), lr = 1e-3)
# optimizer = optim.SGD(model.parameters(), lr = eta)
optimizer = optim.SGD(model.parameters(), lr = eta, momentum = 0.9)

# Train and error computation
train_model(model, train_input, train_target, train_classes, lambda_, nb_epochs, optimizer)
compute_nb_errors(model, test_input, test_target, test_classes)



epoch = 0 , loss =  2.973578453063965
epoch = 1 , loss =  2.8664662837982178
epoch = 2 , loss =  2.5590877532958984
epoch = 3 , loss =  2.4093241691589355
epoch = 4 , loss =  2.3336148262023926
epoch = 5 , loss =  2.307248830795288
epoch = 6 , loss =  2.294355630874634
epoch = 7 , loss =  2.290672779083252
epoch = 8 , loss =  2.2490901947021484
epoch = 9 , loss =  2.2316231727600098
epoch = 10 , loss =  2.2420172691345215
epoch = 11 , loss =  2.245359420776367
epoch = 12 , loss =  2.2704901695251465
epoch = 13 , loss =  2.215036153793335
epoch = 14 , loss =  2.219799518585205
epoch = 15 , loss =  2.207732677459717
epoch = 16 , loss =  2.2058193683624268
epoch = 17 , loss =  2.216407537460327
epoch = 18 , loss =  2.1901345252990723
epoch = 19 , loss =  2.1795897483825684
epoch = 20 , loss =  2.1836776733398438
epoch = 21 , loss =  2.199657917022705
epoch = 22 , loss =  2.169644832611084
epoch = 23 , loss =  2.189760208129883
epoch = 24 , loss =  2.175579786300659
epoch = 25 , loss =  2.