### This notebook used the course's material up to lecture 5

##### Batch processing
##### Data normalization
##### Optimizers (SGD, "vanilla" SGD, Adam)
##### Cross-entropy loss
##### Softmax and Relu activation functions
##### Two different architectures (one or two hidden layers)

In [502]:
import torch
from torch import Tensor
from torch import optim
from torch import nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import LambdaLR
import dlc_practical_prologue as prologue

In [503]:
### Variables
# Size of data
size = 1000
# Batch size
batch_size = 100
# Number of hidden layers
n_hidden_layers = 50
# Criterion
criterion = torch.nn.CrossEntropyLoss()
# Learning rate (step size)
eta = 1e-1
# Number of epochs
nb_epochs = 500
# Parameter for loss computation
lambda_ = 1

In [504]:
# Load the data
train_input, train_target, train_classes, test_input, test_target, test_classes = \
prologue.generate_pair_sets(size)

In [505]:
# Function to one hot encode the class (not needed for now)
def classes_to_one_hot(class_):
    res = torch.zeros(10)#,dtype=int)
    d1 = class_
    res[d1.item()] = 1
    return res

In [520]:
# Fully connected neural network
class FNN(torch.nn.Module):
    def __init__(self, n_hidden, two_hidden_layers = False):
        super(FNN, self).__init__()
        self.two_hidden_layers = two_hidden_layers
        
        # Case 1 hidden layer
        if (not two_hidden_layers):
    
            # Fully connected layer 1: 196 (14*14) input pixels for each image -> number of hidden nodes
            self.fc1 = nn.Linear(196, n_hidden)

            # Fully connected layer 2: number of hidden nodes -> 10 outputs to recognize the digits
            self.fc2 = nn.Linear(n_hidden, 10)

            # Last fully connected layer, which predicts if the first number is smaller or equal to the second
            self.fc3 = nn.Linear(20, 2)
        
        # Case 2 hidden layers
        else :
            
            # Fully connected layer 1: 196 (14*14) input pixels for each image -> number of hidden nodes
            self.fc1 = nn.Linear(196, n_hidden)

            # Fully connected layer 2: number of hidden nodes -> number of hidden nodes
            self.fc2 = nn.Linear(n_hidden, n_hidden)

            # Fully connected layer 3: number of hidden nodes -> 10 outputs to recognize the digits
            self.fc3 = nn.Linear(n_hidden, 10)

            # Last fully connected layer, which predicts if the first number is smaller or equal to the second
            self.fc4 = nn.Linear(20, 2)
        

    def forward(self, x1, x2):
        # We separate the two images to feed them through the network separately
        
        # For the network to train, it needs to learn :
            # Whether each of the predicted digits are right
            # Whether its predicted ordering is right
            
        # Thus, we need to return through this forward function 
            # the two digits prediction as well as the ordering prediction
            
        # Conventions :
            # For the digit prediction, the index of the maximum response corresponds to the digit
            # For the ordering prediction, if the first number is smaller or equal to the second, predict 1 (true)
        
        
        # Case 1 hidden layer
        if (not self.two_hidden_layers):
            
            ### Forward-pass
            
            ## Image 1
            # Relu for non linearity
            x1 = F.relu(self.fc1(x1))        
            # Softmax for the digit classification
            x1 = F.softmax(self.fc2(x1))   
            
            ## Image 2
            # Relu for non linearity
            x2 = F.relu(self.fc1(x2)) 
            # Softmax for the digit classification
            x2 = F.softmax(self.fc2(x2))            

            # Concat them before moving on to the last layer
            x1x2 = torch.cat((x1,x2),1)                             
            y = F.softmax(self.fc3(x1x2))
        
        #Case 2 hidden layers
        else :
            
            ### Forward-pass
            
            ## Image 1
            # Relu for non linearity
            x1 = F.relu(self.fc1(x1))        
            x1 = F.relu(self.fc2(x1))   
            # Softmax for the digit classification
            x1 = F.softmax(self.fc3(x1))
            
            ## Image 2
            # Relu for non linearity
            x2 = F.relu(self.fc1(x2)) 
            x2 = F.relu(self.fc2(x2)) 
            # Softmax for the digit classification
            x2 = F.softmax(self.fc3(x2))   

            # Concat them before moving on to the last layer
            x1x2 = torch.cat((x1,x2),1)                             
            y = F.softmax(self.fc4(x1x2))
        
        return x1, x2, y

In [526]:
def train_model(model, train_input, train_target, train_classes, lambda_, nb_epochs, optimizer):
    # param : lamba_ is the coefficient used for giving more or less importance to the digit loss compared to the ordering loss
    
    for e in range(nb_epochs):
        for b in range(0, train_input.size(0), batch_size):
            
            # Data normalization
            mu, std = train_input.mean(), train_input.std()
            train_input.sub_(mu).div_(std)
            
            # Retrieving the corresponding train batch
            batch_train = train_input.narrow(0,b,batch_size)
            # Batch of pixels of the first images
            train_1 = batch_train.narrow(1,0,1).view(batch_size,-1)         
            # Batch of pixels of the second images
            train_2 = batch_train.narrow(1,1,1).view(batch_size,-1)
            
            # Retrieving the corresponding class batch
            batch_classes = train_classes.narrow(0,b,batch_size)
            # Batch of the classes of the first images
            classes_1 = batch_classes.narrow(1,0,1).flatten()
            # Batch of the classes of the second images
            classes_2 = batch_classes.narrow(1,1,1).flatten()
            
            ### Predictions & Loss
            d1,d2,pred = model(train_1, train_2)
            # Compute the loss for the first digit prediction
            d1_loss = criterion(d1, classes_1)
            # Compute the loss for the second digit prediction
            d2_loss = criterion(d2, classes_2)
            # Compute the loss for the ordering prediction
            pred_loss = criterion(pred, train_target.narrow(0,b,batch_size))
            
            # Reinitialize to 0 the gradients
            model.zero_grad()
            
            # Backward pass
            loss = (d1_loss + d2_loss)/2 + pred_loss*lambda_
            loss.backward()
            
            # Update
            optimizer.step()
            
        # Display the loss
        print("epoch =", e, ", loss = ",loss.item())    
        
        
def compute_nb_errors(model, test_input, test_target, test_classes):
    nb_errors_img1 = 0
    nb_errors_img2 = 0
    nb_errors_pred = 0
    data_size = 0
    
    for b in range(0, train_input.size(0), batch_size):

        # Retrieving the corresponding test batch
        batch_test = test_input.narrow(0,b,batch_size)
        # Batch of pixels of the first images
        test_1 = batch_test.narrow(1,0,1).view(batch_size,-1)         
        # Batch of pixels of the second images
        test_2 = batch_test.narrow(1,1,1).view(batch_size,-1)

        # Retrieving the corresponding class batch
        batch_classes = test_classes.narrow(0,b,batch_size)
        # Batch of the classes of the first images
        classes_1 = batch_classes.narrow(1,0,1).flatten()
        # Batch of the classes of the second images
        classes_2 = batch_classes.narrow(1,1,1).flatten()

        # Prediction
        d1,d2,pred = model(test_1, test_2)
        
        # Translate the predictions values to predicted classes
        d1_classes = d1.max(1).indices
        d2_classes = d2.max(1).indices
        pred_classes = pred.max(1).indices
        
        # Compute the number of errors
        for i in range(b, b + batch_size):
            data_size += 1
            if (d1_classes[i-b] != classes_1[i-b]):
                nb_errors_img1 += 1
            if (d2_classes[i-b] != classes_2[i-b]):
                nb_errors_img2 += 1
            if (pred_classes[i-b] != test_target[i]):
                nb_errors_pred += 1
        
    print("Number of total errors on image 1 classification : ", nb_errors_img1)
    print("Accuracy on image 1 classification : ", 100-nb_errors_img1/data_size*100, "%")
    print("Number of total errors on image 2 classification : ", nb_errors_img2)
    print("Accuracy on image 2 classification : ", 100-nb_errors_img2/data_size*100, "%")
    print("Number of total errors on ordering prediction : ", nb_errors_pred)
    print("Accuracy on ordering prediction : ", 100-nb_errors_pred/data_size*100, "%")
        

In [527]:
###Tests

# Model
model = FNN(n_hidden_layers, two_hidden_layers=False)

### Optimizers
# optimizer = optim.Adam(model.parameters(), lr = 1e-3)
# optimizer = optim.SGD(model.parameters(), lr = eta)
optimizer = optim.SGD(model.parameters(), lr = eta, momentum = 0.9)

# Train and error computation
train_model(model, train_input, train_target, train_classes, lambda_, nb_epochs, optimizer)
compute_nb_errors(model, test_input, test_target, test_classes)



epoch = 0 , loss =  2.979917526245117
epoch = 1 , loss =  2.885608434677124
epoch = 2 , loss =  2.6408355236053467
epoch = 3 , loss =  2.5096065998077393
epoch = 4 , loss =  2.4266695976257324
epoch = 5 , loss =  2.3399364948272705
epoch = 6 , loss =  2.2785463333129883
epoch = 7 , loss =  2.2419209480285645
epoch = 8 , loss =  2.215785026550293
epoch = 9 , loss =  2.1931910514831543
epoch = 10 , loss =  2.1758363246917725
epoch = 11 , loss =  2.1555418968200684
epoch = 12 , loss =  2.1368677616119385
epoch = 13 , loss =  2.1256942749023438
epoch = 14 , loss =  2.1146786212921143
epoch = 15 , loss =  2.1062557697296143
epoch = 16 , loss =  2.1007440090179443
epoch = 17 , loss =  2.10137939453125
epoch = 18 , loss =  2.1059203147888184
epoch = 19 , loss =  2.0989513397216797
epoch = 20 , loss =  2.092463493347168
epoch = 21 , loss =  2.0866806507110596
epoch = 22 , loss =  2.0774424076080322
epoch = 23 , loss =  2.081052541732788
epoch = 24 , loss =  2.0794808864593506
epoch = 25 , loss