In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils as utils
from torch.utils.data import Dataset, DataLoader,random_split
from torch.nn import functional as F
import numpy as np
import random
import warnings
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.pyplot as plt
from torchvision import transforms
import torchvision.transforms as transforms
import torchvision.models as models
import os
from time import sleep
from IPython.display import clear_output
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
from torchvision import datasets
import torchvision
import mat73
from auxilery_functions_and_classes import *
matplotlib.use('Agg')
#%matplotlib inline
%matplotlib tk

## 3. Deep Convolutional Network (using VGG13), Classifing CIFAR10

Instructions:

1. Complete all the missing code until the Transfer Learning section.
2. Run the code. Do you get a good performance? What is the problem in the process?
3. Start your journey for the best performance with the scheduler. Try to improve it. Document your experiments.
4. Now you will try adding Weight Decay. Try three numbers in the range [0.0005 - 0.005]. Document your experiments.
5. The next step will be using Dropout. Try small probability values [0 - 0.25]. Document your experiments.
6. The last addition will be augmentations. Be careful! try one at a time, and only combine two or more when you see improvments. Document your experiments.
7. What is the best performance that you got?

In [2]:
torch.cuda.empty_cache()

# Setting fixed seeds
seed = 0 
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.use_deterministic_algorithms(True)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.benchmark=False

In [3]:
# Defining a Params object for this part
hparams = Hyper_Params()

In [4]:
# This is the augmentations cell. You will use this later in this assignment









augmentations = [#transforms.RandomHorizontalFlip(p=0.5), 
                    transforms.RandomVerticalFlip(p=0.5), 
                    transforms.RandomRotation(degrees=(-10,10)), 
                    #transforms.RandomPerspective(distortion_scale=0.5,p=0.5), 
                    transforms.GaussianBlur(kernel_size=(3,3), sigma=(0.1, 1.0)), 
                    transforms.ColorJitter(brightness=(0.25,0.75), contrast=(0.25,0.75)), 
                    #transforms.RandomAffine(scale=(.9, 1.1)) 
                    ] 


In [5]:
# Creating a Cifar class from cifar10.mat
train = torchvision.datasets.CIFAR10(root="cifar10",train=True,download=True)
test = torchvision.datasets.CIFAR10(root="cifar10",train=False,download=True)
train_data = torch.tensor(train.data).to(float())/255
test_data=torch.tensor(test.data).to(float())/255
mean_imn = torch.mean(train_data, dim=(0, 1, 2))
std_imn = torch.std(train_data, dim=(0, 1, 2))
print("means:",[round(value, 3) for value in mean_imn.tolist()],"standard diviation:",[round(value, 3) for value in std_imn.tolist()]) 


# TODO: Define the required transforms for the train and the test sets: convert the images to tensors and do normalization 
hparams.train_transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize(mean_imn, std_imn)]+augmentations) 
hparams.test_transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize(mean_imn, std_imn)]) 
class_dict = {0:'airplane', 1:'automobile', 2:'bird', 3:'cat', 4:'deer', 5:'dog', 6:'frog', 7:'horse', 8:'ship', 9:'truck'}

Files already downloaded and verified
Files already downloaded and verified
means: [0.491, 0.482, 0.447] standard diviation: [0.247, 0.243, 0.262]


In [6]:
#Apply transformations
train.transform = hparams.train_transform 
test.transform = hparams.test_transform 

In [7]:
#Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Defining hyper-parameters




# TODO: Set the learning rate (try around 0.03) and the momentum (start with 0.9) 
hparams.lr =  0.03 
hparams.momentum = 0.9 # in the SGDM algorithm, we use momentum to accelrate convergence by adding a fraction of the previous gradient step, thus overcome local minima and oscillations. 
# TODO: Define the dropout probability (0.0 at first) 
hparams.dropout_probability = 0.15

hparams.epochs = 12 
hparams.batch_size = 250

hparams.scheduler_step_size = 8 # the number of epochs between each scheduler step 
hparams.scheduler_factor = 0.3 # the factor by which the scheduler changes the learning rate 
hparams.iter_break = 100 # number of iterations between validation steps
hparams.space = 150 # for visualisation
hparams.lines = 200 # for visualisation

In [8]:
# Defining a VGG block
class VGGBlock(nn.Module):
   def __init__(self,input_size,output_size):
      super(VGGBlock,self).__init__()
      








      self.Block = nn.Sequential(torch.nn.Conv2d(in_channels=input_size,out_channels=output_size, padding=1, kernel_size=(3,3), padding_mode="zeros"), # TODO: complete a convolution layer with kernel size 3x3 and padding size of 1 
                                 torch.nn.BatchNorm2d(output_size), # TODO: complete a batch normalization layer 
                                 torch.nn.ReLU(inplace=True), # TODO: complete an activation layer 
                                 torch.nn.Conv2d(in_channels=output_size,out_channels=output_size, padding=1, kernel_size=(3,3), padding_mode="zeros"), # TODO: complete a convolution layer with kernel size 3x3 and padding size of 1 
                                 torch.nn.BatchNorm2d(output_size), # TODO: complete a batch normalization layer 
                                 torch.nn.ReLU(inplace=True), # TODO: complete an activation layer
                                 torch.nn.MaxPool2d((2,2))  # TODO: complete a max pooling layer with kernal size 2 and stride 2 
      ) 
 
      
   def forward(self,x):
       return self.Block(x)

In [9]:
# Defining VGG13 using VGG Blocks
class VGG13(nn.Module):
    def __init__(self,input_size:tuple,channels:list,output_size:int,p=0):
      super(VGG13,self).__init__()
      self.vggblocks = nn.Sequential(VGGBlock(input_size[0],channels[0]), # 16x16
                                      VGGBlock(channels[0],channels[1]),  # 8x8
                                      VGGBlock(channels[1],channels[2]),  # 4x4
                                      VGGBlock(channels[2],channels[3]),  # 2x2
                                      VGGBlock(channels[3],channels[4]))  # 1x1
      



      
      # TODO: Define a dropout layer with probability p 
      self.dropout = torch.nn.Dropout(p) 
      # TODO: Define the final linear layer 
      self.linear = torch.nn.Linear(in_features=channels[4]*1, out_features=output_size) 
      # TODO: Define a flatten layer 
      self.flatten = torch.nn.Flatten()
     
      
    def forward(self,x):
       




      # TODO: Complete the vggblocks sequence 
      z1 = self.vggblocks(x) 
      # TODO: Complete the dropout layer 
      z2 = self.dropout(z1) 
      # TODO: Complete the flatten of z2 
      z3 = self.flatten(z2) 
      # TODO: Complete the final linear layer 
      out = self.linear(z3) 
    
      return out

In [10]:
hparams.weight_decay = 0.0005

In [11]:
# Define the model, optimizer, loss criterion and Scheduler and Data loaders
model = VGG13(input_size=(3,32,32),channels=[64,128,256,512,512],output_size=10,p=hparams.dropout_probability).to(device) 

# TODO: Define the optimizer using sgd with momentum. Add weight decay here 
optimizer = torch.optim.SGD(params=model.parameters(), lr=hparams.lr, momentum= hparams.momentum, weight_decay=hparams.weight_decay) 

# TODO: Define the criterion using cross entropy 
criterion = torch.nn.CrossEntropyLoss() 

# TODO: Define the learning rate scheduler of type step 
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=hparams.scheduler_step_size, gamma=hparams.scheduler_factor)

# TODO: Define dataloaders (from torch) for the train and test sets, set the num_workers to 16 and use shuffle for training 
train_loader = torch.utils.data.DataLoader(train, hparams.batch_size, shuffle=True, num_workers=16)
test_loader = torch.utils.data.DataLoader(test, hparams.batch_size, shuffle=False, num_workers=16) 

In [12]:
# Evaluation function
def evaluate(test_loader,model,criterion,acc_function):
    with torch.no_grad():
        loss = 0
        acc = 0
        for i,batch in enumerate(test_loader):
            data,target = batch
            output = model(data.to(device))
            batch_loss = criterion(output,target.to(device))
            batch_acc = acc_function(output,target.to(device))
            loss+= batch_loss
            acc+=batch_acc
        return loss.item()/len(test_loader),acc/len(test_loader)

In [13]:
def train_with_scheduler(hparams,train_loader:DataLoader,test_loader,model:nn.Module,optimizer:torch.optim,scheduler:torch.optim.lr_scheduler,criterion,acc_function):
      hparams.fig, (hparams.ax1, hparams.ax2) = plt.subplots(2, 1, figsize=(15, 9))
      print_preformance_grid(Flag=True)
      
      for epoch in range(hparams.epochs):
            
            model.train()
            hparams.epoch_accuracy_train = np.zeros(len(train_loader))
            hparams.epoch_loss_train = np.zeros(len(train_loader))
      
            for i,batch in enumerate(train_loader):

                # TODO: reset the optimizer gradient 
                optimizer.zero_grad() 
                data,target = batch
                target = target.reshape(target.shape[0]).to(device)
                output = model(data.to(device))

                # TODO: Use the criterion on the output and target to get the train loss 
                loss = criterion(output,target)

                # TODO: Complete the accuracy calculation 
                accuracy = acc_function(output, target) 


                # TODO: complete the backward 
                loss.backward()

                # TODO: complete the algorithm step 
                optimizer.step()
               
                hparams.epoch_accuracy_train[i] = accuracy
                hparams.epoch_loss_train[i] = loss.item()
                
                if (i==0 and epoch ==0) or (i+1) % hparams.iter_break==0:
                    torch.cuda.empty_cache()
                    model.eval()
                    test_loss,test_accuracy = evaluate(test_loader,model,criterion,acc_function)                  
                    print_performance(epoch,i,hparams,test_loss,test_accuracy)     
                    model.train()

            # TODO: complete the scheduler step 
            scheduler.step() 

      plt.show()

In [14]:
# Training
train_with_scheduler(hparams=hparams,train_loader=train_loader,test_loader=test_loader,model=model,optimizer=optimizer,scheduler=scheduler,criterion=criterion,acc_function=multi_class_accuracy)# to full writ
present_confusion_matrix(model,test_loader,10,class_dict)

| Epoch | Batch |      Train Loss      | Train Accuracy | Test Loss | Test Accuracy |
-------------------------------------------------------------------------------------
|   0   |   0   |         2.6          |      0.1       |    2.3    |      0.1      |
-------------------------------------------------------------------------------------
|   1   |  100  |         2.67         |      0.12      |   38.97   |     0.11      |
-------------------------------------------------------------------------------------
|   1   |  200  |         2.45         |      0.15      |   55.22   |      0.1      |
-------------------------------------------------------------------------------------
|   2   |  100  |         2.13         |      0.2       |   13.12   |      0.1      |
-------------------------------------------------------------------------------------
|   2   |  200  |         2.1          |      0.21      |   13.38   |      0.1      |
------------------------------------------------------

## 4. Transfer Learning using VGG16, on Desserts dataset

Instructions:

1. No code to complete in this section. Just run it!
2. You do need to improve the performance though... Use what you learned and see where you get. Document your experiments.
3. What is your best result?

In [15]:
torch.cuda.empty_cache()

# Setting fixed seeds
seed = 6 
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.use_deterministic_algorithms(True)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.benchmark=False

In [16]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Defining hyper-parameters
hparams = Hyper_Params()

output_size = 6 # Number of Classes
hparams.lr =  0.00005
hparams.epochs = 12
hparams.scheduler_step_size = 6
hparams.scheduler_factor = 0.2
hparams.batch_size = 64
hparams.space=50
hparams.iter_break = 5

In [17]:
# Creating a desserts class from desserts.mat
mat_data = mat73.loadmat('desserts.mat') 
train_data = torch.tensor(mat_data['XTrain'],dtype=torch.float32).permute(3,2,0,1)/255
train_labels = torch.tensor(mat_data['YTrain'])
test_data = torch.tensor(mat_data['XTest'],dtype=torch.float32).permute(3,2,0,1)/255
test_labels = torch.tensor(mat_data['YTest'])

class CustomDessertsDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        image = self.data[index]
        label = self.labels[index]
        
        if self.transform:
            image = self.transform(image)
            
        return image, label-1

In [18]:
# Define a training set and print it
train_set = CustomDessertsDataset(train_data,train_labels)
print_deserts(train_set)

In [19]:
# Demonstrating that Vgg16 trained on imagnet brings nonsense results (what happens without transfer learning)
model = torchvision.models.vgg16(weights='IMAGENET1K_V1')

# Normaize dataset with IMAGENET statistics
mean_imn = [0.485, 0.456, 0.406]
std_imn = [0.229, 0.224, 0.225]
hparams.train_transform = transforms.Compose([transforms.Normalize(mean_imn,std_imn)])

# Create train and test sets
train = CustomDessertsDataset(train_data,train_labels,transform=hparams.train_transform)
test = CustomDessertsDataset(test_data,test_labels,transform=hparams.train_transform)
train_loader,test_loader = prepare_dataloaders(train,test,hparams.batch_size)

display_mislabaled(model,train_loader,mean_imn,std_imn)

FileNotFoundError: [Errno 2] No such file or directory: 'imagenet_classes.txt'

In [None]:
# Freezing the model weights, with the exeption of the last x layers
print("Learnable parameters before and afetr freeze:")
print(sum(p.numel() for p in model.parameters() if p.requires_grad))
model = layers_to_optimize(model, 2,p1=0.5,output_size=6).to(device)
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

In [None]:
# Defining Criterion, optimizer, Scheduler
criterion = nn.CrossEntropyLoss()
optimizer= optim.Adam(list(p for p in model.parameters() if p.requires_grad) , lr=hparams.lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=hparams.scheduler_step_size, gamma=hparams.scheduler_factor,verbose=False)
hparams.lines = len(train_loader)

In [None]:
#Augmentations cell, as before. Set augmentations and then use them.
hparams.test_transform = transforms.Compose([transforms.Normalize(mean_imn,std_imn)])

In [None]:
# Create train and test sets
train = CustomDessertsDataset(train_data,train_labels,transform=hparams.train_transform)
test = CustomDessertsDataset(test_data,test_labels,transform=hparams.test_transform)
train_loader,test_loader = prepare_dataloaders(train,test,hparams.batch_size)

In [None]:
# Training
train_with_scheduler(hparams,train_loader,test_loader,model,optimizer,scheduler,criterion,multi_class_accuracy)
present_confusion_matrix(model,test_loader,6,{0:"canoli", 1:"moose", 2:"churros",3:"creme brule", 4:"cupcake", 5:"doughnuts" })