In [1]:
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from torchvision import datasets, transforms
import torch.optim as optim
import torch.nn.functional as F
import datetime

In [2]:
#Download CIFAR 10 dataset for training and validation purposes and apply the following changes on each image:
# 1) make it a tensor
# 2) normalize it based on the mean and standard deviation among all pixels in each channel (RGB).
#Print the size of training and validation datasets

data_path = '../data-unversioned/p1ch7/'
# Make it a tensor
# t_cifar10 = datasets.CIFAR10(data_path, train=True, download=True, transform=transforms.ToTensor())
# t_cifar10_val = datasets.CIFAR10(data_path, train=False, download=True, transform=transforms.ToTensor())

# # # combine all images
# imgs = torch.stack([img_t for img_t, _ in t_cifar10], dim=3) # training dataset images
# imgs_val = torch.stack([img_t for img_t, _ in t_cifar10_val], dim=3) # validation dataset images

# # # Computing mean per channel = 0.4914, 0.4822, 0.4465
# imgs_mean = imgs.view(3,-1).mean(dim=1) # (0.4914, 0.4822, 0.4465),
# # # Computing std = 0.2470, 0.2435, 0.2616
# imgs_std = imgs.view(3, -1).std(dim=1) # (0.2470, 0.2435, 0.2616)
# print("imgs_mean", imgs_mean)
# print("imgs_std", imgs_std)

# # # Computing mean per channel
# imgs_val_mean = imgs_val.view(3,-1).mean(dim=1) # [0.4914, 0.4822, 0.4465]
# # # Computing std
# imgs_val_std = imgs_val.view(3, -1).std(dim=1) # [0.2470, 0.2435, 0.2616]
# print("imgs_val_mean", imgs_val_mean)
# print("imgs_val_std:", imgs_val_std)


# Normalize and transform datasets
transformed_cifar10 = datasets.CIFAR10(data_path, train=True, download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize(mean = (0.4914, 0.4822, 0.4465),
                                                                std = (0.2470, 0.2435, 0.2616))
                                           ]))

transformed_cifar10_val = datasets.CIFAR10(data_path, train=False, download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize(mean = (0.4942, 0.4851, 0.4504),
                                                                std = (0.2467, 0.2429, 0.2616))
                                           ]))


print("Size of training dataset:", len(transformed_cifar10))
print("Size of validation dataset:", len(transformed_cifar10_val))

Size of training dataset: 50000
Size of validation dataset: 10000


In [3]:
#We want to make a tertiary classifier that distinguishes between deers, dogs, and horses, labeled as 4, 5, and 7, resp.
#Create the subset training and validation datasets for this purpose.
#Print the size of these datasets.
label_map = {4:0, 5:1, 7:2}
class_names = ["deers", "dogs", "horses"]
cifar2 = [(img, label_map[label])
    for img, label in transformed_cifar10
    if label in [4, 5, 7]]
cifar2_val = [(img, label_map[label])
    for img, label in transformed_cifar10_val
    if label in [4, 5, 7]]

print("Cifar2 size", len(cifar2))
print("Cifar2_val size", len(cifar2_val))

Cifar2 size 15000
Cifar2_val size 3000


In [4]:
#Create a parameterized CNN with the following details. 
# The parameter is the number of output channels n after the first convolution.
# All kernels are of size 3 by 3.
# Convolutions must not change the height and width.
# Each convolution is followed by hyperbolic tangent as the activation function, and max pooling of size 2 by 2.
# Convolution ayers:
# 1) First convolution layer works on the input RGB input. Let's assume there are n kernels in this layer.
# 2) Second convolution layer works on the result of the preceding max pooling layer. 
#    Let's assume there are n/2 kernels in this layer.
# 3) Third convolution layer works on the result of the preceding max pooling layer. 
#    Let's assume there are n/2 kernels in this layer. 
# Fully connected layers:
# 1) First fully connected layer works on the result of the preceding max pooling layer. 
#    This layer is followed by hyperbolic tangent as its activation function.
# 2) Second fully connected layer works on the result of the preceding activation function, and emits numbers associated
#    with each class.
# We will use negative log likelihood to compute the loss. So you may add additional layer(s) to your network.
# Note: Since the network is parameterized (n), you'd rather define the CNN as a subclass of nn.Module.
import torch.nn.functional as F

# padding: keep the output image size the same
class Net(nn.Module):
    # n = number of output channels n after the first convolution
    def __init__(self, n):
        super().__init__()
        self.conv1 = nn.Conv2d(3, n, kernel_size=3, padding=1) # 3x3, 3 Channels -> n Channels
        self.conv2 = nn.Conv2d(n, n//2, kernel_size=3, padding=1) # 3x3, n C -> n/2 C
        self.conv3 = nn.Conv2d(n//2, n//2, kernel_size=3, padding=1) # 3x3, n/2 C -> n/2 C
        
        # turn multichannel 2D features into 1D vector
        self.ch = 4 * 4 * (n//2) # W x H x num of kernels
        self.fc1 = nn.Linear(self.ch, 32)
        self.fc2 = nn.Linear(32, 3)
        
    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2) # 32 x 32 x n -> kernel is 3
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2) # 16 x 16 x n/2
        out = F.max_pool2d(torch.tanh(self.conv3(out)), 2) # 8 x 8 x n/2
        out = out.view(-1, self.ch) # 4 x 4 x n/4 # call view to turn it into B x N vector
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        out = F.log_softmax(out, dim=1)
        return out

In [16]:
#Create two networks as instances of the CNN you defined above, with n = 16 and n = 32 respectively. 
#Print the total number of parameters in each of these instances.
model = Net(16)
model2 = Net(32)
# model = Net()

numel_list_1 = [p.numel() for p in model.parameters()]
numel_list_2 = [p.numel() for p in model2.parameters()]
print("Total num of parameters in CNN with n = 16:", sum(numel_list_1)) 
print("Total num of parameters in CNN with n = 32:", sum(numel_list_2))

Total num of parameters in CNN with n = 16: 6419
Total num of parameters in CNN with n = 32: 16163


In [6]:
#Our training functionality is supposed to compute gradient on batches of training data, randlomy selected each time.
#To this end, create a training data loader with batch size 32 that randomizes access to each batch.
#Also, create a validation data loader with the same batch size that does not randomize access to each batch (no need!)
#Print the number of batches in training and validation data loaders

train_loader = torch.utils.data.DataLoader(cifar2, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=32, shuffle=False)

print("Number of batches:")
print("- Training data loader: ", len(train_loader))
print("- Validation data loader: ", len(val_loader))

Number of batches:
- Training data loader:  469
- Validation data loader:  94


In [7]:
#Define your training function that receives the training loader, model, loss function, optimizer, the device (cpu/gpu), and 
# number of epochs.
#In each epoch, you should go through each training data batch, and:
# 1) move data to device
# 1) compute the output batch, and accordingly the loss
# 2) compute the gradient of loss wrt parameters, and update the parameters
#After covering all epochs, your training function must report the training accuracy

def training_loop(train_loader, model, loss_fn, optimizer, device, n_epochs):
    for epoch in range(1, n_epochs+1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            # move data to device
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            
            outputs = model(imgs) # compute output batch
            loss = loss_fn(outputs, labels) # compute loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_train += loss.item()
            
        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(datetime.datetime.now(), epoch, loss_train / len(train_loader)))
        

In [8]:
#Define a separate function that receives the validation data loader as well as the model and computes the validation 
# accuracy of the model.

def validate(model, train_loader, val_loader):
    for name, loader in [("train", train_loader), ("val", val_loader)]:
        correct = 0
        total = 0
        
        with torch.no_grad():
            for imgs, labels in loader:
                imgs = imgs.to(device=device)
                labels = labels.to(device=device)
                outputs = model(imgs)
                _, predicted = torch.max(outputs, dim=1)
                total += labels.shape[0]
                correct += int((predicted == labels).sum())
        print("Accuracy {}: {:.6f}".format(name , correct / total))

In [9]:
#Define device dynamically based on whether CUDA is available or not.
#Call the training function on the created training data loader, the created CNN  with n = 16, 
# negative log likelihood loss function, stochastic gradient descent optimizer,
# the device you defined, and 100 epochs. Next, call validation accuracy function.
#Is the model overfit? (Yes/No) Why?

# move model to specified device
device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
print(f"Training on device {device}.")
model = Net(16).to((device))

learning_rate = 1e-2
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
n_epochs=100
loss_fn = nn.NLLLoss()

training_loop(n_epochs=n_epochs, train_loader=train_loader, model=model, loss_fn=loss_fn, optimizer=optimizer, device=device)

print("Training and Validation Accuracy of model = Net(16)")
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64,
shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64,
shuffle=False)
validate(model=model, train_loader=train_loader, val_loader=val_loader)

print('''\nThe model is slightly overfit as the training accuracy is higher than the validation accuracy.
Overfitting happens when the model is memorizing the training data but it is not able to generalize results as well as
for new data.
This CNN model has shown to be more effective to generalize compared to the model in the previous homework
given that the difference of accuracies of this model is around 8%, while the model in HW7 had a difference
of almost 30% between training and validation accuracies.''')

Training on device cuda.
2022-10-28 21:43:22.727780 Epoch 1, Training loss 1.0687145461151595
2022-10-28 21:43:31.308988 Epoch 10, Training loss 0.6519320540463747
2022-10-28 21:43:40.754543 Epoch 20, Training loss 0.5460886877101622
2022-10-28 21:43:50.788713 Epoch 30, Training loss 0.4842876726503311
2022-10-28 21:44:02.468107 Epoch 40, Training loss 0.44314215120984546
2022-10-28 21:44:12.191828 Epoch 50, Training loss 0.4153198055557605
2022-10-28 21:44:22.152500 Epoch 60, Training loss 0.3929845809237535
2022-10-28 21:44:32.247569 Epoch 70, Training loss 0.37235292569914863
2022-10-28 21:44:42.367197 Epoch 80, Training loss 0.352102104105802
2022-10-28 21:44:53.637983 Epoch 90, Training loss 0.33404847685652755
2022-10-28 21:45:04.279127 Epoch 100, Training loss 0.31983378202295
Training and Validation Accuracy of model = Net(16)
Accuracy train: 0.882000
Accuracy val: 0.796667

The model is slightly overfit as the training accuracy is higher than the validation accuracy.
Overfitti

In [10]:
#Call the training function on the created training data loader, the created CNN  with n = 32, 
# negative log likelihood loss function, stochastic gradient descent optimizer,
# the device you defined, and 100 epochs. Next, call validation accuracy function.
#Is the model overfit? (Yes/No) Why? 
# (This can be compared to the fully connected network we created in the last set of exercises.)

# move model to specified device
print(f"Training on device {device}.")
device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
model2 = model2.to(device=device)

learning_rate = 1e-2
optimizer2 = optim.SGD(model2.parameters(), lr=learning_rate)
n_epochs=100
loss_fn = nn.NLLLoss()


training_loop(n_epochs=n_epochs, train_loader=train_loader, model=model2, loss_fn=loss_fn, optimizer=optimizer2, device=device)

print("Training and Validation Accuracy of model2 = Net(32)")
validate(model=model2, train_loader=train_loader, val_loader=val_loader)

print("\nThe model is overfit since the training accuracy is higher than the validation accuracy.")

Training on device cuda.
2022-10-28 21:45:05.186636 Epoch 1, Training loss 1.0865204055258568
2022-10-28 21:45:10.883717 Epoch 10, Training loss 0.7063407745767147
2022-10-28 21:45:17.081244 Epoch 20, Training loss 0.5551923820312987
2022-10-28 21:45:23.422732 Epoch 30, Training loss 0.4609005456275128
2022-10-28 21:45:29.783699 Epoch 40, Training loss 0.40060229548748505
2022-10-28 21:45:36.157002 Epoch 50, Training loss 0.35617608979661414
2022-10-28 21:45:42.544285 Epoch 60, Training loss 0.31755396476451386
2022-10-28 21:45:48.914805 Epoch 70, Training loss 0.2826795511106227
2022-10-28 21:45:56.036558 Epoch 80, Training loss 0.2502235442082933
2022-10-28 21:46:02.275982 Epoch 90, Training loss 0.21977652301813694
2022-10-28 21:46:08.653578 Epoch 100, Training loss 0.19207787412278196
Training and Validation Accuracy of model2 = Net(32)
Accuracy train: 0.930733
Accuracy val: 0.814667

The model is overfit since the training accuracy is higher than the validation accuracy.


In [11]:
#Next, let's consider L2 regularization with weight decay 0.002 for CNN with n = 32. 
# Is the model overfit? (Yes/No) Why?
# optimizer accepts weight decay as input
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)

model3 = Net(32).to(device=device)
optimizer3 = optim.SGD(model3.parameters(), lr=learning_rate, weight_decay=0.002)
training_loop(n_epochs=n_epochs, train_loader=train_loader, model=model3, loss_fn=loss_fn, optimizer=optimizer3, device=device)
validate(model=model3, train_loader=train_loader, val_loader=val_loader)
print('''The model is overfit since the training accuracy is higher than the validation accuracy.''')

2022-10-28 21:46:09.618247 Epoch 1, Training loss 1.084126143252596
2022-10-28 21:46:15.363517 Epoch 10, Training loss 0.721956385703797
2022-10-28 21:46:21.729078 Epoch 20, Training loss 0.5965765701963547
2022-10-28 21:46:27.899219 Epoch 30, Training loss 0.51014675003417
2022-10-28 21:46:34.245993 Epoch 40, Training loss 0.4431341862424891
2022-10-28 21:46:40.518281 Epoch 50, Training loss 0.3972970224441366
2022-10-28 21:46:46.870352 Epoch 60, Training loss 0.36258569752916375
2022-10-28 21:46:54.128976 Epoch 70, Training loss 0.3320204309326537
2022-10-28 21:47:00.857053 Epoch 80, Training loss 0.30535245575803394
2022-10-28 21:47:07.273492 Epoch 90, Training loss 0.28116863439691825
2022-10-28 21:47:13.565209 Epoch 100, Training loss 0.2592972617834172
Accuracy train: 0.907467
Accuracy val: 0.821667
The model is overfit since the training accuracy is higher than the validation accuracy.


In [12]:
#Add a skip connection in your CNN from the output of second max pooling to the input of 3rd max pooling.
#Train the updated CNN with the same parameters including (n = 32).
#Is the model overfit? (Yes/No) Why?

# Version of previous Net() CNN using skip connections a la ResNet
class ResNet(nn.Module):
    # n = number of output channels n after the first convolution
    def __init__(self, n):
        super().__init__()
        self.conv1 = nn.Conv2d(3, n, kernel_size=3, padding=1) # 3x3, 3 Channels -> n Channels
        self.conv2 = nn.Conv2d(n, n//2, kernel_size=3, padding=1) # 3x3, n C -> n/2 C
        self.conv3 = nn.Conv2d(n//2, n//2, kernel_size=3, padding=1) # 3x3, n/2 C -> n/4 C
        
        # turn multichannel 2D features into 1D vector
        self.ch = 4 * 4 * (n//2) # W x H x num of kernels
        self.fc1 = nn.Linear(self.ch, 32) # n/4 x n/4 x n/4
        self.fc2 = nn.Linear(32, 3)
        
    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2) # 32 x 32 x n -> kernel is 3
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2) # 16 x 16 x n
        out1 = out
        out = F.max_pool2d(torch.tanh(self.conv3(out)) + out1, 2) # Skip Connection
        out = out.view(-1, self.ch) # n/4 * n/4 * n/4 # call view to turn it into B x N vector
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        out = F.log_softmax(out, dim=1)
        return out

# Train model with the same parameters (n=32)
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
model4 = ResNet(32).to(device=device)
optimizer4 = optim.SGD(model4.parameters(), lr=learning_rate, weight_decay=0.002)
training_loop(
    n_epochs=n_epochs,
    train_loader=train_loader,
    model=model4,
    loss_fn=loss_fn,
    optimizer=optimizer4,
    device=device)


validate(model=model4, train_loader=train_loader, val_loader=val_loader)
print('''since the training accuracy is higher than the validation accuracy.''')

2022-10-28 21:47:14.522744 Epoch 1, Training loss 1.0447596453605814
2022-10-28 21:47:20.213905 Epoch 10, Training loss 0.6446497010423782
2022-10-28 21:47:26.672977 Epoch 20, Training loss 0.5031503773750143
2022-10-28 21:47:32.910465 Epoch 30, Training loss 0.4266185052217321
2022-10-28 21:47:39.225668 Epoch 40, Training loss 0.37924097383275945
2022-10-28 21:47:45.519182 Epoch 50, Training loss 0.3437925416738429
2022-10-28 21:47:52.618466 Epoch 60, Training loss 0.3143698365764415
2022-10-28 21:47:59.796633 Epoch 70, Training loss 0.2878950150723153
2022-10-28 21:48:06.311784 Epoch 80, Training loss 0.26431462878876544
2022-10-28 21:48:12.844013 Epoch 90, Training loss 0.24269612193741696
2022-10-28 21:48:19.263775 Epoch 100, Training loss 0.22147898173078578
Accuracy train: 0.923800
Accuracy val: 0.822333
since the training accuracy is higher than the validation accuracy.


In [13]:
# Train model with the same parameters (n=32)
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)

model5 = ResNet(32).to(device=device)
optimizer5 = optim.SGD(model5.parameters(), lr=learning_rate, weight_decay=0.002)
training_loop(
    n_epochs=n_epochs,
    train_loader=train_loader,
    model=model5,
    loss_fn=loss_fn,
    optimizer=optimizer5,
    device=device
)


validate(model=model5, train_loader=train_loader, val_loader=val_loader)

2022-10-28 21:48:20.230913 Epoch 1, Training loss 1.0636494948508892
2022-10-28 21:48:25.925853 Epoch 10, Training loss 0.6373084157071215
2022-10-28 21:48:32.306407 Epoch 20, Training loss 0.5080725383251271
2022-10-28 21:48:38.540143 Epoch 30, Training loss 0.43420869784152255
2022-10-28 21:48:44.991761 Epoch 40, Training loss 0.38446929714781175
2022-10-28 21:48:51.319364 Epoch 50, Training loss 0.34611960801672426
2022-10-28 21:48:57.610758 Epoch 60, Training loss 0.31461222431761154
2022-10-28 21:49:03.948139 Epoch 70, Training loss 0.287566935128354
2022-10-28 21:49:10.089911 Epoch 80, Training loss 0.2628550377614955
2022-10-28 21:49:16.435296 Epoch 90, Training loss 0.24069075454422767
2022-10-28 21:49:22.791518 Epoch 100, Training loss 0.22019156496575537
Accuracy train: 0.911600
Accuracy val: 0.822000


In [14]:
#Consider dropout layers after each max pooling in the original CNN, where the probability of zeroing output features is 30%.
#Train the updated CNN with the same parameters including (n = 32).
#Is the model overfit? (Yes/No) Why?

# padding: keep the output image size the same
class NetDropout(nn.Module):
    # n = number of output channels n after the first convolution
    def __init__(self, n):
        super().__init__()
        self.conv1 = nn.Conv2d(3, n, kernel_size=3, padding=1) # 3x3, 3 Channels -> n Channels
        self.conv1_dropout = nn.Dropout2d(p=0.3)
        self.conv2 = nn.Conv2d(n, n//2, kernel_size=3, padding=1) # 3x3, n C -> n/2 C
        self.conv2_dropout = nn.Dropout2d(p=0.3)
        self.conv3 = nn.Conv2d(n//2, n//2, kernel_size=3, padding=1) # 3x3, n/2 C -> n/4 C
        self.conv3_dropout = nn.Dropout2d(p=0.3)
        
        # turn multichannel 2D features into 1D vector
        self.ch = 4 * 4 * (n//2) # W x H x num of kernels
        self.fc1 = nn.Linear(self.ch, 32) # n/4 x n/4 x n/4
        self.fc2 = nn.Linear(32, 3)
        
    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2) # 32 x 32 x n -> kernel is 3
        out = self.conv1_dropout(out)
        
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2) # 16 x 16 x n
        out = self.conv2_dropout(out)
        
        out = F.max_pool2d(torch.tanh(self.conv3(out)), 2) # 8 x 8 x n
        out = self.conv3_dropout(out)
        
        out = out.view(-1, self.ch) # n/4 * n/4 * n/4 # call view to turn it into B x N vector
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        out = F.log_softmax(out, dim=1)
        
        return out
    
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)

model6 = NetDropout(32).to(device=device)
optimizer6 = optim.SGD(model6.parameters(), lr=learning_rate)
training_loop(
    n_epochs=n_epochs,
    train_loader=train_loader,
    model=model6,
    loss_fn=loss_fn,
    optimizer=optimizer6,
    device=device)


validate(model=model6, train_loader=train_loader, val_loader=val_loader)
print('''The difference between the training and validation accuracy is around 1 percent, so the model is slightly
overfit, but well fit compared to the previous CNNs.''')

2022-10-28 21:49:23.777695 Epoch 1, Training loss 1.0902992547826564
2022-10-28 21:49:29.866585 Epoch 10, Training loss 0.8505581247045638
2022-10-28 21:49:36.695218 Epoch 20, Training loss 0.749748404228941
2022-10-28 21:49:43.625868 Epoch 30, Training loss 0.6906673886674516
2022-10-28 21:49:50.582686 Epoch 40, Training loss 0.6562702217000596
2022-10-28 21:49:58.317441 Epoch 50, Training loss 0.6304895234868881
2022-10-28 21:50:05.201773 Epoch 60, Training loss 0.601687741279602
2022-10-28 21:50:12.047203 Epoch 70, Training loss 0.581660072981043
2022-10-28 21:50:18.917601 Epoch 80, Training loss 0.5793321301328375
2022-10-28 21:50:25.740500 Epoch 90, Training loss 0.5601626330233634
2022-10-28 21:50:32.584002 Epoch 100, Training loss 0.5537662220762131
Accuracy train: 0.781533
Accuracy val: 0.751000
The difference between the training and validation accuracy is around 1 percent, so the model is slightly
overfit, but well fit compared to the previous CNNs.


In [15]:
#Considering all the modifications which one works better? Plain CNN, CNN+L2, CNN+Skip, CNN+Dropout?
print('''The CNN that worked better was the CNN+Skip layer in terms of highest training and validation accuracies with
the lowest overfitting. However, if we are looking for a well fit CNN then the CNN+Dropout performed best since it had around
1% difference between the training and validation data.''')


The CNN that worked better was the CNN+Skip layer in terms of highest training and validation accuracies with
the lowest overfitting. However, if we are looking for a well fit CNN then the CNN+Dropout performed best since it had around
1% difference between the training and validation data.
