In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm

import numpy as np

torch.manual_seed(95)

<torch._C.Generator at 0x15f0a399f10>

In [2]:
train_dataset = torchvision.datasets.CIFAR10(root='materials/data/',
                                             train=True,  
                                             transform=transforms.ToTensor(), 
                                             download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=100, 
                                           shuffle=True)

test_dataset = torchvision.datasets.CIFAR10(root='./data', 
                                            train=False,
                                            download=True, 
                                            transform=transforms.ToTensor())
test_loader = torch.utils.data.DataLoader(test_dataset, 
                                          batch_size=64,
                                          shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [125]:
class Perceptron(nn.Module):
    """ 
    The basic perceptron. 
    A fully connected layer, which takes input, multiplies it by the weights matrix, 
    adds bias and uses an activation function.
    """
    def __init__(
        self, 
        input_dim: int, 
        output_dim: int, 
        bias: bool = True,
        activation: callable = torch.sigmoid
    ):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.bias = bias
        self.activation_func = activation
        # Use the class method to initialize the weights.
        self.fc = self._init_weights(nn.Linear(input_dim, output_dim, bias=bias))        
        
    def _init_weights(self, layer):
        """
        I decided to initialize weights with the Xavier's algorithm here, 
        since we are using the Sigmoid activation function
        
        In the case of ReLU I would switch the initializer to the "Kaiman's" a.k.a "He".
        """
        layer.weight.data = nn.init.xavier_uniform_(
            layer.weight.data, 
            gain=nn.init.calculate_gain(self.activation_func.__name__)
        )
        return layer
        
    def forward(self, input):
        fc_out = self.fc(input)
        output = self.activation_func(fc_out)
        return output
        

class SimpleLinearModel(nn.Module):
    """
    A very simple model with 4 hidden layers.
    Each layer is a Perceptron with different number of units.
    """
    def __init__(self, input_shape, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = Perceptron(input_shape, hidden_dim * 5)
        self.fc2 = Perceptron(hidden_dim * 5, hidden_dim * 4)
        self.fc3 = Perceptron(hidden_dim * 4, hidden_dim * 3)
        self.fc4 = Perceptron(hidden_dim * 3, hidden_dim * 2)
        self.fc5 = Perceptron(hidden_dim * 2, hidden_dim)
        self.fc6 = Perceptron(hidden_dim, output_dim)
        
    def forward(self, input):
        # As far as the model is going to take various images as the input:
        # input.shape -> (n_images, n_channels, width, height) 
        # We have to flatten the incoming parts of the tensor, which are actually the images. 
        input = torch.flatten(input, start_dim=1)        
        # input = input.view(input.shape[0], -1)
        layer_1 = self.fc1.forward(input)
        layer_2 = self.fc2.forward(layer_1)
        layer_3 = self.fc3.forward(layer_2)
        layer_4 = self.fc4.forward(layer_3)
        layer_5 = self.fc5.forward(layer_4)
        layer_6 = self.fc6.forward(layer_5)
        return layer_6

In [30]:
# Test how the torch function works
lin = nn.Linear(10, 2, bias=True)
torch.nn.init.xavier_uniform_(lin.weight.data, gain=nn.init.calculate_gain('sigmoid'))

tensor([[ 0.5124, -0.0262,  0.5933,  0.6523,  0.2227, -0.1314,  0.2060,  0.6567,
         -0.2944,  0.0543],
        [ 0.2842, -0.2615,  0.0070, -0.3143, -0.5217,  0.5771, -0.2221, -0.0046,
          0.5049,  0.4867]])

In [31]:
# Test how the torch function works
lin = nn.Linear(4, 2, bias=True)
weights = torch.nn.init.kaiming_normal_(lin.weight.data)

print(f'Initialized weights:\n{weights}\n\nStandard Deviation:\n{weights.std():.3f}')

Initialized weights:
tensor([[ 0.0060, -0.0932,  0.9977,  0.4943],
        [-0.7330,  1.0627,  0.7955, -0.0533]])

Standard Deviation:
0.631


In [120]:
# Initialize the model, the loss and the optimizer.
model = SimpleLinearModel(3072, 2, 10)
model.train()
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [121]:
for param in model.parameters():
    print(param.data)

tensor([[ 0.0302, -0.0117,  0.0426,  ..., -0.0244,  0.0253,  0.0214],
        [ 0.0047,  0.0376,  0.0014,  ...,  0.0277,  0.0013, -0.0027],
        [-0.0272,  0.0380,  0.0402,  ..., -0.0210, -0.0312, -0.0022],
        ...,
        [ 0.0308,  0.0068, -0.0423,  ...,  0.0277, -0.0342, -0.0340],
        [ 0.0299, -0.0118,  0.0103,  ...,  0.0123,  0.0302,  0.0278],
        [ 0.0398,  0.0367,  0.0303,  ..., -0.0111, -0.0050, -0.0173]])
tensor([-0.0003, -0.0099, -0.0041,  0.0140, -0.0152, -0.0067,  0.0128, -0.0176,
         0.0055,  0.0006])
tensor([[-0.1157,  0.1383, -0.1043, -0.4139, -0.5422,  0.4862, -0.1357,  0.0311,
          0.4555,  0.1627],
        [-0.4905, -0.4141, -0.0296, -0.2748, -0.3856,  0.4967, -0.1463,  0.2099,
         -0.2512, -0.3669],
        [ 0.0922, -0.4889,  0.0728,  0.4196, -0.2611,  0.4141,  0.0128,  0.3254,
         -0.2208,  0.2289],
        [-0.3754,  0.5252, -0.5391,  0.1381, -0.4574,  0.2102,  0.2640, -0.2687,
         -0.3638,  0.5699],
        [-0.5668,  0.45

In [67]:
# Let's take a few sample images
image1, label1 = train_dataset[0]
image2, label2 = train_dataset[1]
image3, label3 = train_dataset[2]
image.shape, label

(torch.Size([3, 32, 32]), 6)

In [97]:
# Making just one back-propagation step to test the model
inputs = torch.cat((image1[None, :], image2[None, :], image3[None, :]))
labels = torch.tensor([label1, label2, label3])
outputs = model(inputs)
loss = criterion(outputs, labels.view(-1))
loss.sum().backward()

In [98]:
# Checking the softmax predictions
outputs_sum = outputs.sum()
print(f'predictions:\n{outputs}\n\nsum of predictions validity:\n{outputs_sum:.1f}')

predictions:
tensor([[0.1040, 0.1022, 0.1153, 0.0936, 0.1084, 0.0914, 0.0857, 0.0864, 0.1077,
         0.1052],
        [0.1039, 0.1022, 0.1153, 0.0935, 0.1083, 0.0915, 0.0857, 0.0865, 0.1078,
         0.1052],
        [0.1038, 0.1021, 0.1154, 0.0934, 0.1083, 0.0916, 0.0858, 0.0865, 0.1079,
         0.1052]], grad_fn=<SoftmaxBackward0>)

sum of predictions validity:
3.0


In [101]:
# Checking the Cross Entropy Loss 
outputted_softmax_prediction_values = []
manual_losses = []
for i, sample in enumerate(outputs):
    outputted_softmax_prediction_values.append(round(float(sample[labels[i]]), 5))
    manual_losses.append(round(-np.log(softmax_value), 5))
    
print(f'Outputted softmax value:\n{outputted_softmax_prediction_values}\n\nLoss (-log(predict)):\n{manual_losses}\n\nCalculated Torch Loss:\n{loss}')

Outputted softmax value:
[0.0857, 0.10524, 0.10523]

Loss (-log(predict)):
[2.18179, 2.18179, 2.18179]

Calculated Torch Loss:
tensor([2.3169, 2.2974, 2.2974], grad_fn=<NllLossBackward0>)


In [102]:
# Check that the gradients were calculated
for param in model.parameters():
    print(param.grad)

tensor([[ 2.9453e-05,  2.8776e-05,  2.6059e-05,  ...,  2.5201e-06,
          9.6958e-06,  1.1444e-05],
        [ 1.3771e-05,  1.3440e-05,  1.2171e-05,  ...,  1.2703e-06,
          4.5900e-06,  5.3997e-06],
        [ 1.3196e-05,  1.1626e-05,  1.1353e-05,  ...,  1.2328e-05,
          9.8528e-06,  9.3848e-06],
        ...,
        [ 2.8472e-06,  3.0299e-06,  2.5371e-06,  ..., -2.1129e-06,
         -1.6859e-07,  2.7622e-07],
        [ 6.2058e-05,  5.8297e-05,  5.4431e-05,  ...,  2.6368e-05,
          3.0647e-05,  3.1944e-05],
        [ 1.2753e-05,  1.2280e-05,  1.1081e-05,  ...,  2.1173e-06,
          4.8875e-06,  5.5732e-06]])
tensor([ 1.4175e-05,  6.7701e-06,  2.5010e-05,  2.4019e-05, -2.0729e-05,
        -2.6079e-06,  6.5089e-05,  7.6672e-06])
tensor([[ 8.1050e-05,  1.0620e-04,  4.8572e-05,  7.0846e-05,  9.8121e-05,
          6.6954e-05,  8.0300e-05,  1.0778e-04],
        [ 5.3314e-05,  7.8954e-05,  6.4523e-05,  4.1612e-05,  8.4868e-05,
          3.5887e-05,  8.5015e-05,  7.7863e-05],
 

In [141]:
# Now let's train the model for a few epochs using the visualization library tqdm 
# and the train_loader, which allows us to train the model each epoch on a new random batch.

hidden_layer_dim = 100
model = SimpleLinearModel(3072, hidden_layer_dim, 10)
model.train()
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 10
for epoch in tqdm(range(num_epochs)):  
    current_loss = 0.0
    running_items = 0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.sum().backward()
        optimizer.step()

        current_loss += loss.mean().item()
        running_items += len(labels)
        if i % 100 == 0:
            print(f'[Epoch {epoch + 1}, batch {i + 1}] loss: {current_loss / (running_items / 100):.4f}')

print('Training is finished!')

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

[Epoch 1, batch 1] loss: 2.3480
[Epoch 1, batch 101] loss: 2.3091
[Epoch 1, batch 201] loss: 2.3020
[Epoch 1, batch 301] loss: 2.2855
[Epoch 1, batch 401] loss: 2.2666


 10%|████████▎                                                                          | 1/10 [00:47<07:08, 47.62s/it]

[Epoch 2, batch 1] loss: 2.1689
[Epoch 2, batch 101] loss: 2.1680
[Epoch 2, batch 201] loss: 2.1603
[Epoch 2, batch 301] loss: 2.1556
[Epoch 2, batch 401] loss: 2.1514


 20%|████████████████▌                                                                  | 2/10 [01:30<05:56, 44.60s/it]

[Epoch 3, batch 1] loss: 2.1694
[Epoch 3, batch 101] loss: 2.1301
[Epoch 3, batch 201] loss: 2.1261
[Epoch 3, batch 301] loss: 2.1236
[Epoch 3, batch 401] loss: 2.1226


 30%|████████████████████████▉                                                          | 3/10 [01:44<03:35, 30.85s/it]

[Epoch 4, batch 1] loss: 2.0786
[Epoch 4, batch 101] loss: 2.1118
[Epoch 4, batch 201] loss: 2.1099
[Epoch 4, batch 301] loss: 2.1088
[Epoch 4, batch 401] loss: 2.1074


 40%|█████████████████████████████████▏                                                 | 4/10 [01:58<02:24, 24.15s/it]

[Epoch 5, batch 1] loss: 2.1582
[Epoch 5, batch 101] loss: 2.0957
[Epoch 5, batch 201] loss: 2.0959
[Epoch 5, batch 301] loss: 2.0959
[Epoch 5, batch 401] loss: 2.0952


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:12<01:42, 20.48s/it]

[Epoch 6, batch 1] loss: 2.1444
[Epoch 6, batch 101] loss: 2.0872
[Epoch 6, batch 201] loss: 2.0879
[Epoch 6, batch 301] loss: 2.0890
[Epoch 6, batch 401] loss: 2.0888


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [02:28<01:15, 18.85s/it]

[Epoch 7, batch 1] loss: 2.0483
[Epoch 7, batch 101] loss: 2.0836
[Epoch 7, batch 201] loss: 2.0822
[Epoch 7, batch 301] loss: 2.0835
[Epoch 7, batch 401] loss: 2.0830


 70%|██████████████████████████████████████████████████████████                         | 7/10 [02:41<00:51, 17.16s/it]

[Epoch 8, batch 1] loss: 2.1275
[Epoch 8, batch 101] loss: 2.0782
[Epoch 8, batch 201] loss: 2.0777
[Epoch 8, batch 301] loss: 2.0773
[Epoch 8, batch 401] loss: 2.0785


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [02:56<00:32, 16.48s/it]

[Epoch 9, batch 1] loss: 2.0879
[Epoch 9, batch 101] loss: 2.0732
[Epoch 9, batch 201] loss: 2.0753
[Epoch 9, batch 301] loss: 2.0756
[Epoch 9, batch 401] loss: 2.0763


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [03:11<00:15, 15.88s/it]

[Epoch 10, batch 1] loss: 2.0590
[Epoch 10, batch 101] loss: 2.0731
[Epoch 10, batch 201] loss: 2.0704
[Epoch 10, batch 301] loss: 2.0719
[Epoch 10, batch 401] loss: 2.0714


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:25<00:00, 20.51s/it]

Training is finished!





In [142]:
optimizer = optim.Adam(model.parameters(), lr=0.00001)

num_epochs = 10
for epoch in tqdm(range(num_epochs)):  
    current_loss = 0.0
    running_items = 0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.sum().backward()
        optimizer.step()

        current_loss += loss.mean().item()
        running_items += len(labels)
        if i % 100 == 0:
            print(f'[Epoch {epoch + 1}, batch {i + 1}] loss: {current_loss / (running_items / 100):.4f}')

print('Training is finished!')

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

[Epoch 1, batch 1] loss: 2.1306
[Epoch 1, batch 101] loss: 2.0673
[Epoch 1, batch 201] loss: 2.0649
[Epoch 1, batch 301] loss: 2.0641
[Epoch 1, batch 401] loss: 2.0636


 10%|████████▎                                                                          | 1/10 [00:14<02:09, 14.44s/it]

[Epoch 2, batch 1] loss: 2.0675
[Epoch 2, batch 101] loss: 2.0633
[Epoch 2, batch 201] loss: 2.0642
[Epoch 2, batch 301] loss: 2.0639
[Epoch 2, batch 401] loss: 2.0631


 20%|████████████████▌                                                                  | 2/10 [00:28<01:52, 14.05s/it]

[Epoch 3, batch 1] loss: 2.0581
[Epoch 3, batch 101] loss: 2.0700
[Epoch 3, batch 201] loss: 2.0652
[Epoch 3, batch 301] loss: 2.0643
[Epoch 3, batch 401] loss: 2.0620


 30%|████████████████████████▉                                                          | 3/10 [00:42<01:37, 13.97s/it]

[Epoch 4, batch 1] loss: 2.0728
[Epoch 4, batch 101] loss: 2.0579
[Epoch 4, batch 201] loss: 2.0585
[Epoch 4, batch 301] loss: 2.0600
[Epoch 4, batch 401] loss: 2.0605


 40%|█████████████████████████████████▏                                                 | 4/10 [00:55<01:23, 13.87s/it]

[Epoch 5, batch 1] loss: 2.0754
[Epoch 5, batch 101] loss: 2.0670
[Epoch 5, batch 201] loss: 2.0613
[Epoch 5, batch 301] loss: 2.0611
[Epoch 5, batch 401] loss: 2.0606


 50%|█████████████████████████████████████████▌                                         | 5/10 [01:09<01:09, 13.87s/it]

[Epoch 6, batch 1] loss: 2.0664
[Epoch 6, batch 101] loss: 2.0612
[Epoch 6, batch 201] loss: 2.0599
[Epoch 6, batch 301] loss: 2.0602
[Epoch 6, batch 401] loss: 2.0592


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [01:23<00:55, 13.94s/it]

[Epoch 7, batch 1] loss: 2.0436
[Epoch 7, batch 101] loss: 2.0610
[Epoch 7, batch 201] loss: 2.0590
[Epoch 7, batch 301] loss: 2.0581
[Epoch 7, batch 401] loss: 2.0581


 70%|██████████████████████████████████████████████████████████                         | 7/10 [01:38<00:42, 14.04s/it]

[Epoch 8, batch 1] loss: 2.0642
[Epoch 8, batch 101] loss: 2.0612
[Epoch 8, batch 201] loss: 2.0601
[Epoch 8, batch 301] loss: 2.0590
[Epoch 8, batch 401] loss: 2.0575


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [01:54<00:29, 14.91s/it]

[Epoch 9, batch 1] loss: 2.1084
[Epoch 9, batch 101] loss: 2.0537
[Epoch 9, batch 201] loss: 2.0547
[Epoch 9, batch 301] loss: 2.0567
[Epoch 9, batch 401] loss: 2.0574


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [02:10<00:15, 15.24s/it]

[Epoch 10, batch 1] loss: 2.0360
[Epoch 10, batch 101] loss: 2.0549
[Epoch 10, batch 201] loss: 2.0535
[Epoch 10, batch 301] loss: 2.0550
[Epoch 10, batch 401] loss: 2.0557


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [02:24<00:00, 14.47s/it]

Training is finished!





The score isn't good one here, because it wasn't allowed to use CNNs in this homework. I could also change the score to MSE, it should probably work better with sigmoid activations. And changing the activations to ReLU could also make the model better. 

However, the key point here was that the model trains at all :)