## PyTorch exercises

### Tensors

1. Make a tensor of size (2, 17)
2. Make a torch.FloatTensor of size (3, 1)
3. Make a torch.LongTensor of size (5, 2, 1)
  - fill the entire tensor with 7s
4. Make a torch.ByteTensor of size (5,)
  - fill the middle 3 indices with ones such that it records [0, 1, 1, 1, 0]
5. Perform a matrix multiplication of two tensors of size (2, 4) and (4, 2). Then do it in-place.
6. Do element-wise multiplication of two randomly filled $(n_1,n_2,n_3)$ tensors. Then store the result in an Numpy array.

### Forward-prop/backward-prop
1. Create a Tensor that `requires_grad` of size (5, 5).
2. Sum the values in the Tensor.
3. Multiply the tensor by 2 and assign the result to a new python variable (i.e. `x = result`)
4. Sum the variable's elements and assign to a new python variable
5. Print the gradients of all the variables
6. Now perform a backward pass on the last variable (NOTE: for each new python variable that you define, call `.retain_grad()`)
7. Print all gradients again

### Deep-forward NNs
1. Use dl_lab2. In Exercise 12 there, you had to build an $L$-layer neural network with the following structure: *[LINEAR -> RELU]$\times$(L-1) -> LINEAR -> SIGMOID*. Reimplement the manual code in PyTorch.
2. Compare test accuracy using different optimizers: SGD, Adam, Momentum.

In [26]:
import torch
tensor1 = torch.zeros(2, 17)
tensor2 = torch.rand(3, 1, dtype=torch.float)
tensor3 = torch.LongTensor(5,2,1)
print(tensor1,'\n', tensor2.dtype,'\n', tensor3.shape)
tensor3.fill_(7)
print(tensor3)
tensor4 = torch.ByteTensor(5,)
tensor4[0]=0
tensor4[1:4]=1
print(tensor4)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]) 
 torch.float32 
 torch.Size([5, 2, 1])
tensor([[[7],
         [7]],

        [[7],
         [7]],

        [[7],
         [7]],

        [[7],
         [7]],

        [[7],
         [7]]])
tensor([  0,   1,   1,   1, 205], dtype=torch.uint8)


In [29]:
tensorA = torch.rand(2, 4)
tensorB = torch.rand(4, 2)
res = torch.matmul(tensorA, tensorB)
print(res)
res = torch.empty(2, 2)
torch.matmul(tensorA, tensorB, out = res)
print(res)

tensor([[1.1067, 0.9209],
        [1.1642, 1.2372]])
tensor([[1.1067, 0.9209],
        [1.1642, 1.2372]])


In [37]:
tensorA = torch.rand(3,1,4)
tensorB = torch.rand(1,3,4)
res = torch.mul(tensorA, tensorB)
res = res.numpy()
res

array([[[0.00713706, 0.6502601 , 0.03834382, 0.4092991 ],
        [0.01113113, 0.47165358, 0.14229806, 0.28021   ],
        [0.06600562, 0.25477883, 0.04627686, 0.29779077]],

       [[0.02357975, 0.529625  , 0.06215126, 0.2055262 ],
        [0.03677557, 0.38415322, 0.23065004, 0.14070515],
        [0.2180725 , 0.2075127 , 0.07500987, 0.1495332 ]],

       [[0.01297671, 0.05138903, 0.07086549, 0.35554573],
        [0.02023881, 0.03727404, 0.26298952, 0.24340992],
        [0.1200125 , 0.02013477, 0.08552703, 0.25868183]]], dtype=float32)

In [81]:
#pt2
tensor = torch.randn(5,5, requires_grad = True) #1
print('#1', tensor)
sum_t = torch.sum(tensor) #2
sum_t.retain_grad()
print('#2', sum_t)
mul_t = torch.mul(tensor, 2) #3
mul_t.retain_grad()
print('#3', mul_t)
sum_t2 = torch.sum(mul_t) #4
print('#4', sum_t2)
sum_t2.retain_grad()

print('#1 ', tensor.grad, '\n#2 ', sum_t.grad, '\n#3 ', mul_t.grad, '\n#4 ', sum_t2.grad)
sum_t2.backward()
print('#1 ', tensor.grad, '\n#2 ', sum_t.grad, '\n#3 ', mul_t.grad, '\n#4 ', sum_t2.grad)

#1 tensor([[-0.8010, -0.1690,  0.3360,  0.7277, -0.1743],
        [ 0.0911,  0.2875,  1.4629, -0.4733, -0.0430],
        [ 1.6839,  0.9654,  0.8610, -0.3237,  0.6875],
        [ 0.0566, -2.1928, -0.7458, -3.3702, -1.2937],
        [-0.7990, -0.3539, -0.7029,  0.1029,  0.6222]], requires_grad=True)
#2 tensor(-3.5578, grad_fn=<SumBackward0>)
#3 tensor([[-1.6020, -0.3380,  0.6721,  1.4555, -0.3485],
        [ 0.1822,  0.5749,  2.9258, -0.9467, -0.0859],
        [ 3.3678,  1.9309,  1.7220, -0.6475,  1.3750],
        [ 0.1132, -4.3855, -1.4917, -6.7404, -2.5873],
        [-1.5979, -0.7078, -1.4059,  0.2058,  1.2443]], grad_fn=<MulBackward0>)
#4 tensor(-7.1157, grad_fn=<SumBackward0>)
#1  None 
#2  None 
#3  None 
#4  None
#1  tensor([[2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.]]) 
#2  None 
#3  tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 

In [83]:
#pt3
def initialize_parameters_deep(layer_dims):
    torch.manual_seed(42)
    parameters = {}
    L = len(layer_dims) # number of layers in the network

    for l in range(1, L):
        #(≈ 2 lines of code)
        # parameters['W' + str(l)] = ...
        # parameters['b' + str(l)] = ...
        # YOUR CODE STARTS HERE
        parameters['W' + str(l)] = torch.randn(layer_dims[l], layer_dims[l-1]) * 0.01
        parameters['b' + str(l)] = torch.zeros((layer_dims[l], 1))
        # YOUR CODE ENDS HERE
        
        assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))

        
    return parameters

In [89]:
def L_model_forward(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2                  # number of layers in the neural network
    
    # Implement [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    # The for loop starts at 1 because layer 0 is the input
    for l in range(1, L):
        A_prev = A 
        #(≈ 2 lines of code)
        # A, cache = ...
        # caches ...
        # YOUR CODE STARTS HERE
        A, cache = linear_activation_forward(A_prev,  parameters['W' + str(l)], parameters['b' + str(l)], 'relu')
        caches.append(cache)
        # YOUR CODE ENDS HERE
    
    # Implement LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    #(≈ 2 lines of code)
    # AL, cache = ...
    # caches ...
    # YOUR CODE STARTS HERE
    AL, cache = linear_activation_forward(A,  parameters['W' + str(L)], parameters['b' + str(L)], 'sigmoid')
    caches.append(cache)
    
    # YOUR CODE ENDS HERE

    return AL, caches

## Implementing a deep convolutional neural network using PyTorch

### The multilayer CNN architecture

In [None]:
from IPython.display import Image
%matplotlib inline
Image(filename='figures/14_12.png', width=800)

### Loading and preprocessing the data

In [None]:
import torch
import numpy as np
import torchvision 
from torchvision import transforms 
image_path = './'
transform = transforms.Compose([transforms.ToTensor()])

mnist_dataset = torchvision.datasets.MNIST(root=image_path, 
                                           train=True, 
                                           transform=transform, 
                                           download=True)

from torch.utils.data import Subset
mnist_valid_dataset = Subset(mnist_dataset, torch.arange(10000)) 
mnist_train_dataset = Subset(mnist_dataset, torch.arange(10000, len(mnist_dataset)))
mnist_test_dataset = torchvision.datasets.MNIST(root=image_path, 
                                           train=False, 
                                           transform=transform, 
                                           download=False)

In [None]:
from torch.utils.data import DataLoader


batch_size = 64
torch.manual_seed(1)
train_dl = DataLoader(mnist_train_dataset, batch_size, shuffle=True)
valid_dl = DataLoader(mnist_valid_dataset, batch_size, shuffle=False)

### Implementing a CNN using the torch.nn module

#### Configuring CNN layers in PyTorch

 * **Conv2d:** `torch.nn.Conv2d`
   * `out_channels`
   * `kernel_size`
   * `stride`
   * `padding`
   
   
 * **MaxPool2d:** `torch.nn.MaxPool2d`
   * `kernel_size`
   * `stride`
   * `padding`
   
   
 * **Dropout** `torch.nn.Dropout`
   * `p`

### Constructing a CNN in PyTorch

In [None]:
import torch.nn as nn
model = nn.Sequential()
model.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5, padding=2))
model.add_module('relu1', nn.ReLU())        
model.add_module('pool1', nn.MaxPool2d(kernel_size=2))   
model.add_module('conv2', nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, padding=2))
model.add_module('relu2', nn.ReLU())        
model.add_module('pool2', nn.MaxPool2d(kernel_size=2))      

x = torch.ones((4, 1, 28, 28))
model(x).shape

In [None]:
model.add_module('flatten', nn.Flatten()) 

x = torch.ones((4, 1, 28, 28))
model(x).shape

In [None]:
model.add_module('fc1', nn.Linear(3136, 1024)) 
model.add_module('relu3', nn.ReLU()) 
model.add_module('dropout', nn.Dropout(p=0.5)) 

model.add_module('fc2', nn.Linear(1024, 10)) 

In [None]:
device = torch.device("mps")
#device = torch.device("cpu")

model = model.to(device) 

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(model, num_epochs, train_dl, valid_dl):
    loss_hist_train = [0] * num_epochs
    accuracy_hist_train = [0] * num_epochs
    loss_hist_valid = [0] * num_epochs
    accuracy_hist_valid = [0] * num_epochs
    for epoch in range(num_epochs):
        model.train()
        for x_batch, y_batch in train_dl:
            x_batch = x_batch.to(device) 
            y_batch = y_batch.to(device) 
            pred = model(x_batch)
            loss = loss_fn(pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_hist_train[epoch] += loss.item()*y_batch.size(0)
            is_correct = (torch.argmax(pred, dim=1) == y_batch).float()
            accuracy_hist_train[epoch] += is_correct.sum().cpu()

        loss_hist_train[epoch] /= len(train_dl.dataset)
        accuracy_hist_train[epoch] /= len(train_dl.dataset)
        
        model.eval()
        with torch.no_grad():
            for x_batch, y_batch in valid_dl:
                x_batch = x_batch.to(device) 
                y_batch = y_batch.to(device) 
                pred = model(x_batch)
                loss = loss_fn(pred, y_batch)
                loss_hist_valid[epoch] += loss.item()*y_batch.size(0) 
                is_correct = (torch.argmax(pred, dim=1) == y_batch).float() 
                accuracy_hist_valid[epoch] += is_correct.sum().cpu()

        loss_hist_valid[epoch] /= len(valid_dl.dataset)
        accuracy_hist_valid[epoch] /= len(valid_dl.dataset)
        
        print(f'Epoch {epoch+1} accuracy: {accuracy_hist_train[epoch]:.4f} val_accuracy: {accuracy_hist_valid[epoch]:.4f}')
    return loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid

torch.manual_seed(1)
num_epochs = 20
hist = train(model, num_epochs, train_dl, valid_dl)

In [None]:
import matplotlib.pyplot as plt


x_arr = np.arange(len(hist[0])) + 1

fig = plt.figure(figsize=(12, 4))
ax = fig.add_subplot(1, 2, 1)
ax.plot(x_arr, hist[0], '-o', label='Train loss')
ax.plot(x_arr, hist[1], '--<', label='Validation loss')
ax.set_xlabel('Epoch', size=15)
ax.set_ylabel('Loss', size=15)
ax.legend(fontsize=15)
ax = fig.add_subplot(1, 2, 2)
ax.plot(x_arr, hist[2], '-o', label='Train acc.')
ax.plot(x_arr, hist[3], '--<', label='Validation acc.')
ax.legend(fontsize=15)
ax.set_xlabel('Epoch', size=15)
ax.set_ylabel('Accuracy', size=15)

#plt.savefig('figures/14_13.png')
plt.show()

In [None]:
torch.mps.synchronize()
model_cpu = model.cpu()
pred = model(mnist_test_dataset.data.unsqueeze(1) / 255.)
is_correct = (torch.argmax(pred, dim=1) == mnist_test_dataset.targets).float()
print(f'Test accuracy: {is_correct.mean():.4f}') 

In [None]:
fig = plt.figure(figsize=(12, 4))
for i in range(12):
    ax = fig.add_subplot(2, 6, i+1)
    ax.set_xticks([]); ax.set_yticks([])
    img = mnist_test_dataset[i][0][0, :, :]
    pred = model(img.unsqueeze(0).unsqueeze(1))
    y_pred = torch.argmax(pred)
    ax.imshow(img, cmap='gray_r')
    ax.text(0.9, 0.1, y_pred.item(), 
            size=15, color='blue',
            horizontalalignment='center',
            verticalalignment='center', 
            transform=ax.transAxes)
    
    
#plt.savefig('figures/14_14.png')
plt.show()

In [None]:
import os

if not os.path.exists('models'):
    os.mkdir('models')

path = 'models/mnist-cnn.ph'
torch.save(model, path)
 