## PyTorch Tutorial
```
- Machine Learning, Innopolis University 
- Professor: Adil Khan 
- Teaching Assistant: Gcinizwe Dlamini
```
<hr>



In [None]:
import numpy as np
from matplotlib import pyplot as plt 
# Seed NumPy's global RNG so the randomly generated data and the random
# initial parameters drawn later via np.random are reproducible across runs.
np.random.seed(73)

## <center> Linear Regression with NumPy </center>

### Data Generation

In [None]:
# Data Generation
def generate_data(size=100):
    """Build a noisy linear toy dataset and split it 80:20.

    Inputs are drawn uniformly from [0, 1) and targets follow
    ``y = 3 + 2.5 * x`` plus Gaussian noise scaled by 0.1. The samples are
    shuffled before the split, so both parts cover the whole input range.

    Args:
        size: Total number of samples to generate.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val)`` of column arrays; the
        training part holds the first ``int(size * 0.8)`` shuffled samples
        and the validation part holds the rest.
    """
    features = np.random.rand(size, 1)
    targets = 3 + 2.5 * features + .1 * np.random.randn(size, 1)

    # Random ordering of the row indices.
    order = np.arange(size)
    np.random.shuffle(order)

    # The first 80% of the shuffled rows train, the remainder validates.
    cut = int(size * 0.8)
    train_rows, val_rows = order[:cut], order[cut:]

    return (
        features[train_rows],
        targets[train_rows],
        features[val_rows],
        targets[val_rows],
    )

In [None]:
# Draw one dataset with the default size and keep all four splits around
# for the cells below.
x_train, y_train, x_val, y_val = generate_data()

# Scatter plot of the training split only.
plt.scatter(x_train, y_train)
plt.title("Title")
plt.xlabel("X")
plt.ylabel("Y")
plt.show()

## Gradient Descent

Gradient descent consists of 3 basic steps:

1. **Compute the Loss**

$$ y = a + bx + \epsilon, \qquad \hat{y} = a + bx $$

$$ \text{MSE} = \frac{1}{N} \sum_{i} (y_i - \hat{y}_i)^2 $$

$$ \text{MSE} = \frac{1}{N} \sum_{i} (y_i - a - bx_i)^2 $$

2. **Compute the Gradients** : The gradient collects the partial derivatives of the loss with respect to each parameter. Applying the chain rule gives the final expressions:

$$\frac{\partial \text{MSE}}{\partial a} = \frac{\partial \text{MSE}}{\partial \hat{y}} * \frac{\partial \hat{y}}{\partial a} = -2 * \frac{1}{N} \sum_{i} (y_i - \hat{y}_i)$$

$$\frac{\partial \text{MSE}}{\partial b} = \frac{\partial \text{MSE}}{\partial \hat{y}} * \frac{\partial \hat{y}}{\partial b} = -2 * \frac{1}{N} \sum_{i} x_i(y_i - \hat{y}_i)$$

3. **Update the Parameters**

$$a = a - \alpha \frac{\partial \text{MSE}}{\partial a}$$

$$b = b - \alpha \frac{\partial \text{MSE}}{\partial b}$$

4. Repeat steps 1 to 3 until convergence is reached.


### TASK : Implement Step 1 - 3

In [None]:
# Exercise scaffold: batch gradient descent for the model yhat = a + b * x.
# The `None` placeholders below are meant to be replaced by the student.
# NOTE: as written, the parameter update in step 3 raises a TypeError,
# because the gradients are still None.

# Initialize parameters "a" (intercept) and "b" (slope) randomly

a = np.random.randn(1)
b = np.random.randn(1)

print(f"Initial values of [a, b] : [{a[0]}, {b[0]}]")

learning_rate = 1e-1 # step size (alpha in the update rule)
n_epochs = 1000      # number of full passes of the update loop

for epoch in range(n_epochs):

    # TODO: Step 1: Compute the prediction y hat from the current a and b
    yhat = None

    # TODO: Compute the error (target minus prediction) and the MSE loss
    error = None
    loss = None

    # TODO: Step 2: Compute the gradients of the MSE with respect to "a" and
    # "b" (the two partial derivatives)
    a_grad = None
    b_grad = None

    # Step 3: Move each parameter against its gradient, scaled by the
    # learning rate (works once a_grad and b_grad are real values)
    a = a - learning_rate * a_grad
    b = b - learning_rate * b_grad

print(f"Final values of [a, b] : [{a[0]}, {b[0]}]")

## PyTorch implementation

1. Simple model using nn.Sequential(..)
1. Read input data from file 
1. Add Tensorboard
1. Transfer Learning

In [None]:
import torch 
from torch import nn
import torch.optim as optim # for optimizer

## Transfer Learning

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch. utils.data import DataLoader

### Read Data from local Folder

In [None]:
!git clone https://github.com/YoongiKim/CIFAR-10-images.git

## Create a CNN Model, Train and Test

In [None]:
def train(model, device, train_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to optimise; it is switched to training mode first.
        device: Device each batch is moved to before the forward pass.
        train_loader: Sized iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer whose gradients are cleared and which is
            stepped once per batch.
        criterion: Callable ``criterion(output, target)`` returning a
            scalar loss tensor.

    Returns:
        The sum of the per-batch loss values divided by
        ``len(train_loader)``, i.e. the mean loss per batch.
    """
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()
    return running_loss / len(train_loader)
            
def test(model, device, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` and report loss and accuracy.

    The reported loss is a true per-sample average. A mean-reduced
    criterion (PyTorch's default) already averages over each batch, so its
    value is scaled back to a batch total before accumulating; a
    ``reduction='sum'`` criterion is accumulated as-is. Either way the
    total is divided by the number of samples, which keeps the result
    comparable with the per-batch mean returned by ``train``.

    Args:
        model: Module to evaluate; it is switched to eval mode first.
        device: Device each batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches that exposes
            the underlying dataset as ``test_loader.dataset``.
        criterion: Callable ``criterion(output, target)`` returning a
            scalar loss tensor. If it has a ``reduction`` attribute equal
            to ``'sum'`` it is treated as sum-reduced, otherwise as
            mean-reduced.

    Returns:
        Tuple ``(average_loss, accuracy)`` where accuracy is the fraction
        of samples whose arg-max prediction equals the target.
    """
    model.eval()
    # Callables without a `reduction` attribute are assumed mean-reduced.
    sum_reduced = getattr(criterion, "reduction", "mean") == "sum"
    n_samples = len(test_loader.dataset)
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            batch_loss = criterion(output, target).item()
            # Accumulate batch *totals* so that dividing by the sample count
            # below yields a per-sample average even with a ragged last batch.
            test_loss += batch_loss if sum_reduced else batch_loss * target.size(0)
            pred = output.argmax(dim=1, keepdim=True)  # index of the highest score per sample
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= n_samples

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, n_samples, 100. * correct / n_samples))
    return test_loss, correct / n_samples

In [None]:
class Net(nn.Module):
    """Small LeNet-style CNN with a 10-way output.

    Two convolution + ReLU + max-pool stages are followed by three fully
    connected layers. The flatten width ``16 * 5 * 5`` matches 3-channel
    32x32 inputs (32 -> 28 -> 14 -> 10 -> 5 per spatial side).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores (no softmax) for a batch of images."""
        # Feature extractor: the same pooling layer is shared by both stages.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Flatten every sample to a 400-wide vector for the classifier.
        x = x.view(-1, 16 * 5 * 5)
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

# Prefer the GPU whenever CUDA is usable, otherwise stay on the CPU, then
# build the CNN and place its parameters on that device.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
model_cnn = Net().to(device)

## Add TensorBoard

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

from torch.utils.tensorboard import SummaryWriter

# Writer object for TensorBoard logging. No log directory is passed, so
# SummaryWriter falls back to its own default location.
writer = SummaryWriter()

## Use pretrained Models

In [None]:
import torchvision.models as models
# Instantiate four torchvision architectures with their pretrained weights;
# progress=True requests a progress display while the weights download.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` -- confirm the installed version before migrating.
resnet18 = models.resnet18(pretrained=True, progress=True)
alexnet = models.alexnet(pretrained=True, progress=True)
squeezenet = models.squeezenet1_0(pretrained=True, progress=True)
vgg16 = models.vgg16(pretrained=True, progress=True)

In [None]:
# Preprocessing for the pretrained torchvision backbones: upscale the CIFAR-10
# images to 256x256, convert them to tensors, and normalise each channel with
# the ImageNet mean / std. The pretrained weights were trained on inputs
# normalised this way, so generic (0.5, 0.5, 0.5) statistics would feed the
# network a shifted input distribution.
transform = transforms.Compose(
    [transforms.Resize((256, 256)),
     transforms.ToTensor(),
     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

# Training split: downloaded to ./data on first use, shuffled every epoch.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)

# Test split: same preprocessing, kept in a fixed order.
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2)

In [None]:
class Net(nn.Module):
    """Transfer-learning classifier: frozen pretrained ResNet-18 plus a new head.

    The pretrained backbone is frozen so that only the replacement 10-way
    ``fc`` layer receives gradient updates during training.
    """

    def __init__(self):
        super().__init__()
        self.net = models.resnet18(pretrained=True, progress=True)

        # Freeze the pretrained weights. Assigning `trainable = False` (a
        # Keras idiom) merely creates an unused attribute in PyTorch; freezing
        # is done by switching off gradients on every parameter.
        for param in self.net.parameters():
            param.requires_grad = False

        # Swap the head in *after* freezing so the new layer stays trainable,
        # and read its input width from the layer it replaces instead of
        # hard-coding 512.
        self.net.fc = nn.Linear(self.net.fc.in_features, 10)

    def forward(self, x):
        """Return the 10 class scores produced by the wrapped network."""
        return self.net(x)


# Build the transfer-learning model and place it on the selected device.
net = Net().to(device)

## <center> PyTorch Basics </center>

### Tensors 

* How to create a Tensor
* Operations on tensors
* Data types for Tensors

### Create a Tensor 

1. Create tensors from Numpy then see what operations can be applied.
**Note:** By default a tensor resides on the CPU, but it can be sent to the GPU for faster computations.

In [None]:
import torch

# Device given as a plain string: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Wrap the NumPy training arrays as tensors, cast them to float and move
# them to the chosen device.
x_train_tensor, y_train_tensor = (
    torch.from_numpy(array).float().to(device) for array in (x_train, y_train)
)

# type() only reports the Python class of each object, whereas Tensor.type()
# returns the full tensor type string, which also shows WHERE the tensor
# lives (CPU or CUDA).
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

### Dynamic Computation Graph

* Easily visualize a graph using `PyTorchViz` package. 

In [None]:
!pip install torchviz 
from torchviz import make_dot

# Four one-element leaf tensors on `device`, all tracking gradients.
a, b, c, d = (
    torch.randn(1, requires_grad=True, device=device) for _ in range(4)
)

# Combining them in an expression builds the dynamic computation graph.
f = a ** b + b * 0.5 + c ** 3 - d * a

In [None]:
# Visualize the computation graph that was recorded while building `f`.
make_dot(f)