### PyTorch Building a Model
#### LINK: https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

In [5]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [11]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )
        
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [12]:
model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [16]:
X = torch.rand(1, 28, 28, device=device)
logits = model(X)
pred_probab = nn.Softmax(dim=1)(logits)
y_pred =  pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([5], device='cuda:0')


In [18]:
input_image = torch.rand(3, 28, 28)
print(input_image.size())

torch.Size([3, 28, 28])


In [20]:
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.size())

torch.Size([3, 784])


In [21]:
layer1 = nn.Linear(in_features=28*28, out_features=20)
hidden1 = layer1(flat_image)
print(hidden1.size())

torch.Size([3, 20])


In [22]:
print(f"Before ReLU: {hidden1}\n\n")
hidden1 = nn.ReLU()(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.1035, -0.0135, -0.5511, -0.1003, -0.9261, -0.4185,  0.1529, -0.7432,
          0.0191, -0.2835,  0.6657, -0.3055,  0.8433, -0.0525, -0.0291, -0.2603,
         -0.3643,  0.1342, -0.2791, -0.1942],
        [-0.0685,  0.0250, -0.2871,  0.1251, -0.6783, -0.3011, -0.0920, -0.6878,
         -0.3220, -0.1649,  0.5057, -0.2445,  0.6750,  0.1841, -0.0795, -0.1923,
         -0.3218, -0.0403, -0.4453, -0.1763],
        [ 0.1118,  0.1348, -0.4441,  0.2330, -0.7526, -0.1250,  0.1076, -0.5817,
         -0.2912,  0.0546,  0.8322,  0.0706,  0.8581, -0.0554, -0.0958, -0.2699,
         -0.5522,  0.2274,  0.0465, -0.1030]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.1035, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1529, 0.0000, 0.0191,
         0.0000, 0.6657, 0.0000, 0.8433, 0.0000, 0.0000, 0.0000, 0.0000, 0.1342,
         0.0000, 0.0000],
        [0.0000, 0.0250, 0.0000, 0.1251, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.5057, 0.0000, 0.6750, 0.1841, 0.00

In [25]:
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10)
)
input_image = torch.rand(3,28,28)
logits = seq_modules(input_image)
print(logits.size())
print(logits)

torch.Size([3, 10])
tensor([[ 0.0470,  0.3495, -0.1065,  0.3001, -0.2620,  0.2499, -0.0930, -0.2885,
         -0.2120, -0.2822],
        [ 0.0093,  0.3299, -0.1286,  0.3054, -0.2957,  0.2588, -0.1097, -0.2925,
         -0.1159, -0.2835],
        [ 0.0484,  0.3315, -0.0646,  0.2494, -0.2067,  0.2335, -0.0009, -0.2923,
         -0.2087, -0.2279]], grad_fn=<AddmmBackward0>)


In [28]:
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)
print(pred_probab)
print(pred_probab.argmax(1))

tensor([[0.1049, 0.1419, 0.0900, 0.1351, 0.0770, 0.1285, 0.0912, 0.0750, 0.0810,
         0.0755],
        [0.1013, 0.1396, 0.0883, 0.1362, 0.0747, 0.1300, 0.0900, 0.0749, 0.0894,
         0.0756],
        [0.1040, 0.1380, 0.0929, 0.1271, 0.0806, 0.1251, 0.0990, 0.0740, 0.0804,
         0.0789]], grad_fn=<SoftmaxBackward0>)
tensor([1, 1, 1])


In [29]:
print(f"Model structure: {model}\n\n")

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size} | Values: {param[:2]} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: <built-in method size of Parameter object at 0x7f5c2a794ae0> | Values: tensor([[ 0.0052,  0.0217, -0.0304,  ..., -0.0103, -0.0089, -0.0202],
        [-0.0245, -0.0278, -0.0192,  ..., -0.0316, -0.0190,  0.0257]],
       device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: <built-in method size of Parameter object at 0x7f5c2a58d710> | Values: tensor([-0.0244, -0.0026], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: <built-in method size of Parameter object at 0x7f5c2a6bc950> | Values: tensor([[ 0.0246, -0.0278,  0.0348,  ..., -0.0006, -0.0384, 

### PyTorch Autograd
#### LINK: https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html

In [33]:
# Consider the simplest one-layer neural network, with input x, parameters w and b, and some loss function. 
# It can be defined in PyTorch in the following manner:

x = torch.ones(5) # input tensor
y = torch.ones(3) # expected output
w = torch.randn(5, 3, requires_grad=True) # weights
b = torch.rand(3, requires_grad=True) # bias
z = torch.matmul(x, w) + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

In [34]:
print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")

Gradient function for z = <AddBackward0 object at 0x7f5c2a6da920>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x7f5c2a6dab00>


In [40]:
# Computing gradients
loss.backward()
print(w.grad)
print(b.grad)

tensor([[-0.3090, -0.2646, -0.2011],
        [-0.3090, -0.2646, -0.2011],
        [-0.3090, -0.2646, -0.2011],
        [-0.3090, -0.2646, -0.2011],
        [-0.3090, -0.2646, -0.2011]])
tensor([-0.3090, -0.2646, -0.2011])


In [41]:
# Disabling gradient tracking
z = torch.matmul(x, w) + b
print(z.requires_grad)

with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)

True
False


In [42]:
z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)

False


In [44]:
# Optional: Tensor Gradients and Jacobian Products
inp = torch.eye(5, requires_grad=True)
out = (inp+1).pow(2)
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"First call\n{inp.grad}")
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.],
        [4., 4., 4., 4., 8.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])
