In [1]:
import torch

### Optimizers

In [2]:
import torch.optim as optim

In [3]:
# SGD

sgd = optim.SGD()

TypeError: SGD.__init__() missing 1 required positional argument: 'params'

The parameters which needs to be optimized and the learning rate (optional) should be passed into the `SGD` class

## Artificial Neural Networks

```
nn.Linear
nn.Module
nn.Sequential
```

### `nn.Linear` -> Fully Connected Layer

In [4]:
import torch
import torch.nn as nn

layer1 = nn.Linear(in_features=3, out_features=2)

input1 = torch.tensor([5., 8, 78])

output1 = layer1(input1)

output1

tensor([ 40.2087, -24.8985], grad_fn=<ViewBackward0>)

- PyTorch uses **Kaiming Uniform** (also known as **He initialization**) for weight initialization in the `nn.Linear` layers.
- The above code doesn't invlove any activation functions, so activation functions are not used.

In [5]:
layer1.weight

Parameter containing:
tensor([[ 0.2273, -0.2373,  0.5208],
        [-0.0505, -0.3087, -0.2887]], requires_grad=True)

In [6]:
layer1.weight.data

tensor([[ 0.2273, -0.2373,  0.5208],
        [-0.0505, -0.3087, -0.2887]])

In [7]:
type(layer1.weight)

torch.nn.parameter.Parameter

In [8]:
layer1.bias

Parameter containing:
tensor([0.3498, 0.3395], requires_grad=True)

Everytime, we run this code, the `output`, `weights` and `biases` all changes

### We can also pass in our custom weights

In [9]:
new_weights = torch.tensor([[1., 5., 7], [9, 8, 12]])
new_bias = torch.tensor([34., 67])

layer1.weight.data = new_weights
layer1.bias.data = new_bias

layer1(input1)

tensor([ 625., 1112.], grad_fn=<ViewBackward0>)

Whenever we assign weights from our side, eventhough we didn't explicitly set `requires_grad=True`, it would automatically be set when it is passes into the layer

In [10]:
new_weights.requires_grad

False

In [11]:
layer1.weight.requires_grad

True

#### We can also send in inputs as batches and get the output as batches 

In [12]:
batch_input = torch.tensor([[2., 5, 6], [4, 3, 12], [3, 5, 90], [45, 32, 12]])

batch_output = layer1(batch_input)

batch_output

tensor([[ 103.,  197.],
        [ 137.,  271.],
        [ 692., 1214.],
        [ 323.,  872.]], grad_fn=<AddmmBackward0>)

### `nn.Module`

If we wanted to create a model, we need to create a class for that model, that should inherit from the class `nn.Module`

In [13]:
class LinearRegressionModel(nn.Module):

    def __init__(self):
        super(LinearRegressionModel, self).__init__()
        self.layer1 = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        return self.layer1(x)  

In the `__init__` function, we need to define the attributes which we will be using in our network

### Checking if `cuda` is available

In [14]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
model = LinearRegressionModel().to(device)

X = torch.linspace(0, 10, 15).reshape(-1, 1) # reshaping into a single column to feed them as a batch

pred = model(X)

pred

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

If we are using the device to be `cuda` for the model, then the corresponding inputs also should be passed to `cuda`

In [16]:
model = LinearRegressionModel().to(device)

X = torch.linspace(0, 10, 15).reshape(-1, 1) # reshaping into a single column to feed them as a batch

pred = model(X.to(device))

pred

tensor([[ 0.7296],
        [ 0.5647],
        [ 0.3999],
        [ 0.2351],
        [ 0.0702],
        [-0.0946],
        [-0.2595],
        [-0.4243],
        [-0.5892],
        [-0.7540],
        [-0.9189],
        [-1.0837],
        [-1.2485],
        [-1.4134],
        [-1.5782]], device='cuda:0', grad_fn=<AddmmBackward0>)

##### NOTE : There is no training happening here, the outputs are based on randomly initialized weights

We can access the weights and biases of each layer

In [17]:
model.layer1.weight

Parameter containing:
tensor([[-0.2308]], device='cuda:0', requires_grad=True)

In [18]:
model.layer1.bias

Parameter containing:
tensor([0.7296], device='cuda:0', requires_grad=True)

#### Let us create a little more complicated architecture

In [19]:
class SimpleModel(nn.Module):

    def __init__(self):
        super(SimpleModel, self).__init__()
        self.layer1 = nn.Linear(10, 5) # We can pass in the atributes as positional arguments
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(5, 1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

In [20]:
net = SimpleModel().to(device)
net

SimpleModel(
  (layer1): Linear(in_features=10, out_features=5, bias=True)
  (relu): ReLU()
  (layer2): Linear(in_features=5, out_features=1, bias=True)
)

Let us test this architecture

In [21]:
input = torch.randn(1, 10, device='cuda') # This function samples random numbers from normal gaussian distribution

input

tensor([[ 0.0523,  0.2966,  0.2654, -0.8964, -0.1073,  0.1461, -0.5090, -0.8233,
         -0.0809, -0.7292]], device='cuda:0')

In [22]:
output = net(input)

output

tensor([[0.4151]], device='cuda:0', grad_fn=<AddmmBackward0>)

#### Accessing `weights` and `biases`

In [23]:
print(net.layer1.weight)
print(net.layer1.bias)
print(net.layer2.weight)
print(net.layer2.bias)

Parameter containing:
tensor([[ 0.2644, -0.0841, -0.1459,  0.2434,  0.1147, -0.1956, -0.1965, -0.2665,
          0.2801,  0.0398],
        [ 0.3009, -0.2574,  0.2046, -0.1350, -0.1687, -0.0546,  0.0036, -0.2978,
          0.1113, -0.0969],
        [ 0.1085, -0.0845, -0.0679,  0.0925,  0.2768,  0.1475, -0.1495,  0.1683,
         -0.1441, -0.0730],
        [-0.1590,  0.2901,  0.1041, -0.2682, -0.0775,  0.1011,  0.2388, -0.1736,
         -0.2450,  0.0178],
        [-0.2974, -0.0525, -0.0943,  0.1964, -0.1471,  0.1625, -0.1029,  0.2435,
          0.2896,  0.2747]], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.1117,  0.0657, -0.2407,  0.2795, -0.3070], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([[ 0.0062, -0.1048, -0.0991,  0.2970,  0.1173]], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([0.2661], device='cuda:0', requires_grad=True)


We can also access all the parameters of the architecture in a single function

In [24]:
net.parameters()

<generator object Module.parameters at 0x7f25285c4d60>

This is a generator object, we can iterate this to print its components

In [25]:
for params in net.parameters():
    print(params)

Parameter containing:
tensor([[ 0.2644, -0.0841, -0.1459,  0.2434,  0.1147, -0.1956, -0.1965, -0.2665,
          0.2801,  0.0398],
        [ 0.3009, -0.2574,  0.2046, -0.1350, -0.1687, -0.0546,  0.0036, -0.2978,
          0.1113, -0.0969],
        [ 0.1085, -0.0845, -0.0679,  0.0925,  0.2768,  0.1475, -0.1495,  0.1683,
         -0.1441, -0.0730],
        [-0.1590,  0.2901,  0.1041, -0.2682, -0.0775,  0.1011,  0.2388, -0.1736,
         -0.2450,  0.0178],
        [-0.2974, -0.0525, -0.0943,  0.1964, -0.1471,  0.1625, -0.1029,  0.2435,
          0.2896,  0.2747]], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.1117,  0.0657, -0.2407,  0.2795, -0.3070], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([[ 0.0062, -0.1048, -0.0991,  0.2970,  0.1173]], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([0.2661], device='cuda:0', requires_grad=True)


This is same as what we maually printed above, so the parameters function stores the parameters sequentially with respect to forward pass

Let us cross check the shape of each of the parameters

In [26]:
for params in net.parameters():
    print(params.shape)

torch.Size([5, 10])
torch.Size([5])
torch.Size([1, 5])
torch.Size([1])


### `nn.Sequential`

We could create same kind of neural net architecture using this `Sequential` module too

In [27]:
model = nn.Sequential(
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Linear(5, 1)
).to(device)

print(model)

Sequential(
  (0): Linear(in_features=10, out_features=5, bias=True)
  (1): ReLU()
  (2): Linear(in_features=5, out_features=1, bias=True)
)


There is no creation of class involved simple creation and calling similar to `TensorFlow`

In [28]:
input = torch.randn(1, 10, device='cuda')
output = model(input)
output

tensor([[-0.2947]], device='cuda:0', grad_fn=<AddmmBackward0>)

Here we haven't explicitly named the layers, so can't call these individual layers by name, but we can access it using its indices

In [29]:
print(model[0].weight)
print(model[0].bias)
print(model[2].weight)
print(model[2].bias)

Parameter containing:
tensor([[ 0.2549, -0.0182,  0.2610, -0.1604,  0.2725,  0.0594, -0.0114, -0.0512,
          0.2818,  0.0081],
        [-0.1267,  0.2653, -0.1773,  0.2910, -0.0359,  0.0135,  0.1166, -0.3002,
         -0.2791,  0.1083],
        [-0.2520,  0.2365, -0.1996,  0.2720, -0.2562, -0.2832,  0.2073, -0.0305,
         -0.1315, -0.1527],
        [-0.1308,  0.2541, -0.0111, -0.0506,  0.0829,  0.1642, -0.0962,  0.0690,
          0.0524, -0.1591],
        [-0.2239, -0.1286, -0.1554, -0.1998, -0.2868, -0.2321, -0.1363, -0.2854,
         -0.1585, -0.1241]], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.2813, -0.3044, -0.1827,  0.0463, -0.0414], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([[ 0.0906,  0.2802,  0.1067, -0.3017, -0.1425]], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([-0.2947], device='cuda:0', requires_grad=True)


When the flow of the network is not sequential or is little much complex, it is always better to prefer class method of initializing the network

## Training

In [30]:
class NewModel(nn.Module):
    def __init__(self):
        super(NewModel, self).__init__()
        self.layer1 = nn.Linear(10, 5)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(5, 1)
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

# Creating random inputs and targets
inputs = torch.randn(100, 10, device='cuda')
targets = torch.randn(100, 1, device='cuda')

# Initializing the model
net = NewModel().to(device)

# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

for epoch in range(1000):
    # deleting the gradients accumulated in the optimizer
    optimizer.zero_grad()

    # Forward pass
    outputs = net(inputs)

    # Loss computation
    loss = criterion(outputs, targets)

    # Backward pass
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print("Epoch :", epoch, "Loss :", loss.item())    

Epoch : 0 Loss : 1.1393691301345825
Epoch : 100 Loss : 1.0666722059249878
Epoch : 200 Loss : 1.0147347450256348
Epoch : 300 Loss : 0.9591621160507202
Epoch : 400 Loss : 0.9025018811225891
Epoch : 500 Loss : 0.8571661114692688
Epoch : 600 Loss : 0.8176199197769165
Epoch : 700 Loss : 0.7862420678138733
Epoch : 800 Loss : 0.7647578120231628
Epoch : 900 Loss : 0.7462576031684875


- `loss.backward()` will do the backprop and store all the gradients in the `variable.grad`
- `optimizer.step()` will take these gradients and update the weights and biases in the network

### Saving the model

In [31]:
torch.save(net, 'NewModel.pt')

## General steps to use an Optimizer

1. Zero the gradients (`optimizer.zero_grad()`)
2. Compute the output (**Forward Pass**)
3. Compute the loss
4. Perform Backpropagation (`loss.backward()`)
5. Update the parameters (`optimizer.step()`)

### ANN Pratical Example

In [32]:
import numpy as np

X = np.random.rand(100, 1)*10
true_w = 3
true_b = 2

y = true_w*X + true_b + np.random.randn(100, 1)
y

array([[11.63372971],
       [18.28983954],
       [24.43092073],
       [ 3.8195586 ],
       [12.76322617],
       [12.96709416],
       [ 6.96006959],
       [20.40287098],
       [11.94817018],
       [33.2537514 ],
       [17.35023922],
       [12.25429802],
       [10.73072924],
       [27.38448129],
       [ 4.54690019],
       [ 9.64828572],
       [ 4.68295233],
       [26.05124116],
       [23.25290643],
       [ 7.86587409],
       [ 8.61071717],
       [ 6.54809278],
       [ 6.07171251],
       [29.66525869],
       [20.5077538 ],
       [23.66472099],
       [26.43899279],
       [29.53215063],
       [10.98338677],
       [ 4.84543538],
       [ 7.63473347],
       [26.07908336],
       [ 6.82088007],
       [23.57632765],
       [21.59728661],
       [ 6.62908682],
       [13.69267172],
       [27.83709377],
       [26.42383826],
       [ 5.09547195],
       [ 8.65526774],
       [26.02229582],
       [ 2.85716174],
       [13.06541412],
       [12.40309881],
       [24

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [34]:
X_train = torch.tensor(X_train, dtype=torch.float32, device='cuda')
X_test = torch.tensor(X_test, dtype=torch.float32, device='cuda')
y_train = torch.tensor(y_train, dtype=torch.float32, device='cuda')
y_test = torch.tensor(y_test, dtype=torch.float32, device='cuda')

In [35]:
class LinearRegression(nn.Module):

  def __init__(self):
    super(LinearRegression, self).__init__()
    self.linear = nn.Linear(1, 1)

  def forward(self, x):
    return self.linear(x)

model = LinearRegression().to(device)

In [36]:
print(model.linear.weight.item())
print(model.linear.bias.item())

0.8373137712478638
-0.9680311679840088


In [37]:
criterion = nn.MSELoss()

optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1000):
  optimizer.zero_grad()
  outputs = model(X_train)
  loss = criterion(outputs, y_train)
  loss.backward()
  optimizer.step()

  if (epoch+1) % 100 == 0:
    print("Epoch :", epoch, "Loss :", loss.item())


Epoch : 99 Loss : 1.576372742652893
Epoch : 199 Loss : 1.267047643661499
Epoch : 299 Loss : 1.1508183479309082
Epoch : 399 Loss : 1.107145071029663
Epoch : 499 Loss : 1.090734601020813
Epoch : 599 Loss : 1.0845683813095093
Epoch : 699 Loss : 1.0822514295578003
Epoch : 799 Loss : 1.081380844116211
Epoch : 899 Loss : 1.0810539722442627
Epoch : 999 Loss : 1.0809309482574463


In [38]:
with torch.no_grad():
  pred = model(X_test)
  test_loss = criterion(pred, y_test)

print(test_loss)

tensor(0.7884, device='cuda:0')


`torch.no_grad()` is used to save memory, we should use it only for inference because `requires_grad=False` will be set by default for all the processes running inside the block of `torch.no_grad()`

In [39]:
print(model.linear.weight.item())
print(model.linear.bias.item())

3.023601770401001
1.7040432691574097


which is very close to the true weight and bias

### Real World Example

In [40]:
from sklearn.datasets import fetch_california_housing

dataset = fetch_california_housing()

dataset

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [41]:
dataset.data.shape

(20640, 8)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2)

X_train = torch.tensor(X_train, dtype=torch.float32, device='cuda')
X_test = torch.tensor(X_test, dtype=torch.float32, device='cuda')
y_train = torch.tensor(y_train, dtype=torch.float32, device='cuda')
y_test = torch.tensor(y_test, dtype=torch.float32, device='cuda')

In [43]:
class HousePriceNet(nn.Module):

  def __init__(self):
    super(HousePriceNet, self).__init__()
    self.layer1 = nn.Linear(8, 5)
    self.relu = nn.ReLU()
    self.layer2 = nn.Linear(5, 1)

  def forward(self, x):
    x = self.layer1(x)
    x = self.relu(x)
    x = self.layer2(x)
    return x
  
hpn = HousePriceNet().to(device)

In [44]:
mse = nn.MSELoss()

optimizer = optim.SGD(hpn.parameters(), lr=0.01)

for epoch in range(1000):
  optimizer.zero_grad()
  output = hpn(X_train)
  loss = mse(output.reshape(16512), y_train.reshape(16512))
  loss.backward()
  optimizer.step()

  if (epoch % 100) == 0:
    print("Epoch :", epoch, "Loss :", loss.item())


Epoch : 0 Loss : 18969.583984375
Epoch : 100 Loss : 863394004992.0
Epoch : 200 Loss : 15185329152.0
Epoch : 300 Loss : 267078944.0
Epoch : 400 Loss : 4697374.5
Epoch : 500 Loss : 82618.484375
Epoch : 600 Loss : 1454.4029541015625
Epoch : 700 Loss : 26.89315414428711
Epoch : 800 Loss : 1.7861838340759277
Epoch : 900 Loss : 1.3446040153503418


In [45]:
with torch.no_grad():
  pred = hpn(X_test)
  test_loss = criterion(pred, y_test)

print(test_loss)

tensor(1.3111, device='cuda:0')


  return F.mse_loss(input, target, reduction=self.reduction)


Thus we have constructed, trained and evaluated an ANN using Pytorch on a real-world dataset