In [1]:
import torch
import numpy as np

In [2]:
# Feature matrix: one row per sample, columns are (temp, rainfall, humidity).
inputs = np.array(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
    ],
    dtype=np.float32,
)

# Target matrix: one row per sample, columns are yields of (apples, oranges).
targets = np.array(
    [
        [56, 70],
        [81, 101],
        [119, 133],
        [22, 37],
        [103, 119],
    ],
    dtype=np.float32,
)

In [3]:
# Convert the NumPy arrays to tensors. torch.as_tensor wraps an ndarray the same
# way torch.from_numpy does (no copy), but it also accepts an existing tensor,
# so re-running this cell after `inputs`/`targets` were already converted no
# longer fails with "TypeError: expected np.ndarray (got Tensor)".
inputs = torch.as_tensor(inputs)
targets = torch.as_tensor(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


## Linear regression model from scratch
The weights and biases (w11, w12,... w23, b1 & b2) can also be represented as matrices, initialized as random values. The first row of w and the first element of b are used to predict the first target variable, i.e., yield of apples, and similarly, the second for oranges.

In [4]:
# Randomly initialised parameters tracked by autograd:
#   w has one row per target and one column per input feature -> (2, 3)
#   b has one entry per target, stored as a column               -> (2, 1)
# No seed is set in the visible cells, so these values differ on every run.
w = torch.randn(2, 3, requires_grad=True)
b = torch.randn(2, 1, requires_grad=True)
print(w)
print(b)

tensor([[-0.5126,  0.2130,  0.4192],
        [-0.1567,  1.8306,  0.0690]], requires_grad=True)
tensor([[-1.7059],
        [-1.4407]], requires_grad=True)


In [5]:
def model(x):
    """Linear model: multiply `x` by the transposed weights `w` and add the bias `b` as a row.

    Reads the notebook-level parameters `w` (2 x 3) and `b` (2 x 1), so the
    result has one column per target.
    """
    weights_t = w.t()
    bias_row = b.t()
    return x @ weights_t + bias_row

# Run the still-untrained model on the inputs and print its predictions
# followed by the targets, so the two can be compared by eye.
preds = model(inputs)
print(preds)
print(targets)

tensor([[ -6.8296, 112.7310],
        [ -2.7802, 149.8000],
        [  6.5517, 234.2193],
        [-29.3218,  63.8379],
        [ 12.7161, 168.3068]], grad_fn=<AddBackward0>)
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


## Loss function

In [6]:
def mse(pred, target):
    """Mean squared error between `pred` and `target`.

    Sums the squared element-wise differences and divides by the number of
    elements in `pred`. Returns a scalar tensor.
    """
    residual = pred - target
    return residual.pow(2).sum() / pred.numel()

# Loss of the untrained model's predictions against the targets.
loss = mse(preds, targets)
print(loss)

tensor(5200.0566, grad_fn=<DivBackward0>)


## Compute gradients

In [7]:
# Backpropagate the loss; this fills w.grad (printed below) and b.grad.
loss.backward()
print(w)
print(w.grad)

tensor([[-0.5126,  0.2130,  0.4192],
        [-0.1567,  1.8306,  0.0690]], requires_grad=True)
tensor([[-6691.5947, -7504.8809, -4560.8774],
        [ 4501.1748,  5321.6494,  3055.1663]])


In [8]:
# Gradient-descent step with a step size of 1e-4. torch.no_grad() keeps these
# in-place parameter updates from being tracked by autograd.
with torch.no_grad():
    w -= 1e-4 * w.grad
    b -= 1e-4 * b.grad

Before we proceed, we reset the gradients to zero by invoking the .zero_() method. We need to do this because PyTorch accumulates gradients. Otherwise, the next time we invoke .backward on the loss, the new gradient values are added to the existing gradients, which may lead to unexpected results.

In [9]:
# Clear the accumulated gradients in place, then show the updated parameters.
w.grad.zero_()
b.grad.zero_()
print(w)
print(b)

tensor([[ 0.1566,  0.9635,  0.8753],
        [-0.6069,  1.2984, -0.2365]], requires_grad=True)
tensor([[-1.6978],
        [-1.4461]], requires_grad=True)


In [10]:
# Predictions with the parameters after the first update step.
preds = model(inputs)

In [11]:
# Loss for the refreshed predictions.
loss = mse(preds, targets)

In [12]:
# Second backward pass. The gradients were zeroed beforehand, so w.grad holds
# only the gradient of this loss.
loss.backward()
w.grad

tensor([[ 5574.2925,  5691.5181,  3578.6870],
        [-3938.3252, -3774.3613, -2550.5220]])

In [13]:
# One complete update: move each parameter against its gradient (step size
# 1e-4) and clear that gradient, all without autograd tracking.
with torch.no_grad():
    w.sub_(1e-4 * w.grad)
    w.grad.zero_()
    b.sub_(1e-4 * b.grad)
    b.grad.zero_()

print(w)
print(b)

tensor([[-0.4009,  0.3943,  0.5174],
        [-0.2130,  1.6758,  0.0185]], requires_grad=True)
tensor([[-1.7044],
        [-1.4415]], requires_grad=True)


## Train the model using gradient descent
As seen above, we reduce the loss and improve our model using the gradient descent optimization algorithm. Thus, we can train the model using the following steps:

Generate predictions

Calculate the loss

Compute gradients w.r.t the weights and biases

Adjust the weights by subtracting a small quantity proportional to the gradient

Reset the gradients to zero


## Train for multiple epochs

In [14]:
# Full-batch gradient descent: repeat the predict / loss / backward / update /
# reset cycle 1000 times on the whole training set.
for i in range(1000):
    # Forward pass and loss.
    preds = model(inputs)
    loss = mse(preds, targets)
    # Backward pass fills w.grad and b.grad.
    loss.backward()
    # Update each parameter in place, then clear its gradient for the next pass.
    with torch.no_grad():
        w.sub_(1e-4 * w.grad)
        w.grad.zero_()
        b.sub_(1e-4 * b.grad)
        b.grad.zero_()

In [15]:
# Loss after training, computed from a fresh forward pass with the final parameters.
loss = mse(model(inputs), targets)
print(loss)

tensor(0.4856, grad_fn=<DivBackward0>)


In [16]:
# `preds` was last assigned inside the training loop, before that iteration's
# parameter update, so it does not reflect the final weights. Recompute it so
# the predictions shown here match the trained model.
preds = model(inputs)
print(preds)
print(targets)

tensor([[ 56.8799,  70.0685],
        [ 82.3860, 100.8206],
        [118.7389, 132.9985],
        [ 21.1173,  37.0459],
        [101.8752, 119.0983]], grad_fn=<AddBackward0>)
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


## Linear regression using PyTorch built-ins

In [17]:
import torch.nn as nn

In [18]:
# Training set for the built-ins section: 15 samples.
# Input columns are (temp, rainfall, humidity); target columns are (apples, oranges).
# Each array is built as float32 and wrapped as a tensor in a single expression.
inputs = torch.from_numpy(
    np.array(
        [
            [73, 67, 43],
            [91, 88, 64],
            [87, 134, 58],
            [102, 43, 37],
            [69, 96, 70],
            [74, 66, 43],
            [91, 87, 65],
            [88, 134, 59],
            [101, 44, 37],
            [68, 96, 71],
            [73, 66, 44],
            [92, 87, 64],
            [87, 135, 57],
            [103, 43, 36],
            [68, 97, 70],
        ],
        dtype=np.float32,
    )
)

targets = torch.from_numpy(
    np.array(
        [
            [56, 70],
            [81, 101],
            [119, 133],
            [22, 37],
            [103, 119],
            [57, 69],
            [80, 102],
            [118, 132],
            [21, 38],
            [104, 118],
            [57, 69],
            [82, 100],
            [118, 134],
            [20, 38],
            [102, 120],
        ],
        dtype=np.float32,
    )
)

In [19]:
# Display the 15 x 3 input tensor.
inputs

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 74.,  66.,  43.],
        [ 91.,  87.,  65.],
        [ 88., 134.,  59.],
        [101.,  44.,  37.],
        [ 68.,  96.,  71.],
        [ 73.,  66.,  44.],
        [ 92.,  87.,  64.],
        [ 87., 135.,  57.],
        [103.,  43.,  36.],
        [ 68.,  97.,  70.]])

## Dataset and DataLoader

In [20]:
from torch.utils.data import TensorDataset as td

# Pair each input row with its target row in a single dataset object.
train_ds = td(inputs, targets)
# Indexing yields a tuple (inputs, targets); here, the first five rows of each.
train_ds[:5, :]

(tensor([[ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.],
         [102.,  43.,  37.],
         [ 69.,  96.,  70.]]),
 tensor([[ 56.,  70.],
         [ 81., 101.],
         [119., 133.],
         [ 22.,  37.],
         [103., 119.]]))

The TensorDataset allows us to access a small section of the training data using the array indexing notation ([:5, :] in the above code, which selects the first five rows). It returns a tuple with two elements. The first element contains the input variables for the selected rows, and the second contains the targets.

In [21]:
from torch.utils.data import DataLoader

# Serve the dataset in shuffled mini-batches of five samples.
batch_size = 5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Pull just the first mini-batch and show its inputs and targets.
xb, yb = next(iter(train_dl))
print(xb)
print(yb)

tensor([[ 87., 135.,  57.],
        [101.,  44.,  37.],
        [ 74.,  66.,  43.],
        [ 91.,  87.,  65.],
        [103.,  43.,  36.]])
tensor([[118., 134.],
        [ 21.,  38.],
        [ 57.,  69.],
        [ 80., 102.],
        [ 20.,  38.]])


In each iteration, the data loader returns one batch of data with the given batch size. If shuffle is set to True, it shuffles the training data before creating batches. Shuffling helps randomize the input to the optimization algorithm, leading to a faster reduction in the loss.

## nn.Linear

In [22]:
# Built-in linear layer with 3 input features and 2 outputs.
# NOTE: this rebinds `model`, replacing the hand-written model() function
# defined earlier in the notebook.
model = nn.Linear(3, 2)
print(model.weight)
print(model.bias)

Parameter containing:
tensor([[ 0.1077,  0.4328, -0.3678],
        [-0.1171,  0.1437, -0.0890]], requires_grad=True)
Parameter containing:
tensor([ 0.4773, -0.3074], requires_grad=True)


In [23]:
# List the layer's trainable parameters: the weight matrix and the bias vector.
list(model.parameters())

[Parameter containing:
 tensor([[ 0.1077,  0.4328, -0.3678],
         [-0.1171,  0.1437, -0.0890]], requires_grad=True),
 Parameter containing:
 tensor([ 0.4773, -0.3074], requires_grad=True)]

In [24]:
# Predictions of the freshly initialised nn.Linear model for all 15 inputs.
preds = model(inputs)
preds

tensor([[21.5272, -3.0514],
        [24.8325, -4.0100],
        [46.5189,  3.6047],
        [16.4700, -9.3619],
        [23.7184, -0.8189],
        [21.2021, -3.3123],
        [24.0319, -4.2428],
        [46.2589,  3.3986],
        [16.7951, -9.1011],
        [23.2429, -0.7909],
        [20.7266, -3.2842],
        [24.5074, -4.2708],
        [47.3196,  3.8375],
        [16.9455, -9.3899],
        [24.0435, -0.5581]], grad_fn=<AddmmBackward>)

In [25]:
# The nn.functional package contains many useful loss functions and several other utilities.
import torch.nn.functional as F
# Use the built-in mean-squared-error loss.
loss_fn = F.mse_loss

# Loss of the untrained nn.Linear model on the full training set.
loss = loss_fn(model(inputs), targets)
loss

tensor(6551.1187, grad_fn=<MseLossBackward>)

## Optimizer
Instead of manually manipulating the model's weights & biases using gradients, we can use the optimizer optim.SGD. SGD is short for "stochastic gradient descent". The term stochastic indicates that samples are selected in random batches instead of as a single group.

In [26]:
# SGD optimizer over the model's parameters with a learning rate of 1e-4.
# fit() below reads this notebook-level `opt` directly.
opt = torch.optim.SGD(model.parameters(), lr = 1e-4)

Note that model.parameters() is passed as an argument to optim.SGD so that the optimizer knows which matrices should be modified during the update step. Also, we can specify a learning rate that controls the amount by which the parameters are modified.

## Train the model
We are now ready to train the model. We'll follow the same process to implement gradient descent:

Generate predictions

Calculate the loss

Compute gradients w.r.t the weights and biases

Adjust the weights by subtracting a small quantity proportional to the gradient

Reset the gradients to zero

The only change is that we'll work with batches of data instead of processing the entire training data in every iteration. Let's define a utility function fit that trains the model for a given number of epochs.



In [31]:
def fit(num_epochs, model, loss_fn, train_dl, optimizer=None, log_every=20):
    """Train `model` with mini-batch gradient descent.

    Args:
        num_epochs: Number of full passes over `train_dl`.
        model: Callable mapping a batch of inputs to predictions.
        loss_fn: Callable `(pred, target)` returning a scalar tensor that
            supports `.backward()`.
        train_dl: Iterable yielding `(inputs, targets)` batches.
        optimizer: Object exposing `step()` and `zero_grad()`. When omitted,
            the notebook-level `opt` is used, which is what this function
            always did before the parameter existed.
        log_every: Print the most recent batch loss every `log_every` epochs.
            Pass 0 or None to disable logging.

    Returns:
        None. The parameters are updated in place through the optimizer.
    """
    if optimizer is None:
        # Backward-compatible fallback to the optimizer defined in the notebook.
        optimizer = opt

    for epoch in range(num_epochs):
        # Reset per epoch so an epoch with no batches does not log a stale
        # (or undefined) loss.
        loss = None
        for x, y in train_dl:
            pred = model(x)
            loss = loss_fn(pred, y)
            loss.backward()
            optimizer.step()       # Update parameters using gradients
            optimizer.zero_grad()  # Reset the gradients to zero
        if log_every and loss is not None and (epoch + 1) % log_every == 0:
            # The value shown is the loss of the last batch of this epoch.
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

In [32]:
# Train for 100 epochs on the shuffled mini-batches; the loss is printed every 20th epoch.
fit(100, model, loss_fn, train_dl)

Epoch [20/100], Loss: 4.0778
Epoch [40/100], Loss: 1.7075
Epoch [60/100], Loss: 3.8061
Epoch [80/100], Loss: 1.8443
Epoch [100/100], Loss: 2.7912


In [29]:
# Predict (apples, oranges) for a new sample: temp 75, rainfall 63, humidity 44.
# NOTE(review): this cell's execution count (29) is lower than the fit cells
# (31/32), so the saved output may come from an earlier training run — re-run
# after fit() to confirm.
model(torch.tensor([[75, 63, 44.]]))

tensor([[53.7124, 69.7163]], grad_fn=<AddmmBackward>)