# Week 6: Gradient Descent

# Rasika Bhalerao

# Agenda

- Gradient descent in Pytorch

In [2]:
import numpy as np
import torch
from sklearn.metrics import f1_score

### Pytorch is powerful!
- Open source Python package for machine learning
- Easy to use with GPU (or CPU)
- Computation is done with tensors
- You can think of it as Numpy with extra capabilities


### Pytorch basics

In [3]:
# Some data
x = torch.tensor([3.])
y = torch.tensor([1.])

# Initialize some weights
a = torch.tensor([4.], requires_grad=True)
b = torch.tensor([5.], requires_grad=True)

# Define model and loss
y_hat = a * x + b
loss = (y - y_hat)**2

In [4]:
print(x)
print(y)
print(a)
print(b)
print(y_hat)
print(loss)

tensor([3.])
tensor([1.])
tensor([4.], requires_grad=True)
tensor([5.], requires_grad=True)
tensor([17.], grad_fn=<AddBackward0>)
tensor([256.], grad_fn=<PowBackward0>)


In [5]:
print(a.grad)
print(b.grad)
loss.backward() # this tells it to actually calculate the gradient
print(a.grad)
print(b.grad)

None
None
tensor([96.])
tensor([32.])


In [6]:
# This would be one iteration of updating the weights using the gradient
a = a - 0.01 * a.grad
b = b - 0.01 * b.grad
print(a)
print(b)

tensor([3.0400], grad_fn=<SubBackward0>)
tensor([4.6800], grad_fn=<SubBackward0>)


In [7]:
y_hat = a * x + b
loss = (y - y_hat)**2
print(y_hat) # 13.8, closer to the target 1 than it was before (17)
print(loss) # 163.8, smaller than it was before (256)

tensor([13.8000], grad_fn=<AddBackward0>)
tensor([163.8400], grad_fn=<PowBackward0>)


### Classification via (Stochastic) Gradient Descent in Pytorch

In [8]:
# Some data
x = torch.rand(50, 3)          # 50 rows, 3 features
y = torch.randint(0, 3, (50,)) # 50 rows, 3 possible categories

x_train = x[:40]
x_test = x[40:]
y_train = y[:40]
y_test = y[40:]

In [9]:
# DataLoader
batch_size = 5 # it will iterate through with batches of 5 rows

train_dataset = [[x_train[i], y_train[i]] for i in range(len(x_train))]
test_dataset = [[x_test[i], y_test[i]] for i in range(len(x_test))]

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size)

In [None]:
# First step of training: implementing epochs
epochs = 300 # maximum epochs
early_stop_epochs = 3 # stop iterating when loss doesn't decrease for 3 epochs
prev_loss = 9999999

nondecreasing = 0
for epoch in range(epochs):
    loss = torch.rand(1) # for now it is random - implement below!
    print(f'Epoch {epoch} loss: {loss.item()}')
    
    if prev_loss - loss <= 1e-3:
        nondecreasing += 1
    else:
        nondecreasing = 0
    
    if nondecreasing >= early_stop_epochs:
        break
        
    prev_loss = loss

Epoch 0 loss: 0.4107988476753235
Epoch 1 loss: 0.5948238968849182
Epoch 2 loss: 0.20232146978378296
Epoch 3 loss: 0.7460334300994873
Epoch 4 loss: 0.12138360738754272
Epoch 5 loss: 0.8712044358253479
Epoch 6 loss: 0.694099485874176
Epoch 7 loss: 0.45883089303970337
Epoch 8 loss: 0.985586941242218
Epoch 9 loss: 0.6581254005432129
Epoch 10 loss: 0.6086974740028381
Epoch 11 loss: 0.06593376398086548
Epoch 12 loss: 0.8101882338523865
Epoch 13 loss: 0.02086097002029419
Epoch 14 loss: 0.9142488837242126
Epoch 15 loss: 0.14593613147735596
Epoch 16 loss: 0.4454752802848816
Epoch 17 loss: 0.5019964575767517
Epoch 18 loss: 0.9360269904136658


In [None]:
# Second step of training: add batches

epochs = 300
early_stop_epochs = 3
prev_loss = 9999999

nondecreasing = 0
for epoch in range(epochs):
    
    #### start of code added for batches ####
    total_loss = 0
    for [batch_x, batch_y] in train_loader:
        
        batch_loss = 0
        for x, y in zip(batch_x, batch_y):
            batch_loss += torch.rand(1)
        total_loss += batch_loss
        # This is where we would do a backwards pass with batch_loss
        
    #### end of code added for batches ####

    print(f'Epoch {epoch} loss: {total_loss.item()}')
    
    if prev_loss - total_loss <= 1e-3:
        nondecreasing += 1
    else:
        nondecreasing = 0
    
    if nondecreasing >= early_stop_epochs:
        break
        
    prev_loss = total_loss

Epoch 0 loss: 21.56624412536621
Epoch 1 loss: 21.683334350585938
Epoch 2 loss: 22.559980392456055
Epoch 3 loss: 20.69846534729004
Epoch 4 loss: 21.97745132446289
Epoch 5 loss: 17.83310317993164
Epoch 6 loss: 19.787073135375977
Epoch 7 loss: 17.591415405273438
Epoch 8 loss: 18.504169464111328
Epoch 9 loss: 21.17566680908203
Epoch 10 loss: 18.670026779174805
Epoch 11 loss: 20.925670623779297
Epoch 12 loss: 16.878910064697266
Epoch 13 loss: 19.584733963012695
Epoch 14 loss: 16.968475341796875
Epoch 15 loss: 18.290822982788086
Epoch 16 loss: 19.382844924926758
Epoch 17 loss: 18.766212463378906
Epoch 18 loss: 18.053171157836914
Epoch 19 loss: 22.089073181152344
Epoch 20 loss: 20.175006866455078
Epoch 21 loss: 22.43926239013672
Epoch 22 loss: 15.905424118041992
Epoch 23 loss: 18.494518280029297
Epoch 24 loss: 20.5318603515625
Epoch 25 loss: 16.755041122436523
Epoch 26 loss: 18.540491104125977
Epoch 27 loss: 19.663827896118164
Epoch 28 loss: 19.87942123413086


In [None]:
# Third step of training: use actual model loss
lr = 0.01 # learning rate
weights = torch.nn.Linear(3, 3) # initialize weights for 3 features, 3 outputs
softmax_fn = torch.nn.Softmax(dim=1) # this returns a function that does softmax
loss_fn = torch.nn.functional.cross_entropy # loss function, takes input and target arrays, returns numerical loss
optimizer = torch.optim.SGD(weights.parameters(), lr=lr) # this will do SGD for us


epochs = 300
early_stop_epochs = 4
prev_loss = 9999999

nondecreasing = 0
for epoch in range(epochs):
    
    total_loss = 0
    for [batch_x, batch_y] in train_loader:
        
        #### start of code added for model ####
        optimizer.zero_grad() # forget gradients from previous iteration
        output = weights(batch_x) # multiplication part of model
        softmaxed_output = softmax_fn(output) # softmax part of model
        batch_loss = loss_fn(softmaxed_output, batch_y) # calculate loss
        
        total_loss += batch_loss # for record keeping
        
        batch_loss.backward() # calculate gradients
        optimizer.step() # do the step with lr

        #### end of code added for model ####
        
    print(f'Epoch {epoch} loss: {total_loss.item()}')
    
    if prev_loss - total_loss <= 1e-3:
        nondecreasing += 1
    else:
        nondecreasing = 0
    
    if nondecreasing >= early_stop_epochs:
        break
        
    prev_loss = total_loss

Epoch 0 loss: 8.959355354309082
Epoch 1 loss: 8.952441215515137
Epoch 2 loss: 8.945649147033691
Epoch 3 loss: 8.938977241516113
Epoch 4 loss: 8.932424545288086
Epoch 5 loss: 8.92598819732666
Epoch 6 loss: 8.91966724395752
Epoch 7 loss: 8.913461685180664
Epoch 8 loss: 8.907366752624512
Epoch 9 loss: 8.901384353637695
Epoch 10 loss: 8.89551067352295
Epoch 11 loss: 8.88974380493164
Epoch 12 loss: 8.88408374786377
Epoch 13 loss: 8.878527641296387
Epoch 14 loss: 8.873072624206543
Epoch 15 loss: 8.867718696594238
Epoch 16 loss: 8.862462997436523
Epoch 17 loss: 8.857304573059082
Epoch 18 loss: 8.852241516113281
Epoch 19 loss: 8.847271919250488
Epoch 20 loss: 8.842394828796387
Epoch 21 loss: 8.837606430053711
Epoch 22 loss: 8.832908630371094
Epoch 23 loss: 8.828295707702637
Epoch 24 loss: 8.823768615722656
Epoch 25 loss: 8.819324493408203
Epoch 26 loss: 8.814961433410645
Epoch 27 loss: 8.81067943572998
Epoch 28 loss: 8.806475639343262
Epoch 29 loss: 8.802349090576172
Epoch 30 loss: 8.798297882

In [None]:
# Last part of training: print F1 score every epoch
lr = 0.01
weights = torch.nn.Linear(3, 3) # 3 features, 3 outputs
softmax_fn = torch.nn.Softmax(dim=1) # this returns a function that does softmax
loss_fn = torch.nn.functional.cross_entropy # this is a function that takes parameters input and target
optimizer = torch.optim.SGD(weights.parameters(), lr=lr) # this will do SGD for us


epochs = 300
early_stop_epochs = 4
prev_loss = 9999999

nondecreasing = 0
for epoch in range(epochs):
    
    total_loss = 0
    y_pred = None
    for [batch_x, batch_y] in train_loader:
        
        optimizer.zero_grad()
        output = weights(batch_x)
        softmaxed_output = softmax_fn(output)
        batch_loss = loss_fn(softmaxed_output, batch_y)
        
        total_loss += batch_loss
        
        batch_loss.backward()
        optimizer.step()
        
        #### start of code added for F1 ####
        _, predicted_labels = torch.max(softmaxed_output.data, 1)
        if y_pred is not None:
            y_pred = torch.cat((y_pred, predicted_labels), 0)
        else:
            y_pred = predicted_labels
    
    f1 = f1_score(y_train, y_pred, average='micro')
    #### end of code added for F1 ####
    print(f'Epoch {epoch} loss: {total_loss.item()}, F1: {f1}')
    
    if prev_loss - total_loss <= 1e-5:
        nondecreasing += 1
    else:
        nondecreasing = 0
    
    if nondecreasing >= early_stop_epochs:
        break
        
    prev_loss = total_loss

Epoch 0 loss: 8.806382179260254, F1: 0.35
Epoch 1 loss: 8.801663398742676, F1: 0.325
Epoch 2 loss: 8.79702091217041, F1: 0.35
Epoch 3 loss: 8.792455673217773, F1: 0.35
Epoch 4 loss: 8.787962913513184, F1: 0.325
Epoch 5 loss: 8.783543586730957, F1: 0.325
Epoch 6 loss: 8.779196739196777, F1: 0.35
Epoch 7 loss: 8.774919509887695, F1: 0.35
Epoch 8 loss: 8.770710945129395, F1: 0.4000000000000001
Epoch 9 loss: 8.766570091247559, F1: 0.4000000000000001
Epoch 10 loss: 8.762496948242188, F1: 0.45
Epoch 11 loss: 8.7584867477417, F1: 0.4000000000000001
Epoch 12 loss: 8.754541397094727, F1: 0.4000000000000001
Epoch 13 loss: 8.75065803527832, F1: 0.4000000000000001
Epoch 14 loss: 8.746835708618164, F1: 0.4000000000000001
Epoch 15 loss: 8.743074417114258, F1: 0.375
Epoch 16 loss: 8.739371299743652, F1: 0.375
Epoch 17 loss: 8.735727310180664, F1: 0.375
Epoch 18 loss: 8.732138633728027, F1: 0.375
Epoch 19 loss: 8.728606224060059, F1: 0.375
Epoch 20 loss: 8.725129127502441, F1: 0.4000000000000001
Epoch

In [None]:
# Score on the test set
# Use the weights learned and stored in variable weights, run it on test set

with torch.no_grad(): # for speed - we no longer care about gradients
    total_loss = 0
    y_pred = None
    for [batch_x, batch_y] in test_loader:
        
        output = weights(batch_x)
        softmaxed_output = softmax_fn(output)
        batch_loss = loss_fn(softmaxed_output, batch_y)
        total_loss += batch_loss
        
        _, predicted_labels = torch.max(softmaxed_output.data, 1)
        if y_pred is not None:
            y_pred = torch.cat((y_pred, predicted_labels), 0)
        else:
            y_pred = predicted_labels
    
    f1 = f1_score(y_test, y_pred, average='micro')
    print(f'Test set loss: {total_loss.item()}, F1: {f1}')

Test set loss: 2.1902618408203125, F1: 0.5


### Questions

The above code implements a classifier (categorical output). How would we modify it to do regression (continuous output)?

You will need:
- MSE loss instead of softmax + cross-entropy loss
- Change the labels in y_train and y_test to numerical