<a href="https://colab.research.google.com/github/GregLed/fastai_2022/blob/main/fastai_lec13_14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close
from urllib.request import urlretrieve

torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

In [2]:
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'


if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

In [3]:
with gzip.open(path_gz, 'rb') as f:
     ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

# Initial setup

In [None]:
n,m = x_train.shape
c = y_train.max()+1
nh = 50

print(n, m)

50000 784


In [None]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)]

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x

In [None]:
model = Model(m, nh, 10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

# Cross entropy loss

In [None]:
pred.exp().sum(axis=-1, keepdim=True).shape

torch.Size([50000, 1])

In [None]:
prob = (pred.exp() / pred.exp().sum(axis=-1, keepdim=True))
prob.shape

torch.Size([50000, 10])

In [None]:
log_prob = prob.log()
log_prob

tensor([[-2.37, -2.49, -2.36,  ..., -2.31, -2.28, -2.22],
        [-2.37, -2.44, -2.44,  ..., -2.27, -2.26, -2.16],
        [-2.48, -2.33, -2.28,  ..., -2.30, -2.30, -2.27],
        ...,
        [-2.33, -2.52, -2.34,  ..., -2.31, -2.21, -2.16],
        [-2.38, -2.38, -2.33,  ..., -2.29, -2.26, -2.17],
        [-2.33, -2.55, -2.36,  ..., -2.29, -2.27, -2.16]], grad_fn=<LogBackward0>)

In [None]:
def log_softmax(x):
    return (x.exp()/(x.exp().sum(-1,keepdim=True))).log()

In [None]:
# we can use logarithm property to split division into deduction
def log_softmax(x):
    return x - x.exp().sum(-1,keepdim=True).log()

In [None]:
# denominator can be problematic because of floating numbers are not stable
# for big numbers. We can find max for each row and deduct it from other vals
def logsumexp(x):
    m = x.max(-1).values
    return m + (x-m[:,None]).exp().sum(-1).log()

In [None]:
pred.max(-1).values

tensor([0.10, 0.14, 0.21,  ..., 0.14, 0.11, 0.14], grad_fn=<MaxBackward0>)

In [None]:
def log_softmax(x):
    return x - x.logsumexp(-1,keepdim=True)

In [None]:
test_close(logsumexp(pred), pred.logsumexp(-1))
log_sm_pred = log_softmax(pred)
log_sm_pred

tensor([[-2.37, -2.49, -2.36,  ..., -2.31, -2.28, -2.22],
        [-2.37, -2.44, -2.44,  ..., -2.27, -2.26, -2.16],
        [-2.48, -2.33, -2.28,  ..., -2.30, -2.30, -2.27],
        ...,
        [-2.33, -2.52, -2.34,  ..., -2.31, -2.21, -2.16],
        [-2.38, -2.38, -2.33,  ..., -2.29, -2.26, -2.17],
        [-2.33, -2.55, -2.36,  ..., -2.29, -2.27, -2.16]], grad_fn=<SubBackward0>)

# Cross entropy

In [None]:
-log_sm_pred[range(y_train.shape[0]), y_train].mean()

tensor(2.30, grad_fn=<NegBackward0>)

In [None]:
# Note that original math formula uses sum and we use mean to conrol the scale.
# Mathematically both have the same max or min and sum is just divided by a
# constant which is n-examples (targets.shape[0])
def nll(log_prob, targets):
    return -log_prob[range(targets.shape[0]), targets].mean()

loss = nll(log_sm_pred, y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [None]:
# Pytorch implementation
test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)
test_close(F.cross_entropy(pred, y_train), loss, 1e-3)

# Basic training loop

In [None]:
loss_func = F.cross_entropy

In [None]:
bs=50                  # batch size
lr = 0.5               # learning rate
epochs = 3             # how many epochs to train for
n = y_train.shape[0]

xb = x_train[0:bs]     # a mini-batch from x
yb = y_train[0:bs]

preds = model(xb)      # predictions
preds[0], preds.shape

(tensor([-0.09, -0.21, -0.08,  0.10, -0.04,  0.08, -0.04, -0.03,  0.01,  0.06], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [None]:
loss_func(preds, yb)

tensor(2.30, grad_fn=<NllLossBackward0>)

In [None]:
# To get prediction for each example we can use argmax across columns
preds.argmax(dim=1)

tensor([3, 9, 3, 8, 5, 9, 3, 9, 3, 9, 5, 3, 9, 9, 3, 9, 9, 5, 8, 7, 9, 5, 3, 8, 9, 5, 9, 5, 5, 9, 3, 5, 9, 7, 5, 7, 9, 9, 3, 9, 3, 5, 3, 8,
        3, 5, 9, 5, 9, 5])

In [None]:
def accuracy(out, yb): return (out.argmax(dim=1)==yb).float().mean()
def report(loss, preds, yb): print(f'{loss:.2f}, {accuracy(preds, yb):.2f}')

accuracy(preds, yb)

tensor(0.08)

In [None]:
report(loss_func(preds, yb), preds, yb)

2.30, 0.08


In [None]:
model.layers

[Linear(in_features=784, out_features=50, bias=True),
 ReLU(),
 Linear(in_features=50, out_features=10, bias=True)]

In [None]:
model = Model(m, nh, 10)

for _ in range(epochs):
    for i in range(0, n, bs):
        # get slice of data
        s = slice(i, min(i+bs,n))
        xb = x_train[s]
        yb = y_train[s]

        # calculate predictions, loss and gradients
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()

        # update weights of the model
        with torch.no_grad():
            for layer in model.layers:
                if hasattr(layer, 'weight'):
                    layer.weight -= lr * layer.weight.grad
                    layer.bias -= lr * layer.bias.grad
                    layer.weight.grad.zero_()
                    layer.bias.grad.zero_()

    report(loss, pred, yb)

0.13, 0.96
0.08, 0.98
0.09, 0.98


In [None]:
pred.shape

torch.Size([50, 10])

# DataLoader and Sampler

In [None]:
a = [(1,2), (3,4), (5,6), (7,8)]
x, y = zip(*a)


In [None]:
x, y

((1, 3, 5, 7), (2, 4, 6, 8))

In [None]:
x = [torch.tensor(i) for i in x]
x

[tensor(1), tensor(3), tensor(5), tensor(7)]

In [None]:
torch.stack(x)

tensor([1, 3, 5, 7])

# zip vs zip_longest

In [None]:
a = (1,2,3,4)
b = (5,6,7)

list(zip(a,b))


[(1, 5), (2, 6), (3, 7)]

In [None]:
from itertools import zip_longest

In [None]:
list(zip_longest(a, b))

[(1, 5), (2, 6), (3, 7), (4, None)]

In [None]:
! ls ./sample_data/

anscombe.json		     california_housing_train.csv  mnist_train_small.csv
california_housing_test.csv  mnist_test.csv		   README.md
