# Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt

In [2]:
import sys
sys.path.append('../')
from exp.nb_01 import *


def get_data():
    """Download the gzipped MNIST pickle and return its train/valid arrays as tensors.

    The archive is fetched with ``datasets.download_data(MNIST_URL, ext='.gz')``
    and un-pickled with ``latin-1`` encoding. The third element of the pickled
    triple is discarded.

    Returns:
        A lazy ``map`` object yielding ``x_train, y_train, x_valid, y_valid``,
        each passed through ``tensor``. Being an iterator, it can be unpacked
        only once.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive, 'rb') as fh:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # this is only safe while MNIST_URL points at a trusted source.
        train_pair, valid_pair, _ = pickle.load(fh, encoding='latin-1')
    x_train, y_train = train_pair
    x_valid, y_valid = valid_pair
    return map(tensor, (x_train, y_train, x_valid, y_valid))


def normalize(x, m, s):
    """Shift ``x`` by ``m`` and divide by ``s``, i.e. ``(x - m) / s``."""
    centered = x - m
    return centered / s


def test_near_zero(a, tol=1e-3):
    assert a.abs() < tol, f"Near zero: {a}"


def mse(output, targ):
    """Mean squared error between ``output`` (trailing unit dim squeezed) and ``targ``."""
    residual = output.squeeze(-1) - targ
    squared = residual.pow(2)
    return squared.mean()

# Loss

In [3]:
def softmax(x):
    """Softmax over the last dimension of ``x``.

    The row-wise maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``exp`` from overflowing to
    ``inf`` (and the ratio from becoming ``nan``) for large inputs. The
    exponentials are computed once and reused for the normaliser.

    Args:
        x: tensor whose last dimension holds the scores to normalise.

    Returns:
        Tensor of the same shape as ``x`` whose entries sum to 1 along the
        last dimension.
    """
    shifted = x - x.max(-1, keepdim=True)[0]
    exps = shifted.exp()
    return exps / exps.sum(-1, keepdim=True)

In [4]:
def log_softmax(x):
    """Log-softmax over the last dimension, computed directly as ``x - log(sum(exp(x)))``.

    ``exp`` is applied to the raw inputs without any shifting.
    """
    log_normaliser = x.exp().sum(-1, keepdim=True).log()
    return x - log_normaliser

In [5]:
def log_softmax(x):
    """Log-softmax over the last dimension using the log-sum-exp trick.

    ``log(sum(exp(x)))`` is evaluated as ``a + log(sum(exp(x - a)))`` with
    ``a`` the maximum along the last dimension, so ``exp`` never sees a
    positive argument and cannot overflow.

    The reductions keep the reduced dimension (``keepdim=True``) so the
    normaliser broadcasts against ``x`` row by row. Without it, an ``(N, C)``
    input is combined with an ``(N,)`` normaliser, which either fails to
    broadcast or, when ``N == C``, silently subtracts the wrong row's value.

    Args:
        x: tensor whose last dimension holds the scores.

    Returns:
        Tensor of the same shape as ``x``.
    """
    a = x.max(-1, keepdim=True)[0]
    log_sum_exp = a + (x - a).exp().sum(-1, keepdim=True).log()
    return x - log_sum_exp

In [6]:
def nll(inp, target):
    """Negative mean of ``inp[i, target[i]]`` taken over every row ``i``."""
    rows = range(len(inp))
    picked = inp[rows, target]
    return -picked.mean()

In [7]:
# Toy check: argmax over dim=1 gives, for each row, the column index of its largest entry.
a = torch.tensor([[1, 2], [4, 3]])
a.argmax(dim=1)

tensor([1, 0])

In [8]:
def accuracy(out, target):
    """Fraction of rows of ``out`` whose argmax along dim 1 equals ``target``."""
    predicted = out.argmax(dim=1)
    hits = (predicted == target).float()
    return hits.mean()

# Model

In [9]:
class DummyModel:
    """Hand-rolled two-layer container that tracks its layers in ``_modules``.

    Every attribute whose name does not start with an underscore is recorded
    in the ``_modules`` dict as it is assigned, and ``parameters()`` walks
    that dict.
    """

    def __init__(self, n_in, nh, n_out):
        # _modules must exist before l1/l2 are assigned: __setattr__ writes into it.
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)

    def __setattr__(self, k, v):
        """Register public attributes in ``_modules``, then set them normally."""
        is_private = k.startswith('_')
        if not is_private:
            self._modules[k] = v
        super().__setattr__(k, v)

    def __repr__(self):
        """Show the registry dict as the object's representation."""
        return str(self._modules)

    def parameters(self):
        """Yield the parameters of each registered entry, in registration order."""
        for layer in self._modules.values():
            yield from layer.parameters()

In [10]:
# Instantiate the hand-rolled container; the bare name displays it through DummyModel.__repr__.
model = DummyModel(10, 50, 1)
model

{'l1': Linear(in_features=10, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=1, bias=True)}

In [11]:
# parameters() is a generator; list() materialises it so the tensors are displayed.
list(model.parameters())

[Parameter containing:
 tensor([[ 2.0481e-01, -3.5641e-02, -2.8826e-01, -2.4427e-01, -2.5293e-01,
           5.5174e-02,  1.5537e-01,  7.5775e-02, -1.0442e-02, -8.7140e-02],
         [ 8.3952e-02, -3.0404e-01, -5.0348e-02, -1.8344e-01, -1.7903e-01,
          -6.9925e-02,  2.3370e-01,  1.8883e-01,  7.2590e-02,  1.4053e-01],
         [ 1.4514e-01,  1.3608e-01,  2.3757e-01,  2.5917e-01,  6.9273e-02,
          -4.6451e-02, -6.7264e-04, -1.1190e-01, -9.1599e-02,  1.3182e-01],
         [ 5.4460e-02, -1.3481e-02,  2.1853e-01, -1.4235e-01, -1.2561e-01,
          -2.4741e-01,  1.2656e-01,  1.4360e-01, -1.7623e-01, -2.6897e-03],
         [ 1.6779e-01, -1.0617e-01,  1.8345e-01,  1.6497e-01,  1.1530e-01,
           2.6983e-01, -3.1355e-01,  3.0751e-01, -2.0166e-01, -1.4336e-01],
         [-1.1560e-01,  5.0659e-03, -2.1951e-01, -1.5410e-02,  2.6934e-01,
           3.1324e-01,  1.4645e-02, -1.9744e-01,  1.9808e-01, -3.0895e-01],
         [ 2.7753e-01, -2.3856e-01,  7.3735e-02,  7.0507e-02, -2.7652e-

In [12]:
# Shapes only: a weight and a bias for each of the two linear layers.
[p.shape for p in model.parameters()]

[torch.Size([50, 10]), torch.Size([50]), torch.Size([1, 50]), torch.Size([1])]

In [13]:
class Model(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)

    def forward(self, x):
        """Compute ``l2(relu(l1(x)))``.

        Defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` stays in charge of invocation and registered
        forward hooks still run. ``model(x)`` continues to work unchanged.
        """
        return self.l2(F.relu(self.l1(x)))

In [14]:
# Same layer sizes as the hand-rolled version, now built on nn.Module; Model defines no
# __repr__ of its own, so the display below comes from the base class.
model = Model(10, 50, 1)
model

Model(
  (l1): Linear(in_features=10, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=1, bias=True)
)

In [15]:
# named_parameters() pairs every parameter with its dotted attribute path, e.g. 'l1.weight'.
list(model.named_parameters())

[('l1.weight',
  Parameter containing:
  tensor([[ 2.4020e-01,  8.9872e-02,  2.2179e-01, -4.9403e-02,  6.0188e-02,
           -8.2237e-02, -2.5875e-01,  5.8769e-02, -2.9027e-01, -1.1395e-01],
          [-1.0626e-01, -2.9756e-01,  8.2289e-02, -1.1705e-01, -8.8802e-02,
            3.7548e-03,  6.5087e-03, -3.0153e-01, -6.0792e-02, -2.6864e-01],
          [-9.5636e-02,  1.2715e-01, -2.9852e-01, -2.4876e-01, -2.4315e-01,
           -2.4582e-01, -2.1988e-01, -3.2580e-02,  2.0303e-01,  1.9304e-02],
          [ 2.5258e-01, -2.8663e-02,  1.7346e-01, -2.8764e-01, -9.1553e-02,
            5.6161e-02, -2.5793e-02, -3.1943e-02,  8.6642e-02, -2.5092e-01],
          [-1.8698e-01, -1.2135e-01,  9.9211e-02,  1.1476e-01,  2.8158e-01,
           -1.7823e-01, -7.1022e-02,  2.6572e-01,  2.4877e-01,  2.1743e-01],
          [-1.5481e-01,  2.2618e-02,  9.4337e-02,  1.5219e-01,  1.6154e-01,
            9.7116e-02,  1.5017e-01,  2.6325e-01,  3.8464e-02,  2.9264e-01],
          [-1.6648e-01, -3.0528e-01,  2.113

In [16]:
# named_children() lists the direct submodules by attribute name.
list(model.named_children())

[('l1', Linear(in_features=10, out_features=50, bias=True)),
 ('l2', Linear(in_features=50, out_features=1, bias=True))]

In [17]:
# named_modules() also includes the model itself, under the empty name ''.
list(model.named_modules())

[('',
  Model(
    (l1): Linear(in_features=10, out_features=50, bias=True)
    (l2): Linear(in_features=50, out_features=1, bias=True)
  )),
 ('l1', Linear(in_features=10, out_features=50, bias=True)),
 ('l2', Linear(in_features=50, out_features=1, bias=True))]

In [18]:
# A submodule assigned in __init__ stays reachable as an ordinary attribute.
model.l1

Linear(in_features=10, out_features=50, bias=True)

In [19]:
# Keep just the name from each (name, parameter) pair of the first layer.
[k[0] for k in list(model.l1.named_parameters())]

['weight', 'bias']

In [20]:
class Model(nn.Module):
    """Applies a list of layers in order, registering each one by hand.

    Args:
        layers: iterable of modules. The object is stored as-is on
            ``self.layers`` and each element is additionally registered under
            the name ``layer_<i>`` so that its parameters are tracked.
    """

    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        # A plain Python list is not tracked by nn.Module, so every layer is
        # registered explicitly.
        for i, layer in enumerate(self.layers):
            self.add_module(f'layer_{i}', layer)

    def forward(self, x):
        """Feed ``x`` through every layer in sequence and return the result.

        Defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` stays in charge of invocation and registered
        forward hooks still run. ``model(x)`` continues to work unchanged.
        """
        for layer in self.layers:
            x = layer(x)
        return x

In [21]:
# Two stacked linear layers (10 -> 50 -> 1) handed to the list-based Model; the display
# shows them registered as layer_0 and layer_1.
layers = [nn.Linear(10, 50), nn.Linear(50, 1)]
model = Model(layers)
model

Model(
  (layer_0): Linear(in_features=10, out_features=50, bias=True)
  (layer_1): Linear(in_features=50, out_features=1, bias=True)
)

In [22]:
class Model(nn.Module):
    """Applies a list of layers in order, holding them in an ``nn.ModuleList``.

    Args:
        layers: iterable of modules; wrapping them in ``nn.ModuleList``
            registers each one without manual ``add_module`` calls.
    """

    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        """Feed ``x`` through every layer in sequence and return the result.

        Defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` stays in charge of invocation and registered
        forward hooks still run. ``model(x)`` continues to work unchanged.
        """
        for layer in self.layers:
            x = layer(x)
        return x

In [23]:
# Reuses the `layers` list from the earlier cell; the display now nests them under a ModuleList.
model = Model(layers)
model

Model(
  (layers): ModuleList(
    (0): Linear(in_features=10, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [24]:
# nn.Sequential is PyTorch's built-in container for applying modules one after another.
nn.Sequential(*layers)

Sequential(
  (0): Linear(in_features=10, out_features=50, bias=True)
  (1): Linear(in_features=50, out_features=1, bias=True)
)

# Optimizer

In [25]:
class optimizer:
    """Minimal stochastic-gradient-descent optimizer.

    Args:
        parameters: iterable of tensors to update; materialised into a list so
            a generator such as ``model.parameters()`` can be traversed on
            every call.
        lr: learning rate that scales each gradient step.
    """

    def __init__(self, parameters, lr):
        self.parameters = list(parameters)
        self.lr = lr

    def step(self):
        """Apply ``p -= lr * p.grad`` in place to every parameter that has a gradient."""
        # no_grad: the update itself must not be recorded by autograd.
        with torch.no_grad():
            for p in self.parameters:
                if p.grad is None:
                    # Parameter took no part in the last backward pass.
                    continue
                p -= self.lr * p.grad

    def zero_grad(self):
        """Reset every existing gradient to zero in place."""
        with torch.no_grad():
            for p in self.parameters:
                if p.grad is not None:
                    p.grad.zero_()

# Dataset & Dataloader

In [26]:
class Dataset:
    """Pairs an input collection ``x`` with a label collection ``y``.

    Indexing is forwarded to both collections, so whatever index type they
    accept (integer, slice, ...) works here too.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        """Number of items, taken from ``x``."""
        return len(self.x)

    def __getitem__(self, i):
        """Return the ``(x[i], y[i])`` pair."""
        inputs = self.x[i]
        labels = self.y[i]
        return inputs, labels

In [27]:
# get_data() returns a lazy map object; unpacking consumes its four tensors in order.
x_train, y_train, x_valid, y_valid = get_data()

In [28]:
# Pair inputs with labels for the training and validation splits.
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)

In [29]:
# Sanity check: each Dataset reports as many items as its input tensor has rows.
assert len(train_ds) == x_train.shape[0]
assert len(valid_ds) == x_valid.shape[0]

In [30]:
# Slicing the dataset slices x and y together, giving the first five inputs and labels.
xs, ys = train_ds[:5]
xs, ys

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([5, 0, 4, 1, 9]))

In [31]:
class Dataloader:
    """Iterates a dataset in consecutive slices of ``bs`` items, with no shuffling."""

    def __init__(self, ds, bs):
        self.ds, self.bs = ds, bs

    def __iter__(self):
        """Yield ``ds[start:start + bs]`` for each batch start; the last batch may be shorter."""
        for start in range(0, len(self.ds), self.bs):
            stop = start + self.bs
            yield self.ds[start:stop]

In [32]:
# Unshuffled loaders that hand out consecutive batches of 20 samples.
train_dl = Dataloader(train_ds, 20)
valid_dl = Dataloader(valid_ds, 20)

In [33]:
# Pull just the first batch from a fresh iterator.
next(iter(train_dl))

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9]))

In [34]:
class Sampler:
    """Yields batches of dataset indices, optionally in random order.

    Args:
        ds: any object supporting ``len()``; only its length is used.
        bs: number of indices per batch.
        shuffle: when true, a fresh random permutation is drawn on every
            iteration; otherwise indices run in ascending order.
    """

    def __init__(self, ds, bs, shuffle=True):
        self.ds, self.bs, self.shuffle = ds, bs, shuffle

    def __iter__(self):
        """Rebuild ``self.idxs`` and yield it in consecutive chunks of ``bs``."""
        if self.shuffle:
            self.idxs = torch.randperm(len(self.ds))
        else:
            self.idxs = torch.arange(len(self.ds))
        for start in range(0, len(self.ds), self.bs):
            stop = start + self.bs
            yield self.idxs[start:stop]

In [35]:
# The first five samples again, as a reference point for the index batches that follow.
train_ds[:5]

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([5, 0, 4, 1, 9]))

In [36]:
# Shuffled sampler: every item is a tensor of 5 randomly drawn indices.
# The comprehension exhausts the whole sampler before the first 10 batches are sliced off.
train_sampler = Sampler(train_ds, 5, shuffle=True)
[o for o in train_sampler][:10]

[tensor([34983, 49936, 33344, 35268, 32597]),
 tensor([34399, 44867, 36841, 41903, 36888]),
 tensor([28704, 29728, 10027, 34329, 35618]),
 tensor([23584,  6719, 33730, 49827, 25957]),
 tensor([35690, 49254, 31749, 25510, 41645]),
 tensor([22681, 32009, 39069, 43777,  1270]),
 tensor([29216, 40426, 31903,  4805, 24899]),
 tensor([35631, 39481,  7338, 15050, 14597]),
 tensor([13889,   856, 28100,  2532,  9595]),
 tensor([ 8392,  2059, 12083, 16855,  2615])]

In [37]:
# With shuffle=False the index batches come out in ascending order.
train_sampler = Sampler(train_ds, 5, shuffle=False)
[o for o in train_sampler][:10]

[tensor([0, 1, 2, 3, 4]),
 tensor([5, 6, 7, 8, 9]),
 tensor([10, 11, 12, 13, 14]),
 tensor([15, 16, 17, 18, 19]),
 tensor([20, 21, 22, 23, 24]),
 tensor([25, 26, 27, 28, 29]),
 tensor([30, 31, 32, 33, 34]),
 tensor([35, 36, 37, 38, 39]),
 tensor([40, 41, 42, 43, 44]),
 tensor([45, 46, 47, 48, 49])]

In [38]:
def collate(b):
    """Turn a sequence of ``(x, y)`` tensor pairs into one stacked ``(xs, ys)`` pair.

    Both halves are combined with ``torch.stack``, which adds a new leading
    dimension.
    """
    inputs, targets = zip(*b)
    stacked_inputs = torch.stack(inputs)
    stacked_targets = torch.stack(targets)
    return stacked_inputs, stacked_targets

In [39]:
class Dataloader:
    """Builds batches by looking up the indices produced by a sampler.

    Args:
        ds: indexable dataset; ``ds[i]`` is looked up for every sampled index.
        sampler: iterable that yields one collection of indices per batch.
        bs: batch size. It is stored on the instance but not used while
            iterating; the batch size is whatever the sampler yields.
        collate_fn: callable that turns the list of looked-up items into a batch.
    """

    def __init__(self, ds, sampler, bs, collate_fn=collate):
        self.ds = ds
        self.bs = bs
        self.sampler = sampler
        self.collate = collate_fn

    def __iter__(self):
        """Yield one collated batch per index batch from the sampler."""
        # Use the sampler held by this instance. The bare name `sampler` would
        # resolve to a module-level global, or raise NameError if none exists.
        for s in self.sampler:
            yield self.collate([self.ds[i] for i in s])

In [40]:
from torch.utils.data import RandomSampler, SequentialSampler

In [41]:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        epochs: number of passes over ``train_dl``.
        model: module to optimise; called as ``model(xb)``.
        loss_func: callable ``loss_func(predictions, targets)`` returning a scalar.
        opt: optimizer exposing ``step()`` and ``zero_grad()``.
        train_dl: iterable of ``(xb, yb)`` training batches.
        valid_dl: iterable of ``(xb, yb)`` validation batches.

    Returns:
        ``(mean_loss, mean_accuracy)`` over the validation set for the last epoch.

    The validation averages are weighted by batch size and divided by the
    number of samples actually drawn from ``valid_dl``, so the result does not
    depend on a global ``valid_ds`` being in scope or matching ``valid_dl``.
    """
    for epoch in range(epochs):
        # train() / eval() switch layers such as batchnorm and dropout
        # between their training and inference behaviour.
        model.train()
        for xb, yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

        model.eval()
        with torch.no_grad():
            tot_loss, tot_acc, nv = 0., 0., 0
            for xb, yb in valid_dl:
                pred = model(xb)
                # Weight by batch size so a shorter final batch is not over-counted.
                tot_loss += loss_func(pred, yb) * len(xb)
                tot_acc += accuracy(pred, yb) * len(xb)
                nv += len(xb)
        # An empty validation loader leaves both totals at zero; avoid dividing by zero.
        nv = max(nv, 1)
        print(epoch, tot_loss / nv, tot_acc / nv)
    return tot_loss / nv, tot_acc / nv

In [42]:
# MLP for the loaded data: one input per column of x_train, 50 hidden units, 10 outputs.
layers = [nn.Linear(x_train.shape[1], 50), nn.ReLU(), nn.Linear(50, 10)]
model = Model(layers)
model

Model(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [43]:
# Two epochs of plain SGD. Here `opt` is the torch.optim module alias from the imports cell;
# inside fit() the parameter of the same name refers to the optimizer instance passed in.
fit(2, model, F.cross_entropy, opt.SGD(model.parameters(), lr=.5), train_dl, valid_dl)

0 tensor(0.2036) tensor(0.9387)
1 tensor(0.1888) tensor(0.9452)


(tensor(0.1888), tensor(0.9452))

# Learner

In [44]:
from torch.utils.data import DataLoader

In [45]:
class DataBunch:
    """Bundles a training loader, a validation loader and an optional ``c`` value.

    Args:
        train_dl: loader for the training set; must expose ``.dataset``.
        valid_dl: loader for the validation set; must expose ``.dataset``.
        c: extra value stored as-is (defaults to ``None``).
    """

    def __init__(self, train_dl, valid_dl, c=None):
        self.train_dl, self.valid_dl, self.c = train_dl, valid_dl, c

    @property
    def train_ds(self):
        """Dataset behind the training loader."""
        return self.train_dl.dataset

    @property
    def valid_ds(self):
        """Dataset behind the validation loader."""
        return self.valid_dl.dataset

In [46]:
def get_dls(train_ds, valid_ds, bs, **kwargs):
    """Build a ``(train, valid)`` pair of PyTorch ``DataLoader`` objects.

    The training loader shuffles and uses batch size ``bs``; the validation
    loader keeps the original order and uses ``bs * 2``. Extra keyword
    arguments are forwarded to both loaders.
    """
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True, **kwargs)
    valid_loader = DataLoader(valid_ds, batch_size=bs * 2, shuffle=False, **kwargs)
    return train_loader, valid_loader

In [47]:
# bs: batch size, nh: hidden units, c: largest label value plus one
# (used as the class count; assumes labels start at 0 -- TODO confirm).
bs = 64
nh = 50
c = y_train.max().item() + 1
c

10

In [48]:
# Wrap both loaders in a DataBunch (c keeps its default None) and check that the
# datasets reachable through it have the expected lengths.
data = DataBunch(*get_dls(train_ds, valid_ds, bs))
assert len(train_ds) == len(data.train_ds)
assert len(valid_ds) == len(data.valid_ds)

In [49]:
class Learner:
    """Plain container grouping a model, its optimizer, a loss function and the data."""

    def __init__(self, model, opt, loss_func, data):
        self.model, self.opt = model, opt
        self.loss_func, self.data = loss_func, data

In [50]:
# The hidden width is nh on both sides of the ReLU, so the two linear layers stay
# compatible when nh is changed.
layers = [nn.Linear(x_train.shape[1], nh), nn.ReLU(), nn.Linear(nh, c)]

In [51]:
def get_model(layers, lr):
    """Chain ``layers`` into an ``nn.Sequential`` and pair it with an SGD optimizer.

    Returns:
        ``(model, optimizer)`` where the optimizer updates the model's
        parameters with learning rate ``lr``.
    """
    net = nn.Sequential(*layers)
    return net, opt.SGD(net.parameters(), lr=lr)

In [52]:
# get_model returns (model, optimizer); unpacking it fills Learner's first two arguments.
learner = Learner(*get_model(layers, lr=.5), 
                  F.cross_entropy,
                  data)

In [53]:
def fit(epochs, learner):
    '''Train ``learner.model`` for ``epochs`` epochs and print validation metrics.

    Each epoch runs one optimisation pass over ``learner.data.train_dl``
    followed by a gradient-free pass over ``learner.data.valid_dl``. The
    printed loss and accuracy are batch-size-weighted averages over
    ``learner.data.valid_ds``. Nothing is returned.
    '''
    for epoch in range(epochs):
        # --- training pass ---
        learner.model.train()
        for inputs, targets in learner.data.train_dl:
            preds = learner.model(inputs)
            loss = learner.loss_func(preds, targets)
            loss.backward()
            learner.opt.step()
            learner.opt.zero_grad()

        # --- validation pass ---
        learner.model.eval()
        with torch.no_grad():
            tot_acc, tot_loss = 0., 0.
            for inputs, targets in learner.data.valid_dl:
                batch_len = len(inputs)
                preds = learner.model(inputs)
                loss = learner.loss_func(preds, targets)
                # Weight by batch size so a shorter final batch is not over-counted.
                tot_acc += (accuracy(preds, targets) * batch_len)
                tot_loss += (loss * batch_len)
        n_valid = len(learner.data.valid_ds)
        print(
            f'Epoch {epoch} : Loss = {tot_loss / n_valid:.2}, Accuracy = {tot_acc / n_valid:.2%}'
        )

In [54]:
# Train for four epochs; fit() prints the validation loss and accuracy after each one.
fit(4, learner)

Epoch 0 : Loss = 0.16, Accuracy = 95.22%
Epoch 1 : Loss = 0.13, Accuracy = 96.43%
Epoch 2 : Loss = 0.19, Accuracy = 93.73%
Epoch 3 : Loss = 0.11, Accuracy = 96.61%
