In [1]:
#from exp.helper_methods import *
#from exp.data_loader import *
#from exp.neural import *

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In Command mode press `h` to see all shortcuts

## data_loader

In [3]:
# %load_ext autoreload
# %autoreload 2
# %matplotlib inline

In [4]:
#export
from fastai import datasets
import pickle
import gzip
from torch import tensor

In [5]:
#export
urls = {"MNIST_URL" : 'http://deeplearning.net/data/mnist/mnist.pkl'}

In [6]:
#export
url = urls["MNIST_URL"]
def get_data():
    path = datasets.download_data(url, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

In [7]:
#export
def normalize(x, m, s): return (x-m)/s

In [8]:
#!python notebook2script.py data_loader.ipynb

## Neural

In [9]:
# %load_ext autoreload
# %autoreload 2
# %matplotlib inline

In [137]:
#export
class Neural():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out
    
    def forward(self): raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)

In [138]:
#export
class Relu(Neural):
    def forward(self, inp): return inp.clamp_min(0.)-0.5
    def bwd(self, out, inp): inp.g = (inp>0).float() * out.g

In [139]:
#export
class Lin(Neural):
    def __init__(self, w, b): self.w,self.b = w,b
        
    def forward(self, inp): return inp@self.w + self.b
    
    def bwd(self, out, inp):
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        #self.w.g = torch.einsum("bi,bj->ij", inp, out.g)
        self.b.g = out.g.sum(0)

In [140]:
#export
class Mse(Neural):
    def forward (self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ): inp.g = 2*(inp.squeeze()-targ).unsqueeze(-1) / targ.shape[0]

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$ 

Cross entropy loss

$$ -\sum x\, \log p(x) $$

 There is a way to compute the log of the sum of exponentials in a more stable way, called the [LogSumExp trick](https://en.wikipedia.org/wiki/LogSumExp). The idea is to use the following formula:

$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$

In [141]:
#export
# class LogSoftmax(Neural):
    
#     def logsumexp(inp):
#         m = inp.max(-1)[0]
#         return m + (inp-m[:,None]).exp().sum(-1).log()
    
#     def forward(self, inp): return inp - inp.logsumexp(-1,keepdim=True)
    
#     def bwd(self, out, inp): 
#         raise Exception('LogSoftmax bwd not yet implemented')
#         inp.g = (inp>0).float() * out.g
        
    

In [170]:
class SoftmaxCrossEntropy(Neural):
    
    def softmax_forward(self, inp): 
        m = inp.max(-1)[0]
        exps = (inp-m[:, None]).exp()
        return exps / exps.sum(-1,keepdim=True)
    
    def cross_entropy_forward(self, inp, target):
         return -inp[range(targ.shape[0]), targ].mean()
    
    def forward(self, inp_softmax, targ):
        inp = self.softmax_forward(inp_softmax)
        return cross_entropy_forward(self, inp, target)
    
    def bwd(self, out, inp, targ): 
        print(out.shape, out[:5])
        print(inp.shape, inp[:5])
        print(targ.shape, targ[:5])
        raise Exception("Not yet implemented")
        one_hot = torch.zeros()
        one_hot[torch.arange(4), labels] = 1
        inp.g = (out-y[:, None])/ targ.shape[0]

In [171]:
#!python notebook2script.py neural.ipynb

## Model

### Origin model

In [172]:
#export
import math
import torch

In [146]:
x_train,y_train,x_valid,y_valid = get_data()

In [147]:
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
# NB: Use training, not validation mean for validation set
x_valid = normalize(x_valid, train_mean, train_std)

In [148]:
n,m = x_train.shape
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [149]:
# num hidden
nh = 50

In [150]:
# simplified kaiming init / he init
#784, 50
w1 = torch.randn(m,nh)*math.sqrt(2./m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)*math.sqrt(2./nh)
b2 = torch.zeros(1)

In [151]:
class Model():
    def __init__(self):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()
        
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)
    
    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

In [152]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [153]:
%time loss = model(x_train, y_train); loss

CPU times: user 129 ms, sys: 863 µs, total: 130 ms
Wall time: 25.6 ms


tensor(29.0779)

In [154]:
%time model.backward()

CPU times: user 210 ms, sys: 37.9 ms, total: 248 ms
Wall time: 43.6 ms


In [155]:
learning_rate = 0.0001
epochs = 5

In [156]:
for epoch in range(epochs):
    parameters = [w1,b1,w2,b2]
    for parameter in parameters:
        parameter-=learning_rate*parameter.g
    loss = model(x_train, y_train)
    print(loss)

tensor(27.3300)
tensor(25.6805)
tensor(24.1284)
tensor(22.6724)
tensor(21.3106)


### Softmax model

In [173]:
#export
import math
import torch

In [174]:
x_train,y_train,x_valid,y_valid = get_data()

In [175]:
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
# NB: Use training, not validation mean for validation set
x_valid = normalize(x_valid, train_mean, train_std)

In [176]:
n,m = x_train.shape
n,m = x_train.shape
c = y_train.max()+1
# num hidden
nh = 50

In [177]:
# simplified kaiming init / he init
#784, 50
w1 = torch.randn(m,nh)*math.sqrt(2./m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,10)*math.sqrt(2./nh)
b2 = torch.zeros(1)

In [178]:
class Model():
    def __init__(self):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = SoftmaxCrossEntropy()
        
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)
    
    def forward(self, x):
        for l in self.layers: x = l(x)
        return x
    
    def backward(self, y_train):
        self.layers[-1].backward(y_train)
        for l in reversed(self.layers[:-1]): l.backward()
    
    def accuracy(self, pred, yb): return (torch.argmax(pred, dim=1)==yb).float().mean()
    

In [179]:
a =[1, 2, 3 ,4]
a[:-1]

[1, 2, 3]

In [164]:
w1.g,b1.g,w2.g,b2.g = [None]*4

In [165]:
model = Model()

In [166]:
pred = model.forward(x_train); pred.shape

torch.Size([50000, 10])

In [167]:
loss = model(x_train, y_train); loss

tensor(-0.0957)

In [168]:
y_train.shape

torch.Size([50000])

In [117]:
model.backward(y_train)

TypeError: backward() takes 1 positional argument but 2 were given

In [None]:
model.accuracy(pred, y_train)

In [84]:
bs = 10000    # batch size
lr = 0.5   # learning rate
epochs = 1 # how many epochs to train for

In [85]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        loss = model(xb, yb)
        model.backward()
        
        parameters = [w1,b1,w2,b2]
        for parameter in parameters:
            parameter-=learning_rate*parameter.g
            
        
        loss = model(x_train, y_train)
        sm_pred = model.forward(x_train)
        print()
        print("accuracy", accuracy(x_train, y_train))
        print("loss", loss)
        

TypeError: bwd() argument after ** must be a mapping, not tuple