In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
# Seed torch's RNG so the random tensors created below are reproducible.
torch.manual_seed(42)

# Display preferences: grayscale images and compact, wide numeric printing.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

path_data = Path('../data')
path_gz = path_data/'mnist.pkl.gz'
# NOTE(security): pickle.load can execute arbitrary code; only open archives
# obtained from a source you trust.
with gzip.open(path_gz, 'rb') as f:
    (x_train, y_train), (x_valid, y_valid), _ = pickle.load(f, encoding='latin-1')
# Convert the four loaded arrays to torch tensors.
x_train, y_train, x_valid, y_valid = (tensor(arr) for arr in (x_train, y_train, x_valid, y_valid))

# Model with linear layer

In [2]:
# Sanity check: dimensions of the training inputs and of the label vector.
x_train.shape, y_train.shape

(torch.Size([50000, 784]), torch.Size([50000]))

In [3]:
# Input width is read from the data; the hidden width is a free choice.
n_features = x_train.shape[1]
n_hidden_activations = 50
# Largest label value plus one. This remains a 0-dim tensor, not a Python int.
# Presumably labels are 0-based class indices -- TODO confirm.
n_labels = 1 + y_train.max()

In [77]:
# Layer 1 maps the input features to the hidden activations; layer 2 reduces
# them to a single output column. Weights are standard-normal samples and
# biases start at zero.
w_layer_1 = torch.randn((n_features, n_hidden_activations))
bias_layer_1 = torch.zeros((n_hidden_activations,))
w_layer_2 = torch.randn((n_hidden_activations, 1))
bias_layer_2 = torch.zeros((1,))

In [5]:
# Seven uniform samples shifted down by 0.5, so both signs are present.
a = torch.rand(7).sub(0.5)
a

tensor([ 0.48,  0.29, -0.32, -0.30,  0.24,  0.08, -0.15])

In [6]:
def relu(x):
    """Return a new tensor equal to `x` with every entry below zero replaced by 0."""
    rectified = x.clamp_min(0.)
    return rectified

In [7]:
def linear(x,weight,bias):
    """Affine map: matrix-multiply `x` by `weight`, then add `bias` (broadcast)."""
    projected = x @ weight
    return projected + bias

In [12]:
def model(x_batch):
    """Forward pass of the two-layer network: linear -> relu -> linear.

    Reads the module-level parameters `w_layer_1`, `bias_layer_1`,
    `w_layer_2` and `bias_layer_2`, and returns the second layer's output.
    """
    hidden = relu(linear(x_batch, w_layer_1, bias_layer_1))
    return linear(hidden, w_layer_2, bias_layer_2)

In [13]:
# Push the whole training set through the network and inspect the output shape.
res = model(x_batch=x_train)
res.shape

torch.Size([50000, 1])

# Loss function

In [14]:
# Give the targets a trailing unit axis so they line up with the predictions
# instead of broadcasting against them.
(res - y_train.unsqueeze(-1)).shape

torch.Size([50000, 1])

In [22]:
def mse(predictions, target):
    """Mean squared error between `predictions` and `target`.

    `target` receives a trailing unit axis before the subtraction, then the
    squared differences are averaged over every element.
    """
    residual = predictions - target[..., None]
    squared = residual.pow(2)
    return squared.mean()

In [15]:
def linear_grad(X,out,w,b):
    """Back-propagate through `out = X @ w + b`.

    Reads the upstream gradient from `out.gradient` and stores the resulting
    gradients as a `.gradient` attribute on `X`, `w` and `b`. Returns nothing.
    """
    upstream = out.gradient
    b.gradient = upstream.sum(0)      # bias is shared by every row, so sum over rows
    w.gradient = X.T @ upstream
    X.gradient = upstream @ w.T

In [114]:
def forward_and_backward(X,Y):
    """Run the two-layer network on `X`, then back-propagate the MSE loss by hand.

    Forward: linear -> relu -> linear, followed by the mean squared error
    against `Y` (which gets a trailing unit axis to match the output).
    Backward: every gradient is stored as a `.gradient` attribute on the
    tensor it belongs to -- the module-level weights and biases, and `X`.

    Returns:
        The scalar loss tensor.
    """
    # forward
    l1 = linear(X,w_layer_1,bias_layer_1)
    out1 = relu(l1)
    out = linear(out1,w_layer_2,bias_layer_2)
    diff = (out - Y[...,None])
    # `.mean` has to be *called*; the bare attribute is only a bound method.
    loss = diff.pow(2).mean()

    # backward
    # Derivative of mean(diff**2) w.r.t. `out`; the divisor is the row count,
    # which equals the element count when `out` has a single column.
    out.gradient = 2. * diff /X.shape[0]
    linear_grad(out1,out,w_layer_2,bias_layer_2)
    # ReLU passes the gradient through only where its output was positive.
    l1.gradient = (out1>0).float() * out1.gradient
    linear_grad(X,l1,w_layer_1,bias_layer_1)
    return loss

In [115]:
# One full manual pass over the training set; this attaches `.gradient`
# to the parameters and to x_train.
forward_and_backward(X=x_train, Y=y_train)

In [116]:
def get_grad(x):
    """Return an independent copy of the tensor stored in `x.gradient`."""
    stored = x.gradient
    return stored.clone()

In [117]:
# Tensors whose gradients get checked, in a fixed order that later cells reuse.
chunks = (w_layer_2, bias_layer_2, w_layer_1, bias_layer_1, x_train)
# Snapshot each manually computed gradient and give it a short name.
grads = tuple(get_grad(chunk) for chunk in chunks)
w2g, b2g, w1g, b1g, ig = grads

In [118]:
def make_grad(x):
    """Return a copy of `x` with gradient tracking switched on; `x` itself is untouched."""
    duplicate = x.clone()
    duplicate.requires_grad_(True)
    return duplicate
# Grad-enabled copies of the same tensors, in the same order as `chunks`.
ptgrads = tuple(make_grad(chunk) for chunk in chunks)
w22, b22, w12, b12, xt2 = ptgrads

In [119]:
def forward(X,y):
    """Forward pass plus MSE loss using the grad-enabled parameter copies.

    Uses the module-level `w12`, `b12`, `w22` and `b22`, and returns the
    scalar loss produced by `mse`.
    """
    hidden = relu(linear(X, w12, b12))
    predictions = linear(hidden, w22, b22)
    return mse(predictions, y)

In [120]:
# Feed the grad-enabled copy of the input (xt2) so autograd also records the
# gradient with respect to the inputs; passing x_train leaves xt2.grad as None.
loss = forward(xt2, y_train)
loss.backward()
# Compare every hand-computed gradient with the one autograd produced.
# `grads` and `ptgrads` were both built from `chunks`, so they line up.
for manual_grad, autograd_tensor in zip(grads, ptgrads):
    test_close(manual_grad, autograd_tensor.grad, eps=0.01)

### Refactoring into classes

In [121]:
class Relu():
    """ReLU layer that remembers its input and output for a manual backward pass."""

    def __call__(self,inp):
        """Clamp `inp` at zero from below, cache both tensors, and return the result."""
        self.inp = inp
        rectified = inp.clamp_min(0.0)
        self.out = rectified
        return rectified

    def backward(self):
        """Store on `self.inp.g` the upstream gradient `self.out.g`, zeroed where the input was not positive."""
        gate = (self.inp > 0.).float()
        self.inp.g = gate * self.out.g

In [122]:
class Linear():
    """Affine layer `inp @ w + b` that caches its input and output for a manual backward pass."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self,inp):
        """Apply the affine map to `inp`, cache input and output, and return the output."""
        self.inp = inp
        self.out = (inp @ self.w) + self.b
        return self.out

    def backward(self):
        """Read `self.out.g` and store gradients as `.g` on the input, the weight and the bias."""
        upstream = self.out.g
        self.b.g = upstream.sum(0)
        self.w.g = self.inp.T @ upstream
        self.inp.g = upstream @ self.w.T

In [123]:
class Mse():
    """Mean-squared-error loss that caches its arguments for a manual backward pass."""

    def __call__(self,inp,targ):
        """Cache `inp` and `targ`, compute the loss with `mse`, and return it."""
        self.inp, self.targ = inp, targ
        self.out = mse(inp, targ)
        return self.out

    def backward(self):
        """Store the gradient of the loss with respect to the predictions on `self.inp.g`."""
        n_samples = self.targ.shape[0]
        # Drop unit axes to subtract the targets, then restore a trailing axis.
        residual = (self.inp.squeeze() - self.targ).unsqueeze(-1)
        self.inp.g = 2. * residual / n_samples

In [124]:
class Model():
    """Two-layer network (Linear -> Relu -> Linear) with an Mse loss on top."""

    def __init__(self,w1,b1,w2,b2):
        first, second = Linear(w1,b1), Linear(w2,b2)
        self.layers = [first, Relu(), second]
        self.loss = Mse()

    def __call__(self,x,targ):
        """Pass `x` through each layer in order and return the loss against `targ`."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, targ)

    def backward(self):
        """Back-propagate: the loss first, then the layers from last to first."""
        for stage in (self.loss, *reversed(self.layers)):
            stage.backward()

In [125]:
# Wrap the existing parameter tensors in the class-based network.
# Note that this rebinds `model`, which previously named the function above.
model = Model(w1=w_layer_1, b1=bias_layer_1, w2=w_layer_2, b2=bias_layer_2)

In [126]:
# Confirm the first weight matrix is (input features, hidden activations).
w_layer_1.size()

torch.Size([784, 50])

In [127]:
# Forward pass through the class-based model; each layer caches what it needs for backward().
loss = model(x=x_train, targ=y_train)

In [128]:
# Back-propagate through the loss and then the layers in reverse order;
# the gradients are stored as `.g` on the parameters and on x_train.
model.backward()

In [129]:
# The class-based backward pass must reproduce the gradients saved earlier
# from the function-based pass.
gradient_pairs = (
    (w2g, w_layer_2.g),
    (b2g, bias_layer_2.g),
    (w1g, w_layer_1.g),
    (b1g, bias_layer_1.g),
    (ig, x_train.g),
)
for saved_grad, layer_grad in gradient_pairs:
    test_close(saved_grad, layer_grad, eps=0.01)