This is a minimal replication o the 02_fully_connected notebook as "assigned" on the deep learning from the foundations MOOC from fast.ai

### Read and normalize data

In [1]:
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor

MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

In [2]:
data_path = datasets.download_data(url=MNIST_URL, fname="C:/Users/luizg/Documents/dl-projects/fast_ai_dl2/data/mninst.pkl", ext=".gz")

with gzip.open(data_path, "rb") as f: 
    ((x_train, y_train), (x_test, y_test), _) = pickle.load(f, encoding="latin-1")
    
x_train, y_train, x_test, y_test = map(tensor, (x_train, y_train, x_test, y_test))

In [3]:
x_train.shape

torch.Size([50000, 784])

In [4]:
x_test.shape

torch.Size([10000, 784])

In [5]:
avg = x_train.mean()
std = x_train.std()
def normalize(x, avg, std): 
    x_n = (x - avg) / std
    
    return x_n

In [6]:
x_train = normalize(x_train, avg, std)
x_test = normalize(x_test, avg, std)

In [7]:
x_train.mean()

tensor(-7.6999e-06)

In [8]:
x_train.std()

tensor(1.)

### Basic architecture

In [9]:
nh = 50
r, c = x_train.shape

#### Initializing the hidden layers with kaiming initialization

In [10]:
w1 = torch.randn(c, nh)*math.sqrt(2/c)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)*math.sqrt(2/nh)
b2 = torch.zeros(1)

In [11]:
w1.shape

torch.Size([784, 50])

In [12]:
w2.shape

torch.Size([50, 1])

In [13]:
w1.mean()

tensor(-0.0001)

In [14]:
w2.mean()

tensor(-0.0274)

In [15]:
w1.std()

tensor(0.0503)

#### Linear pass

In [16]:
def linear(x, w, b): 
    return x@w + b

In [17]:
l1 = linear(x_train, w1, b1)

In [18]:
l1

tensor([[ 0.9047, -1.3058, -0.4775,  ...,  1.1861, -0.8319, -0.0486],
        [ 0.6392,  0.8472, -1.3464,  ...,  1.1617, -1.0857, -1.8533],
        [ 0.4398, -0.2592, -0.2089,  ...,  1.3682,  0.9762,  3.1267],
        ...,
        [ 0.9223, -0.2946,  0.3793,  ..., -0.2602, -0.7335,  0.2958],
        [-0.0033, -0.0175, -0.6370,  ...,  1.5102, -1.2366, -0.3035],
        [-1.3531, -1.6820, -1.2544,  ..., -0.8238, -0.2311,  0.1895]])

In [19]:
l1.shape

torch.Size([50000, 50])

#### Relu activation

In [20]:
def relu(x): 
    return torch.clamp(x, min=0)

In [21]:
l1_relu = relu(l1)

In [22]:
l1_relu

tensor([[0.9047, 0.0000, 0.0000,  ..., 1.1861, 0.0000, 0.0000],
        [0.6392, 0.8472, 0.0000,  ..., 1.1617, 0.0000, 0.0000],
        [0.4398, 0.0000, 0.0000,  ..., 1.3682, 0.9762, 3.1267],
        ...,
        [0.9223, 0.0000, 0.3793,  ..., 0.0000, 0.0000, 0.2958],
        [0.0000, 0.0000, 0.0000,  ..., 1.5102, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1895]])

#### Creating a simple model

In [23]:
def model(x, w1, b1, w2, b2): 
    l1 = linear(x, w1, b1)
    l1_relu = relu(l1)
    l2 = linear(l1_relu, w2, b2)
    return l2

In [24]:
%timeit -n 10 _=model(x_train, w1, b1, w2, b2)

54.1 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
l2 = model(x_train, w1, b1, w2, b2)

In [26]:
l2.shape

torch.Size([50000, 1])

In [27]:
l2

tensor([[-2.4830],
        [-1.8173],
        [-1.0738],
        ...,
        [-0.2930],
        [-0.0398],
        [ 0.0679]])

### MSE

In [28]:
y_train

tensor([5, 0, 4,  ..., 8, 4, 8])

In [29]:
y_train.shape

torch.Size([50000])

In [30]:
l2.shape

torch.Size([50000, 1])

In [31]:
l2.squeeze(-1)

tensor([-2.4830, -1.8173, -1.0738,  ..., -0.2930, -0.0398,  0.0679])

In [37]:
y_train = y_train.type(torch.FloatTensor)

In [38]:
y_train

tensor([5., 0., 4.,  ..., 8., 4., 8.])

In [39]:
#export
def mse(x, y): 
    return (x.squeeze(-1) - y).pow(2).mean()

In [40]:
mse(l2, y_train)

tensor(35.1839)

### Gradients and backwards pass

In [55]:
def mse_gradient(x, y): 
    x.g = 2. * (x.squeeze(-1) - y) / y.shape[0]

In [56]:
def relu_gradient(x, y): 
    x.g = (x > 0).float() * y.g

In [57]:
def linear_gradient(x, y, w, b): 
    x.g = y.g @ w.t()
    w.g = (x.unsqueeze(-1) * y.g.unsqueeze(1)).sum(0)
    b.g = y.g.sum(0)

In [58]:
def forward_backwards(x, y, w1, b1, w2, b2): 
    # Forwards
    l1 = linear(x, w1, b1)
    l1_relu = relu(l1)
    out = linear(l1_relu, w2,  b2)
    loss = mse(out, y)
    
    # Backwards
    mse_gradient(out, y)
    linear_gradient(l2, out, w2, b2)
    relu_gradient(l1, l1_relu)
    linear_gradient(x, l1, w1, b1)

In [59]:
l1_relu.unsqueeze(-1).shape

torch.Size([50000, 50, 1])

In [60]:
w2.t().shape

torch.Size([1, 50])

In [61]:
l2.shape

torch.Size([50000, 1])

In [62]:
forward_backwards(x_train, y_train, w1, b1, w2, b2)

RuntimeError: size mismatch, m1: [1 x 50000], m2: [1 x 50] at C:\w\1\s\tmp_conda_3.7_055457\conda\conda-bld\pytorch_1565416617654\work\aten\src\TH/generic/THTensorMath.cpp:752

## Refactor model

### Layers as classes

In [63]:
class Relu(): 
    def __call__(self, x): 
        self.x = x
        self.y = torch.clamp(self.x, min=0)
        return self.y
    
    def backwards(self): 
        self.x.g = (self.x > 0).float() * self.y.g

In [73]:
class Linear(): 
    def __init__(self, w, b): 
        self.w = w
        self.b = b
    
    def __call__(self, x): 
        self.x = x
        self.y = x@self.w + self.b
        return self.y
    
    def backwards(self): 
        self.x.g = self.y.g @ self.w.t()
        self.w.g = (self.x.unsqueeze(-1) * self.y.g.unsqueeze(1)).sum(0)
        self.b.g = self.y.g.sum(0)

In [76]:
class MSE(): 
    def __call__(self, x, y): 
        self.x = x
        self.y = y
        self.loss = (x.squeeze(-1) - y).pow(2).mean()
        return self.loss
    
    def backwards(self): 
        self.x.g = 2. * (self.x.squeeze(-1) - self.y) / self.y.shape[0]

In [82]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [Linear(w1, b1), Relu(), Linear(w2, b2)]
        self.loss = MSE()
        
    def __call__(self, x, y):
        for l in self.layers: x = l(x)
        return self.loss(x, y)
    
    def backward(self):
        self.loss.backwards()
        for l in reversed(self.layers): l.backwards()

In [83]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model(w1, b1, w2, b2)

In [84]:
%time loss = model(x_train, y_train)

Wall time: 60 ms


In [85]:
loss

tensor(35.1839)

In [None]:
%time model.backward()

### nn.Linear and nn.Module

In [None]:
#export
from torch import nn

In [None]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)]
        self.loss = mse
        
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), targ)

In [None]:
model = Model(m, nh, 1)

In [None]:
%time loss = model(x_train, y_train)

CPU times: user 85.1 ms, sys: 8.16 ms, total: 93.3 ms
Wall time: 46.3 ms


In [None]:
%time loss.backward()

CPU times: user 135 ms, sys: 78.1 ms, total: 213 ms
Wall time: 71.1 ms


## Export

In [None]:
!./notebook2script.py 02_fully_connected.ipynb

Converted 02_fully_connected.ipynb to nb_02.py
