# The Forward and Backward Passes

In [62]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
from exp.nb_matmul import *
import torch.nn
import math

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
#export
from exp.nb_matmul import *
import torch.nn
import math

### Loading Data and Normalization

In [106]:
x_train, y_train, x_valid, y_valid = get_data()

In [107]:
x_train, y_train, x_valid, y_valid = map(tensor, (x_train, y_train, x_valid, y_valid))

In [108]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

In [9]:
#export
def normalize(x, mean, std): return (x - mean)/std

In [141]:
x_train = normalize(x_train, x_train.mean(), x_train.std())
x_valid = normalize(x_valid, x_train.mean(), x_train.std())

In [11]:
#export
def test_near_zero(x): assert x < 1e-3

In [12]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

It is important to note that x_valid is normalized with x_train stats so it will not have mean of zero or std of 1

In [13]:
x_valid.mean(), x_valid.std()

(tensor(0.1286), tensor(0.3050))

## Why do we want normal stats at each layer?

This is what happens when you don't do that

In [33]:
x_train_raw, _ , _ , _ = get_data()

(0.13044983, 0.3072898)

In [39]:
x_train_raw = tensor(x_train_raw)
x_train_raw.mean(), x_train_raw.std()

(tensor(0.1304), tensor(0.3073))

In [40]:
# initializing parameters
w1 = torch.randn(784, 50)
w2 = torch.randn(50, 50)
b = torch.zeros(50)

In [41]:
def linear(x,w,b): return x@w + b

In [44]:
def model(x): #simple model of 5 linear layers
    l1 = linear(x_train_raw, w1, b1)
    l2 = linear(l1, w2, b2)
    l3 = linear(l2, w2, b2)
    l4 = linear(l3, w2, b2)
    out = linear(l4, w2, b2)
    return out
    

In [45]:
out = model(x_train_raw)
out.mean(), out.std()

(tensor(1033.7676), tensor(33912.4062))

We can see here than the mean and variance are already huge even in just a 5 layer network so clearly this isn't going to work it will be impossible to compute gradients in a deeper network

## Solution: Kaiming Initialization

In [50]:
# initializing parameters
w1 = torch.randn(784, 50) * math.sqrt(1/50)
w2 = torch.randn(50, 50) * math.sqrt(1/50)
b = torch.zeros(50)

In [51]:
out = model(x_train_raw)
out.mean(), out.std()

(tensor(-0.0184), tensor(1.1432))

These values are much closer 0 mean and 1 std, note that actual kaiming initialization is also multiplied by sqrt(2) to account for relu 

## Forward Pass of Model

In [172]:
nh = 50 #number of hidden layers

In [173]:
n, m = x_train.shape
c = y_train.max() + 1
m, n, c

(784, 50000, tensor(10))

In [256]:
#export
from torch.nn import init

In [257]:
def relu(x): return x.clamp_min(0.)

In [270]:
w1 = torch.zeros(m, nh)
init.kaiming_normal_(w1, mode='fan_out') #good initialization for relu
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1) / math.sqrt(nh)
b2 = torch.zeros(1)

In [340]:
def model(x, stats=False):
    l1 = linear(x, w1, b1)
    l2 = relu(l1)
    out = linear(l1, w2, b2)
    if stats == True:
        print_stats(x)
        print_stats(l1)
        print_stats(l2)
        print_stats(out)
        
    return out

In [274]:
#export
def print_stats(x):
    print(x.mean(),x.std())

In [276]:
out = model(x_train) #mean and std looks good at every layer!

tensor(0.0001) tensor(1.)
tensor(-0.0085) tensor(1.3826)
tensor(0.5389) tensor(0.8097)
tensor(0.4381) tensor(1.0625)


## Backward Pass
now our model is making predictions we need to compute the gradients to improve performance we will use mse as our loss function

In [323]:
#export
def mse(pred, true): return(pred.squeeze(-1)-true).pow(2).mean()

In [324]:
mse(out, y_train)

tensor(24.5303)

In [335]:
def mse_grad(inp, targ):
    inp.g =  2 * (inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0]

In [336]:
def relu_grad(inp, out):
    inp.g = (inp > 0).float() * out.g

In [337]:
def linear_grad(inp, out, w, b):
    inp.g = out.g @ w.t()
    w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
    b.g = out.g.sum(0)

In [341]:
def forward_and_backward(inp, targ):
    l1 = linear(inp, w1, b1)
    l2 = relu(l1)
    out = linear(l2, w2, b2)
    
    mse_grad(out, y_train)
    linear_grad(l2, out, w2, b2)
    relu_grad(l1, l2)
    linear_grad(inp, l1, w1, b1)

In [342]:
forward_and_backward(x_train, y_train)

In [343]:
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig  = x_train.g.clone()

In [596]:
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [597]:
def forward(inp, targ):
    # forward pass:
    l1 = inp @ w12 + b12
    l2 = relu(l1)
    out = l2 @ w22 + b22
    # we don't actually need the loss in backward!
    return mse(out, targ)

In [598]:
loss = forward(xt2, y_train)


In [599]:
loss.backward()

In [6]:
#export
def test_near(a,b): return torch.allclose(a,b, rtol=1e-3)

In [362]:
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig)

True

## Refactoring Layers as Classes

In [460]:
class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clamp_min(0.)
        return self.out
    
    def backward(self):
        self.inp.g = (self.inp > 0).float() * out.g
        #print(self.inp.g)

In [461]:
class Lin():
    def __init__(self, w, b):
        self.w = w
        self.b = b
    def __call__(self, inp):
        self.inp = inp
        self.out = inp@self.w + self.b
        return self.out
    def backward(self):
#         print('inp', self.inp.shape)
#         print('w', self.w.shape)
#         print('out', self.out.shape)
        self.inp.g = self.out.g @ self.w.t()
        self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
        self.b.g = self.out.g.sum(0)

In [462]:
class Mse():
    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = (inp.squeeze() - targ).pow(2).mean()
        return self.out
    def backward(self):
        self.inp.g = 2. * (self.inp.squeeze() - self.targ).unsqueeze(-1) / self.targ.shape[0]

In [463]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()
        
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)
    
    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

In [484]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model(w1, b1, w2, b2)

In [485]:
loss = model(x_train, y_train)

In [486]:
model.backward()

In [487]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

True

## Refactoring to Inherit from Module

In [579]:
class Module():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out
    def forward(self): raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)

In [580]:
class Relu(Module):
    def forward(self, inp): return inp.clamp_min(0.)
    def bwd(self, out, inp): inp.g = (inp > 0).float() * out.g

In [581]:
class Mse(Module):
    def forward(self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ): inp.g =  2. * (inp.squeeze() - targ).unsqueeze(-1) / targ.shape[0]

In [582]:
class Lin(Module):
    def __init__(self, w, b): self.w, self.b = w, b
    def forward(self, inp):return inp@self.w + self.b
    def bwd(self, out, inp):
        inp.g = out.g @ self.w.t()
        self.w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
        self.b.g = out.g.sum(0)

In [583]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)
    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

In [607]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model(w1, b1, w2, b2)

In [608]:
%time loss = model(x_train, y_train)

CPU times: user 106 ms, sys: 8.18 ms, total: 115 ms
Wall time: 93.5 ms


In [609]:
model.backward()

In [610]:
test_near(w22.grad, w2.g)
test_near(b22.grad, b2.g)
test_near(w12.grad, w1.g)
test_near(b12.grad, b1.g)

True

## Conclusion

Now we have implemented forward and backward pass and our code looks pretty clean!

In [10]:
!python nb2script.py fully_connected.ipynb

Converted fully_connected.ipynb to exp/nb_fully.py
