In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Download data

In [3]:
from exp.nb_01 import *

def downLoadMNIST():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

x_train,y_train,x_valid,y_valid = downLoadMNIST()


In [4]:
y_train,y_valid = y_train.float(),y_valid.float()

x_train.shape, y_train.shape

(torch.Size([50000, 784]), torch.Size([50000]))

## Normalize Initial data

In [5]:
def normalize(x, m, s): return (x-m)/s
x_train = normalize(x_train, x_train.mean(), x_train.std())
x_valid = normalize(x_valid, x_train.mean(), x_train.std())

## Linear layer function

In [6]:
def linLayer(act, w, b): # act is the activations, w is the weight matrix
    assert(act.shape[1] == w.shape[0]) # check if the no of columns in the activation matrix is the same as rows in the weigth matrix
    return act@w + b

## Relu function

In [7]:
def relu(x): return x.clamp_min(0.)

## Create Weight matrix for first linear layer

In [8]:
# size of weight matrix is x_train.shape[1] x the number of hidden nodes
# Use math.sqrt(m) for kaming init

nh = 10
m = x_train.shape[1]

#w1 = torch.randn(m, nh)/math.sqrt(m) # this is for a linear layer only
w1 = torch.randn(m, nh)*math.sqrt(2/m) # this is for linear layer ito relu 
b1 = torch.zeros(nh)

w1.mean(), w1.std()

(tensor(8.1553e-05), tensor(0.0508))

In [9]:
out = linLayer(x_train[0:100, :], w1, b1)
out.shape, out.mean(), out.std()

(torch.Size([100, 10]), tensor(0.2038), tensor(1.2957))

## Create Weight Matrix for second linear layer

In [10]:
w2 = torch.randn(nh, 1)/math.sqrt(m) # only one output node
b2 = torch.zeros(1)

## Loss Function

In [11]:
def mse(x, y):
    return (x.squeeze_(-1) - y).pow(2).mean()  

## Forward pass

In [12]:
t = linLayer(relu(linLayer(x_train[0:100, :], w1, b1)), w2, b2)

t1 = linLayer(x_train[0:100, :], w1, b1)
t2 = relu(t1)
t3 = linLayer(t2, w2, b2)


t.mean(), t3.mean(), t.std(), t.max(), t.min(), t.shape, t1.shape


(tensor(0.0946),
 tensor(0.0946),
 tensor(0.0561),
 tensor(0.2823),
 tensor(-0.0245),
 torch.Size([100, 1]),
 torch.Size([100, 10]))

In [13]:
loss = mse(t, y_train[0:100])
loss

tensor(26.4131)

In [14]:
y_train, t.squeeze(-1)

(tensor([5., 0., 4.,  ..., 8., 4., 8.]),
 tensor([ 0.2823,  0.1069,  0.0526,  0.1145,  0.1509,  0.0903,  0.0996,  0.1255,
          0.0978,  0.1270,  0.1656,  0.0176,  0.1006,  0.0573,  0.0471,  0.0964,
          0.1129,  0.1665,  0.0651,  0.1587, -0.0085,  0.0504,  0.1029,  0.1199,
          0.0489,  0.1126,  0.1159,  0.1140,  0.0895,  0.0425,  0.1422,  0.0662,
          0.0949,  0.0887,  0.1474,  0.1305,  0.1014,  0.0541,  0.0661,  0.0581,
          0.0612,  0.2567,  0.0879,  0.1375,  0.0589,  0.1331,  0.1204,  0.2344,
          0.0071,  0.1782,  0.1018,  0.0378,  0.0479,  0.0352,  0.1222,  0.1559,
          0.0271,  0.1113,  0.0802,  0.1315,  0.1736,  0.0862,  0.1058,  0.0041,
          0.0254, -0.0114,  0.1074,  0.1255,  0.0826,  0.0824,  0.1008,  0.0273,
          0.0606,  0.0407,  0.1127,  0.0876,  0.0962,  0.1102,  0.0927,  0.0824,
          0.0720,  0.1202,  0.0509,  0.0381,  0.1926,  0.2080,  0.0494,  0.0397,
          0.0213,  0.0664,  0.0493,  0.0240, -0.0245,  0.0673,  0.18

## Pytorch

## Backward pass

In [15]:
# Grad MSE
def mseGrad(inp, targ):
    # grad of loss with respect to output of previous layer
    inp.g = 2. * (inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0]
    
# Grad relu

def reluGrad(inp, out):
    inp.g = (inp>0).float() * out.g
    
def linGrad(inp, out, w, b):
    inp.g = out.g @ w.t()
    w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
    b.g = out.g.sum(0)

In [16]:
def forwardAndBackward(inp, targ):
    l1 = inp @ w1 + b1 # first layer - input mat multiply by weights plus bias
    l2 = relu(l1) # apply relu to activations
    out = l2 @ w2 + b2 # 
    loss = mse(out, targ)
    
    mseGrad(out, targ)
    linGrad(l2, out, w2, b2)
    reluGrad(l1, l2)
    linGrad(inp, l1, w1, b1)
    
    # all you need are the gradients to update the weights and biases
    
    #return loss
    

#f = mseGrad(t.squeeze(-1), y_train[0:100])
#type(w1)
#import numpy as np
#dd = np.ndarray([1, 2, 3])
#dd

In [17]:
forwardAndBackward(x_train[0:100, :], y_train[0:100])


w1.g

tensor([[ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
        [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
        [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
        ...,
        [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
        [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
        [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013]])

In [18]:
mseGrad(t3, y_train[0:100])
linGrad(t2, t3, w2, b2) # where t2 and t3 are the activations either side of the linear layer
reluGrad(t1)

t3.g.shape, w2.g.shape, w2.shape, t1.g.shape, t1.shape

TypeError: reluGrad() missing 1 required positional argument: 'out'

In [19]:
t.shape, y_train[0:100].shape

(torch.Size([100]), torch.Size([100]))

In [20]:
t.requires_grad = True

In [21]:
x = torch.ones(2, 2, requires_grad=True)
print(x)
y = x*2
y = y@y
print(y.grad_fn)
print(y.backward)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
<MmBackward object at 0x7f15d9c616d8>
<bound method Tensor.backward of tensor([[8., 8.],
        [8., 8.]], grad_fn=<MmBackward>)>


## Pytorch API example

In [57]:
# Base class with forward and backward function
class Module():
    def __call__(self, *args): 
        self.args = args
        self.out = self.forward(*args) # calls the forward function
        return self.out
    
    def forward(self): raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)


class Lin(Module):
    def __init__(self, w, b): self.w,self.b = w,b # store the weight and bias tensors
        
    def forward(self, inp): return inp@self.w + self.b # performs calculation - result is stored in self.out 
    # (via __call__ funciton) Forward function requires an input and writes the output
    
    def bwd(self, out, inp):
        inp.g = out.g @ self.w.t() # inp.g not stored..
        self.w.g = torch.einsum("bi,bj->ij", inp, out.g)
        self.b.g = out.g.sum(0)
        
class Relu(Module):
    def forward(self, inp): return inp.clamp_min(0.)-0.5
    def bwd(self, out, inp): inp.g = (inp>0).float() * out.g
    
    
class Mse(Module):
    def forward (self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ): inp.g = 2*(inp.squeeze()-targ).unsqueeze(-1) / targ.shape[0]

In [58]:
# Model class - network connectivity
class Model():
    def __init__(self):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)] # layers - list of layer objects
        self.loss = Mse() # loss - don't need loss to calculate gradients
        
    def __call__(self, x, targ): # loops through each layer using the output as the input to the next
        for l in self.layers: x = l(x)
        return self.loss(x, targ)
    
    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

In [65]:
#f = Lin(w1, b1)
#f(x_train) # forward layer

#f.backward, f.bwd

model = Model()
model(x_train[0:100, :], y_train[0:100])

model.layers[0].w.g, model.loss.out, model.layers[0].out

#f.backwar

(tensor([[ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
         [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
         [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
         ...,
         [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
         [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013],
         [ 0.0195,  0.0640, -0.0539,  ...,  0.0348,  0.0196,  0.0013]]),
 tensor(27.2052),
 tensor([[ 7.6163e-01,  1.4848e+00,  8.2667e-01, -7.3826e-01,  3.3115e+00,
          -5.0637e-01,  3.6960e+00, -1.2438e+00,  4.3691e-01, -1.3269e+00],
         [-2.3715e+00, -5.9796e-01,  1.4234e+00,  9.3929e-01,  4.9834e-01,
          -7.4829e-01,  2.5454e+00,  6.8939e-01, -7.0383e-01, -7.4188e-01],
         [ 1.2925e+00, -2.3899e-01,  1.6978e+00,  1.1892e-01, -6.7505e-01,
          -2.9154e+00, -1.5898e+00, -1.1483e+00,  1.0485e+00,  4.7840e-01],
         [-9.1694e-01,  1.3774e+00, -3.6955e-01, -3.1413e-01, -1.4994e+00,
           6