# MLP from scratch

In [2]:
# loading in packages and data
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
# Fix the RNG seed so the randomly initialised weights are reproducible.
torch.manual_seed(42)

# Display settings: grayscale colormap for images, compact printing for tensors and arrays.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# The dataset is a gzip-compressed pickle at data/mnist.pkl.gz.
# NOTE: unpickling can execute arbitrary code -- only load a copy of this file that you trust.
path_data = Path('data')
path_gz = path_data / 'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f:
    # The pickle holds three (inputs, labels) pairs; the third one is discarded.
    (x_train, y_train), (x_valid, y_valid), _ = pickle.load(f, encoding='latin-1')

# Convert the training and validation inputs/labels to torch tensors.
x_train, y_train, x_valid, y_valid = [tensor(a) for a in (x_train, y_train, x_valid, y_valid)]
     

# Foundations

## Basic architecture

In [4]:
# n: number of training rows, m: number of columns (input features) per row.
n, m = x_train.shape
# c: largest label + 1, i.e. the number of classes assuming labels start at 0.
# Note that this is a 0-d tensor, not a Python int.
c = y_train.max() + 1
n, m, c

(50000, 784, tensor(10))

In [5]:
# number of units in the hidden layer
n_hidden = 50

In [8]:
# Layer 1 maps m inputs to n_hidden units; layer 2 maps n_hidden units to a single output
# (one number predicting the digit, 0-9). Weights are standard-normal draws (w1 is drawn
# before w2), and biases start at zero.
w1, b1 = torch.randn(m, n_hidden), torch.zeros(n_hidden)
w2, b2 = torch.randn(n_hidden, 1), torch.zeros(1)

In [9]:
def lin(x, w, b):
    """Affine (linear) layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [10]:
x_valid.shape

torch.Size([10000, 784])

In [11]:
# First-layer output (before the activation) for the whole validation set.
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

In [12]:
def relu(x):
    """Rectified linear unit: every element of `x` below zero is replaced by zero."""
    floor = 0.
    return x.clamp_min(floor)

In [13]:
t

tensor([[  8.84,  -0.79, -14.96,  ...,   3.93,   6.07,  -8.98],
        [-12.38, -13.91, -14.78,  ...,   4.87,  11.59,  -7.35],
        [-14.00,  -1.00,  -6.35,  ...,   2.09,  10.47,  -7.58],
        ...,
        [ -0.77, -16.81,  -9.85,  ...,  -6.92,   7.32,  -0.93],
        [ -4.44, -11.85,  -7.86,  ...,   6.45,   8.31,  -5.68],
        [  8.37,  -8.44,  -2.33,  ...,   9.87,   9.05, -11.56]])

In [15]:
# Apply the activation: negative entries of t become 0, the rest are unchanged.
t = relu(t)
t

tensor([[ 8.84,  0.00,  0.00,  ...,  3.93,  6.07,  0.00],
        [ 0.00,  0.00,  0.00,  ...,  4.87, 11.59,  0.00],
        [ 0.00,  0.00,  0.00,  ...,  2.09, 10.47,  0.00],
        ...,
        [ 0.00,  0.00,  0.00,  ...,  0.00,  7.32,  0.00],
        [ 0.00,  0.00,  0.00,  ...,  6.45,  8.31,  0.00],
        [ 8.37,  0.00,  0.00,  ...,  9.87,  9.05,  0.00]])

In [16]:
def model(xb):
    """Forward pass of the two-layer MLP: linear -> ReLU -> linear.

    Reads the module-level parameters `w1`, `b1` (first layer) and `w2`, `b2`
    (second layer) and returns the second layer's output for the batch `xb`.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [17]:
# Forward pass over the validation set: one output value per example.
res = model(x_valid)
res.shape

torch.Size([10000, 1])

Loss function: MSE

In [18]:
y_valid.shape

torch.Size([10000])

In [26]:
# Per-example error (prediction minus target); despite the name this is not yet a scalar loss.
loss = res.squeeze() - y_valid  # squeeze() removes every size-1 dim: (10000, 1) -> (10000,)
loss.shape

torch.Size([10000])

In [25]:
loss

tensor([  3.97,  12.58, -24.16,  ..., -32.30, -21.26, -56.63])

In [28]:
def mse(preds, tgt):
    """Mean squared error between `preds` and `tgt`.

    `preds.squeeze()` strips every size-1 dimension first, so an (n, 1) column of
    predictions lines up element-wise with an (n,) target vector.
    """
    err = preds.squeeze() - tgt
    sq_err = err ** 2
    return sq_err.mean()

In [31]:
# Forward pass over the training set.
preds = model(x_train)

In [32]:
mse(preds, y_train)

tensor(744.75)

## Gradients and backpropagation

- Using the chain rule to move backwards from the loss, plus some computational tricks

In [1]:
from sympy import symbols, diff

In [2]:
# Declare symbolic variables for sympy to differentiate with respect to.
x, y = symbols('x y')

In [3]:
diff(x**2, x)

2*x

In [None]:
diff(3*x**9, x