In [1]:
# Reload imported modules automatically so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

# Show matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
#export
from exp.nb_01 import *
# NOTE(review): this cell is tagged `#export`, so this print presumably lands in the
# generated module and would run on every import — confirm, and consider moving it
# to a non-exported cell.
print(MNIST_URL)
def get_data():
    """Download the gzipped MNIST pickle and return its train/valid arrays as tensors.

    Returns a lazy ``map`` yielding, in order, ``x_train, y_train, x_valid, y_valid``,
    each passed through ``tensor``. The third split stored in the pickle is discarded.

    NOTE: ``pickle.load`` can execute arbitrary code, so the downloaded file
    must come from a trusted source.
    """
    archive_path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive_path, 'rb') as fh:
        train_split, valid_split, _ = pickle.load(fh, encoding='latin-1')
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x, m, s):
    """Standardize `x`: subtract the mean `m`, then divide by the standard deviation `s`."""
    centered = x - m
    return centered / s

http://deeplearning.net/data/mnist/mnist.pkl


In [3]:
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [4]:
class Module():
    """Minimal layer base class that remembers its inputs/output for a manual backward pass.

    Subclasses implement ``forward(*args)`` (compute the output) and
    ``bwd(out, *args)`` (store gradients on the inputs/parameters).
    """
    def __call__(self, *args):
        # Cache the inputs and the output so `backward` can replay them into `bwd`.
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        """Compute the layer output; must be overridden by subclasses."""
        # Accept *args so calling an un-overridden forward reports 'not implemented'
        # instead of a TypeError about the argument count.
        raise NotImplementedError('not implemented')

    def backward(self):
        """Run the subclass's `bwd` with the cached output and inputs from the last call."""
        self.bwd(self.out, *self.args)

In [5]:
class Relu(Module):
    """ReLU shifted down by 0.5, with a hand-written gradient."""
    def forward(self, inp):
        rectified = inp.clamp_min(0.)
        return rectified - 0.5

    def bwd(self, out, inp):
        # The constant shift does not affect the gradient: pass it through where inp > 0.
        positive_mask = (inp > 0).float()
        inp.g = positive_mask * out.g

In [6]:
class Lin(Module):
    """Affine layer ``inp @ w + b`` with a hand-written backward pass."""
    def __init__(self, w, b):
        self.w = w
        self.b = b

    def forward(self, inp):
        projected = inp @ self.w
        return projected + self.b

    def bwd(self, out, inp):
        upstream = out.g
        # Gradient w.r.t. the layer input.
        inp.g = upstream @ self.w.t()
        # Gradient w.r.t. the weights
        # (alternative spelling: torch.einsum("bi,bj->ij", inp, out.g)).
        self.w.g = inp.t() @ upstream
        # Gradient w.r.t. the bias: sum over the first (batch) axis.
        self.b.g = upstream.sum(0)

In [7]:
class Mse(Module):
    """Mean squared error with a hand-written gradient.

    Assumes `inp` is a column of predictions with a trailing unit dimension
    (e.g. shape (batch, 1)) and `targ` is 1-D of length batch — TODO confirm
    for other callers.
    """
    def forward (self, inp, targ):
        # squeeze(-1) drops only the trailing unit dim; a bare squeeze() would also
        # remove the batch dim when the batch size is 1.
        return (inp.squeeze(-1) - targ).pow(2).mean()

    def bwd(self, out, inp, targ):
        # d(mean((inp-targ)^2))/d(inp) = 2*(inp-targ)/batch, reshaped back to a column.
        inp.g = 2*(inp.squeeze(-1)-targ).unsqueeze(-1) / targ.shape[0]

In [8]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an MSE loss.

    The weights `w1`, `b1`, `w2`, `b2` are read from the enclosing (global) scope
    at construction time.
    """
    def __init__(self):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        # Thread the activations through each layer in order, then score against targ.
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, targ)

    def backward(self):
        # Gradients flow from the loss back through the layers, last layer first.
        self.loss.backward()
        for layer in reversed(self.layers):
            layer.backward()

In [9]:
# Load the MNIST training and validation inputs/labels as tensors.
x_train,y_train,x_valid,y_valid = get_data()

In [10]:
# Standardize the training inputs using their own overall mean and std.
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
# NB: Use the training (not validation) mean and std for the validation set,
# so both sets go through exactly the same transform.
x_valid = normalize(x_valid, train_mean, train_std)

In [11]:
# n: number of training rows, m: number of features per row.
n,m = x_train.shape
# c: largest label + 1 (the class count, assuming labels start at 0).
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [12]:
# After normalization the training inputs should have mean ~0 and std ~1.
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [13]:
# Number of hidden units (the output width of the first linear layer).
nh = 50

In [14]:
# Simplified Kaiming / He init: scale each weight matrix by 1/sqrt(its number of inputs).
# w1 is (m, nh), i.e. (784, 50) here; w2 is (nh, 1), so the network emits one value per row.
# NOTE(review): no random seed is set, so these draws differ from run to run.
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

In [15]:
# With the init above, w1 should have mean ~0 and std ~1/sqrt(m).
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/math.sqrt(m))

In [16]:
# This should be ~ (0,1) (mean,std), but only approximately:
# x_valid was normalized with the training-set statistics, not its own.
x_valid.mean(),x_valid.std()

(tensor(-0.0059), tensor(0.9924))

In [17]:
# Kaiming / He init for ReLU: std = sqrt(2/m), where m is the number of inputs to the layer.
w1 = torch.randn(m,nh)*math.sqrt(2/m)

In [18]:
#export
from torch.nn import init

In [19]:
# Same kind of init via torch.nn.init, which overwrites w1 (see the non-zero output below).
# NOTE(review): w1 is laid out (inputs, outputs) for `x@w`; presumably mode='fan_out' is chosen
# because PyTorch derives the fans assuming an (outputs, inputs) layout, which would make this
# scale by sqrt(2/m) — confirm against the kaiming_normal_ documentation.
w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1, mode='fan_out')

tensor([[ 0.0145, -0.1102, -0.0407,  ...,  0.0286, -0.0382,  0.0522],
        [ 0.0393,  0.0239, -0.0364,  ...,  0.0552,  0.0579, -0.0310],
        [-0.0407,  0.0530,  0.0006,  ...,  0.0828, -0.0450,  0.0398],
        ...,
        [-0.0649, -0.0673, -0.0092,  ..., -0.0082,  0.0319, -0.0332],
        [-0.0101, -0.0075,  0.0413,  ...,  0.0325,  0.0323, -0.0334],
        [ 0.0587,  0.0136,  0.0653,  ...,  0.0829, -0.0224, -0.0728]])

In [20]:
def lin(x, w, b):
    """Affine transform: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [21]:
def relu(x):
    """Rectified linear unit: negative entries of `x` are replaced with 0."""
    lower_bound = 0.
    return x.clamp_min(lower_bound)

In [22]:
# Re-draw w1 with the ReLU-aware scale, then inspect the mean/std of the first layer's
# activations (after relu) on the validation inputs.
w1 = torch.randn(m,nh)*math.sqrt(2./m )
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(0.6655), tensor(0.8863))

In [23]:
# Start every parameter with an empty gradient slot (the layers' bwd methods fill in `.g`).
w1.g,b1.g,w2.g,b2.g = [None]*4
# NOTE(review): Model() picks up w1, b1, w2, b2 from the notebook's global namespace.
model = Model()

In [24]:
# Time one forward pass over the full training set; `loss` is the resulting MSE value.
%time loss = model(x_train, y_train)

CPU times: user 92.1 ms, sys: 0 ns, total: 92.1 ms
Wall time: 15.4 ms


In [26]:
# Convert this notebook to a Python module (per the output below: exp/nb_02.py);
# presumably only the cells tagged `#export` are included — confirm in notebook2script.py.
!python notebook2script.py 02_fully_connected.ipynb

Converted 02_fully_connected.ipynb to exp/nb_02.py
