In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
train = pd.read_csv("data/train.csv").to_numpy()
test = pd.read_csv("data/test.csv").to_numpy()

train[:,0]

x_train, y_train = train[:,1:], train[:,0]
x_test, y_test = test[:,1:], test[:,0]
x_train.shape

(42000, 784)

In [4]:
class Linear:
    def __init__(self, in_features, out_features) -> None:
        self.weights = np.random.uniform(-1,1,(out_features, in_features)) - 0.5
        self.bias = np.random.uniform(-1,1,(out_features, 1)) - 0.5
    
    def linear(self, x):
        """"""
        return self.weights.dot(x) + self.bias


class NeuralNetwork():
    def __init__(self) -> None:
        self.l1 = Linear(784, 15)
        self.l2 = Linear(15, 10)
    
    def forward(self, x):
        "x = (n x 784)"
        x = x.T # (n x 784) --> (784 x n)
        self.z1 = self.l1.linear(x) # (784 x n) --> (20 x n)
        self.a1 = self.ReLU(self.z1)
        self.z2 = self.l2.linear(self.a1) # (20 x n) --> (10 x n)
        self.a2 = self.softmax(self.z2) # (n x 10)
        return self.a2

    def backwards(self, y, x):
        m = y.size
        y = self.one_hot(y)
        dZ2 = self.a2 - y.T
        print(dZ2, self.a1.T)
        print(dZ2.dot(self.a1.T))
        # dW2 = 1 / m * dZ2.dot(self.a1.T)
        # db2 = 1 / m * np.sum(dZ2, 1)
        # dZ1 = self.l2.weights.T.dot(dZ2) * self.deriv_ReLU(self.z1)
        # dW1 = 1 / m * dZ1.dot(x)
        # db1 = 1 / m * np.sum(dZ1, 1)
        return #dW1, db2, dW2, db2

    @staticmethod
    def one_hot(y):
        enc_y = np.zeros((y.size, y.max()+1))
        enc_y[np.arange(y.size), y] = 1
        return enc_y
    
    @staticmethod
    def ReLU(x):
        return np.maximum(0,x)
    
    @staticmethod
    def deriv_ReLU(Z):
        return Z > 0

    @staticmethod
    def softmax(x):
        """This returns the row-wise softmax of a numpy array"""
        # stabilized = x - np.max(x, axis=0)
        e_x = np.exp(x)
        x = e_x / np.sum(e_x)
        return x



In [5]:
model = NeuralNetwork()
y_pred = model.forward(x_train)
y_pred[:,0]
# model.z1[:,0]
# model.backwards(y_train, x_train)


array([4.41006910e-06, 2.14481809e-06, 1.53121514e-06, 2.28343590e-06,
       3.79404268e-06, 2.73866237e-06, 4.06158901e-06, 8.81143157e-07,
       8.63109242e-07, 1.10143912e-06])

In [6]:
def cross_entropy(y, y_pred):
    loss = -np.sum(y*np.log(y_pred.T))
    return loss
y = model.one_hot(y_train)
cross_entropy(y, y_pred)


550872.6430465538

In [7]:
data = np.array(train)
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into dev and training sets

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255.

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape

In [8]:
def init_params():
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A
    
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def ReLU_deriv(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

In [9]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2

In [10]:
# W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.10, 500)

## Nanograd

In [1]:
from nanograd.nn import MLP

ModuleNotFoundError: No module named 'autograd'

In [170]:
model = MLP(3, [4,4,1])

xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]

ys = [1.0, -1.0, -1.0, 1.0]

In [171]:
for k in range(120):
    # forward pass
    y_pred = [n(x) for x in xs]
    loss = sum((y_out - y_gt)**2 for y_gt, y_out in zip(ys, y_pred))

    # backward pass
    n.zero_grad()
    loss.backward()
    
    # update
    for p in n.parameters():
        p.data += -0.03 * p.grad
    
    if k % 10 == 0:
        print(k, loss.data)


0 0.0
10 0.0
20 0.0
30 0.0
40 0.0
50 0.0
60 0.0
70 0.0
80 0.0
90 0.0
100 0.0
110 0.0


In [168]:
y_pred

[Value(data=1.0, grad=0.0),
 Value(data=-1.0, grad=0.0),
 Value(data=-1.0, grad=0.0),
 Value(data=1.0, grad=0.0)]

In [52]:
import math
import random
class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out
    
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            #https://en.wikipedia.org/wiki/Hyperbolic_functions#Derivatives
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def backward(self):

        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

In [53]:
from graphviz import Digraph

def trace(root):
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir':'LR'})

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any values in the graph, create a rectangular record node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f}" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it 
            dot.node(name = uid + n._op, label = n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    
    return dot

In [54]:
class Module:

    def zero_grad(self):
        for p in self.parameters():
            p.grad = 0

    def parameters(self):
        return []

class Neuron(Module):

    def __init__(self, nin, nonlin=True):
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(0)
        self.nonlin = nonlin

    def __call__(self, x):
        act = sum((wi*xi for wi,xi in zip(self.w, x)), self.b)
        return act.relu() if self.nonlin else act

    def parameters(self):
        return self.w + [self.b]

    def __repr__(self):
        return f"{'ReLU' if self.nonlin else 'Linear'}Neuron({len(self.w)})"

class Layer(Module):

    def __init__(self, nin, nout, **kwargs):
        self.neurons = [Neuron(nin, **kwargs) for _ in range(nout)]

    def __call__(self, x):
        out = [n(x) for n in self.neurons]
        return out[0] if len(out) == 1 else out

    def parameters(self):
        return [p for n in self.neurons for p in n.parameters()]

    def __repr__(self):
        return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"

class MLP(Module):

    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1], nonlin=i!=len(nouts)-1) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

    def __repr__(self):
        return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"

In [160]:
n = MLP(3, [10,4,1])

for k in range(300):
    # forward pass
    y_pred = [n(x) for x in xs]
    loss = sum((y_out - y_gt)**2 for y_gt, y_out in zip(ys, y_pred))

    # backward pass
    n.zero_grad()
    loss.backward()
    
    # update
    for p in n.parameters():
        p.data += -0.03 * p.grad
    
    if k % 10 == 0:
        print(k, loss.data)


0 15.644302330276453
10 0.12789674936858736
20 0.02669108364639066
30 0.002716197150222608
40 0.00026326915684857726
50 2.38239795562712e-05
60 2.0954969835986387e-06
70 1.827411707068524e-07
80 1.5895734839687593e-08
90 1.3816510721278513e-09
100 1.200659505247582e-10
110 1.0433090182590326e-11
120 9.065623759111399e-13
130 7.87734658627935e-14
140 6.844811463879642e-15
150 5.947614628229596e-16
160 5.168018518533631e-17
170 4.490610343309173e-18
180 3.9019945862954174e-19
190 3.3905337950954747e-20
200 2.9460908076470287e-21
210 2.559953789497077e-22
220 2.2245557212727158e-23
230 1.932671019667334e-24
240 1.6792024796633683e-25
250 1.4636118479066398e-26
260 1.2758469287269017e-27
270 1.1312758418935072e-28
280 9.577264427448846e-30
290 1.836566794967668e-30


In [161]:
y_pred

[Value(data=1.0000000000000007, grad=1.3322676295501878e-15),
 Value(data=-0.9999999999999999, grad=2.220446049250313e-16),
 Value(data=-0.9999999999999996, grad=8.881784197001252e-16),
 Value(data=0.9999999999999996, grad=-8.881784197001252e-16)]

In [152]:


# # forward pass
y_pred = [n(x) for x in xs]
loss = sum((y_out - y_gt)**2 for y_gt, y_out in zip(ys, y_pred))

# backward pass
n.zero_grad()
loss.backward()

# update
for p in n.parameters():
    p.data += -0.05 * p.grad

loss, y_pred

(Value(data=2.9400018210573545e-05, grad=1),
 [Value(data=0.9965012300542382, grad=-0.00699753989152363),
  Value(data=-0.9979128801676643, grad=0.004174239664671342),
  Value(data=-0.9992508893611005, grad=0.0014982212777989723),
  Value(data=0.9965012300542382, grad=-0.00699753989152363)])

[Value(data=3.063175324560093, grad=0),
 Value(data=-5.174558671545837, grad=0),
 Value(data=0.8995198412257644, grad=0),
 Value(data=0.7388069500948937, grad=0)]

In [839]:
n.layers[0].parameters()

[Value(data=0.9952375350608573, label=),
 Value(data=0.3481276520024903, label=),
 Value(data=-0.34357154982839777, label=),
 Value(data=0.25998945495712955, label=),
 Value(data=0.18451892991351748, label=),
 Value(data=-0.23510638454343002, label=),
 Value(data=-0.4906381666045556, label=),
 Value(data=-0.7181546648417907, label=)]