In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [37]:
train = pd.read_csv("data/train.csv").to_numpy()
test = pd.read_csv("data/test.csv").to_numpy()

train[:,0]

x_train, y_train = train[:,1:], train[:,0]
x_test, y_test = test[:,1:], test[:,0]
x_train.shape

(42000, 784)

In [428]:
class Linear:
    def __init__(self, in_features, out_features) -> None:
        self.weights = np.random.uniform(-1,1,(out_features, in_features)) - 0.5
        self.bias = np.random.uniform(-1,1,(out_features, 1)) - 0.5
    
    def linear(self, x):
        """"""
        return self.weights.dot(x) + self.bias


class NeuralNetwork():
    def __init__(self) -> None:
        self.l1 = Linear(784, 15)
        self.l2 = Linear(15, 10)
    
    def forward(self, x):
        "x = (n x 784)"
        x = x.T # (n x 784) --> (784 x n)
        self.z1 = self.l1.linear(x) # (784 x n) --> (20 x n)
        self.a1 = self.ReLU(self.z1)
        self.z2 = self.l2.linear(self.a1) # (20 x n) --> (10 x n)
        self.a2 = self.softmax(self.z2) # (n x 10)
        return self.a2

    def backwards(self, y, x):
        m = y.size
        y = self.one_hot(y)
        dZ2 = self.a2 - y.T
        print(dZ2, self.a1.T)
        print(dZ2.dot(self.a1.T))
        # dW2 = 1 / m * dZ2.dot(self.a1.T)
        # db2 = 1 / m * np.sum(dZ2, 1)
        # dZ1 = self.l2.weights.T.dot(dZ2) * self.deriv_ReLU(self.z1)
        # dW1 = 1 / m * dZ1.dot(x)
        # db1 = 1 / m * np.sum(dZ1, 1)
        return #dW1, db2, dW2, db2

    @staticmethod
    def one_hot(y):
        enc_y = np.zeros((y.size, y.max()+1))
        enc_y[np.arange(y.size), y] = 1
        return enc_y
    
    @staticmethod
    def ReLU(x):
        return np.maximum(0,x)
    
    @staticmethod
    def deriv_ReLU(Z):
        return Z > 0

    @staticmethod
    def softmax(x):
        """This returns the row-wise softmax of a numpy array"""
        # stabilized = x - np.max(x, axis=0)
        e_x = np.exp(x)
        x = e_x / np.sum(e_x)
        return x



In [429]:
model = NeuralNetwork()
y_pred = model.forward(x_train)
y_pred[:,0]
# model.z1[:,0]
# model.backwards(y_train, x_train)


array([1.41125440e-06, 1.21228227e-06, 3.01301692e-06, 3.67054923e-06,
       1.64355488e-06, 8.79970224e-07, 8.77125899e-07, 5.05844329e-06,
       2.30539000e-06, 3.73793672e-06])

In [386]:
def cross_entropy(y, y_pred):
    loss = -np.sum(y*np.log(y_pred.T))
    return loss
y = model.one_hot(y_train)
cross_entropy(y, y_pred)


549450.3498593074

In [430]:
data = np.array(train)
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into dev and training sets

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255.

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape

In [431]:
def init_params():
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A
    
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def ReLU_deriv(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

In [432]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2

In [433]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.10, 500)

Iteration:  0
[3 4 1 ... 7 7 8] [5 2 7 ... 4 5 9]
0.08529268292682927
Iteration:  10
[3 6 1 ... 7 4 7] [5 2 7 ... 4 5 9]
0.1665609756097561
Iteration:  20
[6 6 7 ... 7 4 7] [5 2 7 ... 4 5 9]
0.23434146341463416
Iteration:  30
[6 6 7 ... 7 4 7] [5 2 7 ... 4 5 9]
0.301390243902439
Iteration:  40
[6 6 7 ... 7 8 7] [5 2 7 ... 4 5 9]
0.3695609756097561
Iteration:  50
[6 6 7 ... 7 8 7] [5 2 7 ... 4 5 9]
0.4281951219512195
Iteration:  60
[6 6 7 ... 7 8 7] [5 2 7 ... 4 5 9]
0.47524390243902437
Iteration:  70
[6 6 7 ... 7 0 7] [5 2 7 ... 4 5 9]
0.5254146341463415
Iteration:  80
[6 6 7 ... 7 0 7] [5 2 7 ... 4 5 9]
0.5767317073170731
Iteration:  90
[6 6 7 ... 7 0 7] [5 2 7 ... 4 5 9]
0.613780487804878
Iteration:  100
[6 2 7 ... 7 0 9] [5 2 7 ... 4 5 9]
0.6401951219512195
Iteration:  110
[6 2 7 ... 7 0 9] [5 2 7 ... 4 5 9]
0.6633902439024391
Iteration:  120
[6 2 7 ... 7 0 9] [5 2 7 ... 4 5 9]
0.6805609756097561
Iteration:  130
[6 2 7 ... 7 0 9] [5 2 7 ... 4 5 9]
0.6972682926829268
Iteration:  140


In [4]:
# Using nanograd
from nanograd.value import Value

a = Value(2)


ImportError: cannot import name 'Value' from 'nanograd.value' (/Users/jonah.breslow/dev/neural-net-numpy/nanograd/value.py)

In [439]:
value(3)

TypeError: 'module' object is not callable