In [1]:
import random
import pickle

import numpy as np
import matplotlib.pyplot as plt

In [4]:
def read_data(x_file, y_file):
    X = np.loadtxt(x_file, delimiter=',')
    Y = np.loadtxt(y_file, delimiter=",")
    return X, Y

In [141]:
def one_hot_label(Y):
    size = int(np.max(Y) + 1)
    return np.eye(size)[Y.astype(int)]

In [57]:
def soft_max(Y):
    C = np.max(Y, axis = 1, keepdims=True)
    temp = np.exp(Y - np.log(C))
    return temp/np.sum(temp, axis = 1, keepdims=True)

In [94]:
def sigmoid(X):
    return 1. / (1 + np.exp(- X))

$$X = \begin{bmatrix} ----(x_1) ---- \,1 \\ \vdots \\ ----(x_m) ---- \,1 \end{bmatrix}$$
And 
$$\theta = \begin{bmatrix} \theta_1 \\ \vdots \\ \theta_n \\ 1\end{bmatrix}$$

In [79]:
def init_params(*shape):
    output = []
    for i in range(1, len(shape[0])):
        output.append(np.random.normal(size = (shape[0][i - 1] + 1, shape[0][i])))
        output[-1][-1, :] = 0
    return output

In [139]:
def forward_prop(thetas, activation, X):
    output = [X]
    for i in range(len(thetas)):
        bias = np.ones((output[-1].shape[0], 1))
        output[-1] = np.c_[output[-1], bias]
        temp = output[-1] @ thetas[i]
        output.append(temp)
        output.append(activation[i](temp))
    return output

In [167]:
def crossEntropy(y_hat, y):
    return np.mean(- np.log(y_hat) * y, axis = 0)

In [83]:
def loss(entrophyV):
    return np.mean(entrophyV)

In [177]:
def predict(y): # output a integer vector, not one-hot
    return np.argmax(y, axis = 1)

In [194]:
def accuracy(y_hat, y):
    return y[y_hat==y].size / y.size

Update W:
$$[\frac{\partial \ell}{\partial O} \frac{\partial O}{\partial a}] \frac{\partial a}{\partial w}$$
Pass to next layer:
$$[[\frac{\partial \ell}{\partial O} \frac{\partial O}{\partial a}] \frac{\partial a}{\partial h} \frac{\partial h}{\partial a}] \frac{\partial a}{\partial w}$$

In [197]:
def back_prop(thetas, values, loss, alpha):

    grad1 = loss / values[-1]  * (values[-1] - values[-1]** 2)
    print(grad1.shape)
    print(thetas[-1].shape)
    print(values[-3].shape)
    grad2 = grad1 * thetas[-1] * (1 - value[-3]) * value[-3] 
    
    thetas[-1] += grad1 * values[-2] * alpha
    thetas[-2] += grad2 * value[-5] * alpha
    
    return thetas

In [15]:
raw_trainX, raw_trainY = read_data('dataset/images_train.csv', 'dataset/labels_train.csv')

In [16]:
raw_testX, raw_testY = read_data('dataset/images_test.csv', 'dataset/labels_test.csv')

-----

In [91]:
thetas = init_params([raw_trainX.shape[1], 100, 10])

In [146]:
y = one_hot_label(raw_trainY[:5])

In [108]:
thetas[1].shape

(101, 10)

In [168]:
output = forward_prop(thetas, [sigmoid, soft_max], raw_trainX[:5, :])

In [169]:
entrophy = crossEntropy(output[-1], y)

In [171]:
l = loss(entrophy)

In [198]:
back_prop(thetas, output, entrophy, 1)

(5, 10)
(101, 10)
(5, 101)


ValueError: operands could not be broadcast together with shapes (5,10) (101,10) 

In [195]:
accuracy(predict(output[-1]), raw_trainY[:5])

0.0