In [203]:
from mnist import MNIST
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

import math

def normalize(x):
    return (x - x.mean())/x.std()

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [170]:
MNIST

mnist.loader.MNIST

In [171]:
mndata = MNIST('./data/mnist/')
train_images, train_labels = mndata.load_training()
test_images, test_labels = mndata.load_testing()

train_images = np.array(train_images)
test_images = np.array(test_images)

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [172]:
train_images.shape

(60000, 784)

In [357]:
def get_num_layers_from_params(params):
    return int(len(params)/2)

def initialize(num_nodes_per_layer, input_size):    
    params = {}
    
    prev_input_size = input_size
    for l, n in enumerate(num_nodes_per_layer):
        bound = np.sqrt(2/prev_input_size)
        params['W' + str(l + 1)] = np.random.uniform(-bound, bound, (n, prev_input_size))
        params['b' + str(l + 1)] = np.zeros((n, 1))
        prev_input_size = n
        
    return params

def predict(x, params):
    a = x.reshape((x.shape[0],1))
    for l in range(1, get_num_layers_from_params(params) + 1):
        W = params['W' + str(l)]
        b = params['b' + str(l)]
        z = W.dot(a) + b
#         print(z)
        a = sigmoid(z)
#     print(a.shape)
    return a.reshape((a.shape[0],))

def update_weights(a_prev, a, W, b, learning_rate):
    gradient = a*(1 - a)*a_prev.T
    sm_a = softmax(a)
    delta_W = sm_a*W*(gradient)*learning_rate
    delta_W = delta_W/(np.square(W) + 10)
    W = W + delta_W
    W = np.apply_along_axis(lambda x: x/x.sum(), 1, W)
#     delta_b = sm_a*(a*(1 - a))*learning_rate
#     b = b + delta_b
    return W, b

def unsupervised_train(X, params, learning_rate=0.1):
    params = params.copy()
    
    for i in range(X.shape[0]):
        x = X[i, :].T
        a_prev = x.reshape((x.shape[0],1))
        for l in range(1, get_num_layers_from_params(params) + 1):
            W = params['W' + str(l)]
            b = params['b' + str(l)]
            z = W.dot(a_prev) + b
#             if i % 100 == 0:
#                 print(W.sum().sum())
#                 print(z)
            a = sigmoid(z)
            params['W' + str(l)], params['b' + str(l)] = update_weights(a_prev, a, W, b, learning_rate)
            a_prev = a
    return params

def transform_X(X, params):
    return np.apply_along_axis(lambda x: predict(x, params), 1, X)
        


In [372]:
n = 1000
X = train_images[0:n, :]
y = train_labels[0:n]

X = np.apply_along_axis(normalize, 1, X)
params = initialize([20, 15], 784)
trained_params = unsupervised_train(X, params)
transformed_X = transform_X(X, trained_params)

lr = LogisticRegression()
lr.fit(transformed_X, y)

test_X = np.apply_along_axis(normalize, 1, test_images)
trained_params_score = lr.score(transform_X(test_X, trained_params), test_labels)
untrained_params_score = lr.score(transform_X(test_X, params), test_labels)

print('trained: ', trained_params_score, '     untrained: ', untrained_params_score)

# lr.fit(X, y).score(test_images, test_labels)

  if sys.path[0] == '':


trained:  0.4282      untrained:  0.0633


In [364]:
transform_X(test_X, trained_params).shape

(10000, 10)

In [363]:
for k, v in trained_params.items():
    print(k, v.shape)

W1 (20, 784)
b1 (20, 1)
W2 (10, 20)
b2 (10, 1)
W3 (10, 10)
b3 (10, 1)
