In [1]:
import os
import math
import random
from mnist import MNIST
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import helper

import pickle
import time

%matplotlib inline

### Load Data

In [2]:
mndata = MNIST('dataset')

x_train, y_train = mndata.load_training()

In [3]:
# Normalize input
# append one feature  (for the bias term)

x_train_normalized = [image[:] for image in x_train]


for image in x_train_normalized:    
    for idx, color in enumerate(image):
        image[idx] = color / 255
    
    # for the bias
    image.append(1)

### Linear Progression

In [10]:

def flatten(array):
    flat = []
    for item in array:
        try:
            iter(item)
            flat.extend(flatten(item))
        except:
            flat.append(item)
    return flat

def sigmoid_scalar(x):
    if x < 0:
        sigmoid = math.exp(x) / (1 + math.exp(x))
    else:
        sigmoid = 1 / (1 + math.exp(-x))

    # to handle machine precision errors
    sigmoid = max(0.0001, sigmoid)
    sigmoid = min(0.9999, sigmoid)

    return sigmoid


def sigmoid_vector(X):
    flat = flatten(X)

    for i in range(len(flat)):
        flat[i] = sigmoid_scaler(flat[i])

    return flat


def predict(sample_x, weight):
    # calculate z = sum(w * x + b)
    # here bias (b) is also included in weight
    z = 0
    for x, w in zip(sample_x, weight):
        z += w * x

    # sigmoid(z)
    return sigmoid_scaler(z)


def cross_entropy_loss(predicted_value, actual_value):
    y = actual_value
    y_pred = predicted_value

    if y == 1:
        return -math.log(y_pred + 0.000001)

    else:
        return -math.log(1 - y_pred + 0.000001)


def cost_function(datas, target, weight, bias):
    cost = 0

    for x, y in zip(datas, target):
        y_pred = predict(x, weight, bias)

        cost += cross_entropy_loss(y_pred, y)

    return cost / len(datas)


In [11]:
def gradient_decent(X, label, weight=None, learning_rate=0.1):
    n_features = len(X[0])

    if weight == None:
        weight = [0] * n_features

    loss = 0

    for x, y in zip(X, label):
        y_pred = predict(x, weight)
        loss += cross_entropy_loss(y_pred, y)

        # dw = (y_pred - y) * x
        # weight = weight - learning_rate * dw
        err = y_pred - y
        for i in range(n_features):
            dw_i = err * x[i]
            weight[i] -= learning_rate * dw_i

    return weight, loss / len(X)

In [12]:
def train_binary_class(x_train, y_train, learning_rate, epoch, verbose=False):
    weights = None

    # for graphing
    history = []

    for i in range(epoch):
        weights, loss = gradient_decent(x_train, y_train, weights, learning_rate)

        history.append(loss)

        if verbose:
            print(f"Epoch [{i}]\n\t- Cross entropy loss: {loss}\n")

    return weights, history

In [13]:
def train_multiclass(x_train, y_train, learning_rate, epoch, verbose=False):
    # identify unique classes
    classes = range(0, 10)

    # for each class create separate labels suitable for binary classification
    labels = [[] for _ in classes]

    for class_ in classes:
        for label in y_train:            
            if label == class_:
                labels[class_].append(1)

            else:
                labels[class_].append(0)
    
    # now that we have separate labels for each class
    # lets train binary classifier for each class
    # (each classifier will identify whether sample x is member of class or not)

    # we have 10 classes so we need 10 binary classifiers
    classifiers = [None] * 10
    histories = [None] * 10

    for cls_, label in enumerate(labels):
        if verbose:
            print(f'Training class [{cls_}]')

        weights, history = train_binary_class(x_train, label, learning_rate, epoch)        

        classifiers[cls_] = weights
        histories[cls_] = history

        if verbose:
            print('---------------------------------------------------------------------')
    

    return classifiers, histories

In [14]:
def argmax(values):
    max_value = float('-inf')
    max_index = 0

    for idx, value in enumerate(values):
        if value > max_value:
            max_index = idx
            max_value = value

    return max_index

In [18]:
def predict_class(sample_x, model):
    class_probabilities = []
    
    for weights in model:
        class_probability = predict(sample_x, weights)
        class_probabilities.append(class_probability)
    
    return argmax(class_probabilities)

In [19]:
# sum([predict_class(x_train_normalized[i], model) == y_train[i] for i in range(60000)])

def confusion_matrix(test_x, label, model):
    grid = [[0] * 10 for _ in range(10)]
    net_accuracy = 0

    for i in range(len(test_x)):
        prediction = predict_class(test_x[i], model)

        grid[prediction][label[i]] += 1

        if prediction == label[i]:
            net_accuracy += 1              

    return grid, net_accuracy / len(test_x)

### Training

In [20]:
learning_rates = [0.0001, 0.001, 0.01, 0.1, 1.0, 1.5]

epochs = 10

train_size = 50000

x_train_set = x_train_normalized[:train_size]
y_train_set = y_train[:train_size]

x_validation_set = x_train_normalized[train_size:]
y_validaiton_set = y_train[train_size:]


histories = []

for eta in learning_rates:
    print(f'Learning rate: {eta}')

    start_time = time.perf_counter()

    model, history = train_multiclass(x_train_set, y_train_set, eta, epochs, verbose=True)

    time_taken = time.perf_counter() - start_time  

    with open(f'learning_rate_[{eta}].m', 'wb') as f:
        pickle.dump(model, f)

    histories.append(
      (   time_taken,
          histories,
          *confusion_matrix(x_validation_set, y_validaiton_set, model)
      )
    )

Learning rate: 0.0001
Training class [0]
---------------------------------------------------------------------
Training class [1]
---------------------------------------------------------------------
Training class [2]
---------------------------------------------------------------------
Training class [3]
---------------------------------------------------------------------
Training class [4]
---------------------------------------------------------------------
Training class [5]
---------------------------------------------------------------------
Training class [6]
---------------------------------------------------------------------
Training class [7]
---------------------------------------------------------------------
Training class [8]
---------------------------------------------------------------------
Training class [9]
---------------------------------------------------------------------
Learning rate: 0.001
Training class [0]
------------------------------------------------

OverflowError: math range error