In [1]:
import pandas as pd
train = pd.read_csv("MNIST/train.csv")
train.head(1)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
test = pd.read_csv("MNIST/test.csv")
test.head(1)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
print(train.shape, test.shape)

(42000, 785) (28000, 784)


## Preprocessing

In [19]:
X_train = train.drop(['label'], axis = 1).copy()
y_train = train['label']
print(X_train.shape, y_train.shape)
display(X_train.head(1), y_train.head(1))

(42000, 784) (42000,)


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0    1
Name: label, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split

X_train_f, X_valid_f, y_train_f, y_valid_f = train_test_split(X_train, y_train, test_size = 0.2, random_state = 1)

print(X_train_f.shape, X_valid_f.shape)
print(y_train_f.shape, y_valid_f.shape)

(33600, 784) (8400, 784)
(33600,) (8400,)


In [24]:
from keras.utils import to_categorical

y_train_hot = to_categorical(y_train_f)
y_valid_hot = to_categorical(y_valid_f)

print(y_train_hot.shape, y_valid_hot.shape)

(33600, 10) (8400, 10)


In [25]:
X_train_f = X_train_f.T
X_valid_f = X_valid_f.T
y_train_hot = y_train_hot.T
y_valid_hot = y_valid_hot.T

print(X_train_f.shape, X_valid_f.shape)
print(y_train_hot.shape, y_valid_hot.shape)


(784, 33600) (784, 8400)
(10, 33600) (10, 8400)


## Define Sigmoid, Cross-Entropy

In [27]:
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


In [28]:
def cross_entropy(actual, predict, eps = 1e-15):
    actual = np.array(actual)
    predict = np.array(predict)
    clipped_predict = np.minimum(np.maximum(predict, eps), 1 - eps)
    
    loss = actual * np.log(clipped_predict) + (1 - actual) * np.log(1 - clipped_predict)
    return -1.0 * loss.mean()

## Solving Problems

In [106]:
num_epoch = 300
learning_rate = 1.0

w1 = np.random.uniform(low = -0.6, high = 0.6, size = (2000, 784)) # (num_nodes, num_features)
b1 = np.random.uniform(low = -0.6, high = 0.6, size = (2000, 1)) # (num_nodes, 1)
w2 = np.random.uniform(low = -0.6, high = 0.6, size = (10, 2000)) # (num_labels, num_nodes)
b2 = np.random.uniform(low = -0.6, high = 0.6, size = (10, 1)) # (num_labels, 1)

num_data = X_train_f.shape[1]

for epoch in range(num_epoch):
    z1 = np.dot(w1, X_train_f) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(w2, a1) + b2
    a2 = sigmoid(z2)

    y_predict_hot = a2
    y_predict = np.argmax(a2, axis = 0)
    accuracy = (y_predict == y_train_f).mean()
    
    if accuracy > 0.98: break
    
    loss = cross_entropy(y_train_hot, y_predict_hot)
    if epoch % 10 == 0:
        print("{0:2} accuracy = {1:.5f}, loss = {2:.5f}".format(epoch, accuracy, loss))
    
    d2 = y_predict_hot - y_train_hot
    d1 = np.dot(w2.T, d2) * a1 * (1 - a1)  # dsigmoid(a1) = a1 * (1 - a1)

    w2 = w2 - learning_rate * np.dot(d2, a1.T) / num_data
    w1 = w1 - learning_rate * np.dot(d1, X_train_f.T) / num_data

    b2 = b2 - learning_rate * d2.mean(axis = 1, keepdims = True)
    b1 = b1 - learning_rate * d1.mean(axis = 1, keepdims = True)


print("----" * 10)
print("{0:2} accuracy = {1:.5f}, loss = {2:.5f}".format(epoch, accuracy, loss))

  This is separate from the ipykernel package so we can avoid doing imports until


 0 accuracy = 0.07595, loss = 2.53042
10 accuracy = 0.57795, loss = 1.00794
20 accuracy = 0.84152, loss = 0.20508
30 accuracy = 0.87327, loss = 0.15224
40 accuracy = 0.89092, loss = 0.12527
50 accuracy = 0.90408, loss = 0.10790
60 accuracy = 0.91208, loss = 0.09591
70 accuracy = 0.91932, loss = 0.08662
80 accuracy = 0.92429, loss = 0.07987
90 accuracy = 0.92940, loss = 0.07426
100 accuracy = 0.93250, loss = 0.07001
110 accuracy = 0.93649, loss = 0.06596
120 accuracy = 0.93976, loss = 0.06267
130 accuracy = 0.94122, loss = 0.05998
140 accuracy = 0.94357, loss = 0.05743
150 accuracy = 0.94554, loss = 0.05532
160 accuracy = 0.94708, loss = 0.05313
170 accuracy = 0.94836, loss = 0.05152
180 accuracy = 0.94970, loss = 0.04998
----------------------------------------
181 accuracy = 0.95018, loss = 0.04998


### Model Evaluation(train set)

In [107]:
z1 = np.dot(w1, X_train_f) + b1
a1 = sigmoid(z1)
z2 = np.dot(w2, a1) + b2
a2 = sigmoid(z2)

y_predict = np.argmax(a2, axis = 0)
accuracy = (y_train_f == y_predict).mean()
print(f"accuracy = {accuracy:.5f}")

  This is separate from the ipykernel package so we can avoid doing imports until


accuracy = 0.95018


## Model Evaluation(validation set)

In [108]:
z1 = np.dot(w1, X_valid_f) + b1
a1 = sigmoid(z1)
z2 = np.dot(w2, a1) + b2
a2 = sigmoid(z2)

y_predict = np.argmax(a2, axis = 0)
accuracy = (y_valid_f == y_predict).mean()
print(f"accuracy = {accuracy:.5f}")

accuracy = 0.90345


  This is separate from the ipykernel package so we can avoid doing imports until


### Predict Test Data set

In [109]:
z1 = np.dot(w1, test.T) + b1
a1 = sigmoid(z1)
z2 = np.dot(w2, a1) + b2
a2 = sigmoid(z2)

y_predict = np.argmax(a2, axis = 0)


  This is separate from the ipykernel package so we can avoid doing imports until


In [79]:
y_predict.shape, submission.shape

((28000,), (28000, 1))

In [110]:
submission = pd.read_csv('MNIST/sample_submission.csv', index_col = 'ImageId')
submission['Label'] = y_predict
print(submission.shape)
submission.to_csv('submission.csv')

(28000, 1)
