In [93]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [94]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [95]:
X, y = mnist["data"], mnist["target"]

In [96]:
y = y.astype(np.uint8)

In [97]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [98]:
enc.fit(y[:,np.newaxis])

  """Entry point for launching an IPython kernel.


OneHotEncoder()

In [99]:
Y = enc.transform(y[:,np.newaxis]).toarray()

  """Entry point for launching an IPython kernel.


In [100]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [101]:
X_train = X_train / 255
X_test = X_test / 255

In [102]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [103]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [104]:
def compute_cost(X, T, W, l2=1):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) + 0.5*l2*(np.sum(W**2))
    return cost

In [105]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [110]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size, l2):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        #W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch) + l2*W) 
        cost_history[i] = compute_cost(X_batch, T_batch, W, l2)
        #if i % 1000 == 0:
        #    print(cost_history[i][0])

    return (cost_history, W)

In [107]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [112]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

l2s = np.logspace(-1, -5, num=10)

initial_cost = compute_cost(X, T, W)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))
max_score = 0
max_l2 = 0
for l2 in l2s:
    (cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, l2)
    X_ = np.hstack((np.ones((np.size(X_valid, 0),1)),X_valid))
    T_ = y_valid
    y_pred = predict(X_, W_optimal)
    score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_valid))
    print(f'l2 is {l2}, score is {score}')
    
    if max_score < score:
        max_score = score
        max_l2 = l2


Initial Cost is: 2.302485097993479 

l2 is 0.1, score is 0.9065833333333333
l2 is 0.03593813663804628, score is 0.9101666666666667
l2 is 0.01291549665014884, score is 0.91
l2 is 0.004641588833612782, score is 0.9114166666666667
l2 is 0.0016681005372000592, score is 0.9146666666666666
l2 is 0.0005994842503189409, score is 0.9110833333333334
l2 is 0.00021544346900318845, score is 0.91225
l2 is 7.742636826811278e-05, score is 0.9115
l2 is 2.782559402207126e-05, score is 0.9154166666666667
l2 is 1e-05, score is 0.9116666666666666


In [113]:
(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, max_l2)

## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

print(score)

0.9052
