In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()


dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [6]:
enc.fit(y[:,np.newaxis])

OneHotEncoder()

In [7]:
Y = enc.transform(y[:,np.newaxis]).toarray()

In [8]:
# hold out validation
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [9]:
X_train = X_train / 255
X_test = X_test / 255

In [10]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [11]:
def softmax(X, W):
    K = np.size(W, axis=1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [12]:
# understaning softmax function
X = np.array([
    [1, 1, 1], 
    [2, 2, 2], 
    [3, 3, 3]
])
W = np.array([
    [0.5, 1, 2, 5],
    [1, 2, 3, 4], 
    [0.3, 1, 2, 3]
])

np.around(softmax(X, W), 2)

array([[0.  , 0.  , 0.01, 0.99],
       [0.  , 0.  , 0.  , 1.  ],
       [0.  , 0.  , 0.  , 1.  ]])

In [13]:
#####

In [14]:
def compute_cost(X, T, W, lamb):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    # cost1: the sum of the all elements
    cost1 = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1))
    cost2 = 1/(2*N) * (lamb * np.linalg.norm(W, ord='fro'))  ## add
    cost = cost1+cost2
    return cost

In [15]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [16]:
def batch_gd(X, T, W, lamb, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch) + lamb * W)
#         cost_history[i] = compute_cost(X_batch, T_batch, W, lamb)
#         if i % 1000 == 0:
#             print(cost_history[i][0])

    return W        #  (cost_history, W)

In [17]:
def fit_predict_by_lambda(lamb):
    X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
    T = y_train

    K = np.size(T, 1)
    M = np.size(X, 1)
    W = np.zeros((M,K))

    iterations = 50000
    learning_rate = 0.01

    # initial_cost = compute_cost(X, T, W, lamb)

    # print("Initial Cost is: {} \n".format(initial_cost[0][0]))

    W_optimal = batch_gd(X, T, W, lamb, learning_rate, iterations, 64)
    
    ## Accuracy
    X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
    T_ = y_test
    y_pred = predict(X_, W_optimal)
    val_acc = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

    print('-' * 30)
    print('lambda:', lamb)
    print('val_acc:', val_acc)
    print('-' * 30)
    return val_acc, lamb

In [18]:
lambs = np.random.uniform(low=0, high=1, size=50)
best_acc = 0
best_lamb = 0

for lamb in lambs:
    tem_acc, tem_lamb = fit_predict_by_lambda(lamb)
    if tem_acc > best_acc:
        best_lamb = tem_lamb
        best_acc = tem_acc

------------------------------
lambda: 0.7684596415095928
val_acc: 0.89
------------------------------
------------------------------
lambda: 0.7202291450471682
val_acc: 0.8858
------------------------------
------------------------------
lambda: 0.6301098410615177
val_acc: 0.8917
------------------------------
------------------------------
lambda: 0.8054972548023436
val_acc: 0.8819
------------------------------
------------------------------
lambda: 0.3315271046160003
val_acc: 0.9058
------------------------------
------------------------------
lambda: 0.5054354245752756
val_acc: 0.8961
------------------------------
------------------------------
lambda: 0.45167305857787643
val_acc: 0.8975
------------------------------
------------------------------
lambda: 0.785832210111468
val_acc: 0.8992
------------------------------
------------------------------
lambda: 0.651244758360795
val_acc: 0.9008
------------------------------
------------------------------
lambda: 0.9730105394584231


In [19]:
print('-' * 40)
print('best lambda:', best_lamb)
print('best accurcy:', best_acc)
print('-' * 40)

----------------------------------------
best lambda: 0.0007938857090212803
best accurcy: 0.9174
----------------------------------------
