### 문제

- L2 regularization을 비용함수(compute_cost 내에)에 포함시키고 gradient 계산에(batch_gd 내에) 반영하세요.

- Regularization을 위한 가중치 lambda를 튜닝해보세요. 이것을 위해서 학습데이터의 일부를 validation data로 따로 구분하고 이 validation data에 대한 accuracy를 최적화하는 lambda를 찾도록 하는 코드를 구현해보세요.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [6]:
enc.fit(y[:,np.newaxis])

  enc.fit(y[:,np.newaxis])


OneHotEncoder()

In [7]:
Y = enc.transform(y[:,np.newaxis]).toarray()

  Y = enc.transform(y[:,np.newaxis]).toarray()


In [8]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [9]:
X_train = X_train / 255
X_test = X_test / 255

In [10]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [11]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [21]:
def compute_cost(X, T, W, L=0):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1))
    l2_reg = L * np.sum(np.square(W)) / (2 * N)  # L2 regularization
    return cost + l2_reg

In [22]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [24]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size, L=0, random_state=None, output=True):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    if random_state is not None:
        np.random.seed(random_state)
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    #batch gradient descent
    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        # W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        W = W * (1 - learning_rate * L / batch_size) \
                - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch)) # 동시에 모든 w가 업데이트된다.
        cost_history[i] = compute_cost(X_batch, T_batch, W, L)
        if output and i % 1000 == 0:
            print(cost_history[i][0])

    return (cost_history, W)

In [25]:
# 규제화 전
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

initial_cost = compute_cost(X, T, W)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)

Initial Cost is: 2.3024850979937352 

2.266748970882968
0.5400978467531193
0.49350895972261355
0.6335910549286129
0.5179386903190486
0.39343459293109573
0.27158959072596006
0.34270031343106944
0.3392035463570107
0.20529268551273083
0.4437027700265992
0.29894261067380845
0.40433897856135026
0.19009442209743127
0.20566270247645058
0.2740145117348238
0.31500076889654005
0.31185340483735846
0.3623704128575332
0.2544591524695475
0.2485276643579266
0.18499519326938618
0.18137318227847146
0.30263807229628026
0.195708340245924
0.20684270932558046
0.27570650053396606
0.2159668002074422
0.30631827256073485
0.19851437911668196
0.21886011688795737
0.17155665320261954
0.3121909882427808
0.27075991900651103
0.2692022032336735
0.4534745516145777
0.15715881610349342
0.4883679996264292
0.3759676611219625
0.299828403462353
0.18024565596879205
0.26687640991605915
0.26408092228484176
0.3896181832296417
0.21708487574307514
0.2059124875219347
0.3743579788422275
0.11082518020745857
0.14860297843744769
0.1572

In [28]:
# 규제화
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01
L = 0.01

initial_cost = compute_cost(X, T, W, L)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history_with_reg, W_reg_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, L, random_state=210613)

Initial Cost is: 2.3024850979937352 

2.281596456462674
0.45036031171500956
0.36421365194549976
0.45171165106043915
0.3796137362628895
0.5665716677800248
0.40514578721139827
0.44531476824016913
0.2415916442078665
0.3163976990316153
0.20979593966929683
0.18360640740087217
0.346179196019251
0.08094083981645207
0.23250496316265254
0.264800530807613
0.4316783702506057
0.29172080452456545
0.4710830201259775
0.14807698092735447
0.36935958323104895
0.5040785906007762
0.409769368401717
0.2671081658786581
0.35456603712789153
0.12317469962365907
0.2371393397282384
0.40803480743724563
0.20909513086710643
0.1720757437102052
0.2512778325244281
0.16947396928572217
0.30270770732859853
0.2883264480423924
0.1982483400832251
0.4467052632141238
0.26591797168399206
0.13453931380247425
0.41467851604277006
0.31235410058681606
0.16491264792151467
0.3037857601653567
0.2456135098477715
0.3367021448088752
0.3950400462809416
0.2527336384044906
0.3201201464768421
0.17783799868556752
0.19666759147727966
0.28480863

In [38]:
def accuracy_score(y_targ, y_pred):
    return (y_targ == y_pred).sum()/ len(y_targ)

In [40]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
y_reg_pred = predict(X_, W_reg_optimal)
y_targ = np.argmax(T_, axis=1)

# score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))
score1 = accuray_score(y_targ, y_pred)
score2 = accuray_score(y_targ, y_reg_pred)

print(score1, score2)

0.9174 0.9155


In [41]:
def train_val_split(X, Y, valid_data_ratio=0.2, random_state=None):
    if random_state is not None: # random_state가 변경되지 않는다면 같은 난수발생시키기 위함
        np.random.seed(random_state)
    
    N = len(X)
    val_size = int(N * valid_data_ratio)
    train_size = N - val_size
    
    train_idx = np.random.choice(N, train_size, replace=False)
    val_idx = np.setdiff1d(np.arange(N), train_idx)
    
    return X[train_idx], X[val_idx], Y[train_idx], Y[val_idx]

In [44]:
# 최적화 lambda 찾기
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

X_train2, X_val, y_train2, y_val = train_val_split(X, T, random_state=210613)
y_targ = np.argmax(y_val, axis=1)

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M, K))

iterations = 5000
learning_rate = 0.01

lambdas = np.logspace(-4, -1, 20)
lambdas = np.insert(lambdas, 0, 0)
result = []

for L in lambdas:
    _, W_opt = batch_gd(X_train2, y_train2, W, learning_rate, iterations, 64, L, random_state=210613, output=False)
    
    y_pred = predict(X_val, W_opt)
    score = accuracy_score(y_targ, y_pred)
    result.append([L, score])
    print(f"lambda = {L}, accuracy = {score}")

max_lambda, max_score = max(result, key=lambda x: x[1])

print()
print(f"{max_lambda = }, {max_score = }")  # python >= 3.8

lambda = 0.0, accuracy = 0.8718333333333333
lambda = 0.0001, accuracy = 0.8718333333333333
lambda = 0.0001438449888287663, accuracy = 0.8718333333333333
lambda = 0.00020691380811147902, accuracy = 0.8718333333333333
lambda = 0.00029763514416313193, accuracy = 0.8718333333333333
lambda = 0.00042813323987193956, accuracy = 0.8718333333333333
lambda = 0.0006158482110660267, accuracy = 0.8718333333333333
lambda = 0.0008858667904100823, accuracy = 0.8718333333333333
lambda = 0.0012742749857031334, accuracy = 0.87175
lambda = 0.0018329807108324356, accuracy = 0.87175
lambda = 0.0026366508987303583, accuracy = 0.87175
lambda = 0.00379269019073225, accuracy = 0.87175
lambda = 0.005455594781168515, accuracy = 0.8718333333333333
lambda = 0.007847599703514606, accuracy = 0.8718333333333333
lambda = 0.011288378916846883, accuracy = 0.8718333333333333
lambda = 0.01623776739188721, accuracy = 0.87175
lambda = 0.023357214690901212, accuracy = 0.87175
lambda = 0.03359818286283781, accuracy = 0.8716666