<a href="https://colab.research.google.com/github/Limm-jk/2020_Fall_MachineLearning/blob/master/Mnist_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from mlxtend.data import loadlocal_mnist

X, y = loadlocal_mnist(images_path='/content/drive/My Drive/Colab Notebooks/train-images.idx3-ubyte', labels_path='/content/drive/My Drive/Colab Notebooks/train-labels.idx1-ubyte')
y.reshape(60000,1)
print(X.shape)
print(y.shape)

(60000, 784)
(60000,)


In [2]:
# 타입확인
print(type(X[1,1]))

<class 'numpy.uint8'>


In [3]:
# Normalize
X = X/255
print(type(X[1,1]))
print(X[1,400:405])

<class 'numpy.float64'>
[0.99215686 0.74509804 0.         0.         0.        ]


In [4]:
# 조건에 맞게 필터링
y_f = []
f_index = []
for j in range(len(y)):
  i = y[j]
  if i == 1 or i == 5 or i == 8:
    y_f.append(i)
    f_index.append(j)
X_f = []
for j in f_index:
  X_f.append(X[j])

print(len(X_f))
# print(X_f[:6])

18014


In [5]:
X = np.array(X_f)
y = np.array(y_f)

In [6]:
print(X.shape)
print(y.shape)

(18014, 784)
(18014,)


In [8]:
# One-Hot Encoding
# y = y.reshape(18014,1)
y_z = np.zeros((18014,10))

for i in range(18014):
  y_z[i,y[i]] = 1
y = y_z
print(y.shape)
print(y[:3])

(18014, 10)
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [9]:
X_train = X.T[:,:10808]
X_test = X.T[:,10808:]
Y_train = y.T[:,:10808]
Y_test = y.T[:,10808:]
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test)

(784, 10808)
(784, 7206)
(10, 10808)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 1. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
def layer_sizes(X, y):
    n_x = X.shape[0] # 입력층 사이즈
    n_h = 300 # 히든레이어 그냥 300으로 해봄
    n_y = y.shape[0] # 출력 사이즈
    return (n_x, n_h, n_y)
print(layer_sizes(X,y))

(18014, 300, 18014)


In [11]:
def initialize_parameters(n_x, n_h, n_y):
    np.random.seed(1)
    # 작은 수로 할 때 시간적으로, 성능적으로 이득
    W1 = np.random.randn(n_h,n_x)*0.01
    b1 = np.zeros((n_h,1))
    W2 = np.random.randn(n_y,n_h)*0.01
    b2 = np.zeros((n_y,1))
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters

In [12]:
def linear_forward(A, W, b):
    Z = np.dot(W,A)+b
    # 이후 역전파를 위해 저장
    cache = (A, W, b)
    return Z, cache

In [13]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [14]:
def linear_activation_forward(A_prev, W, b, activation):
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        activation_cache = Z
        A = sigmoid(Z)

    # ReLU의 prime함수를 못만들겠어요...

    # elif activation == "relu":
    #     Z, linear_cache = linear_forward(A_prev, W, b)
    #     activation_cache = Z
    #     A  = np.maximum(0,Z)
    
    # 역전파를 위하여 저장
    cache = (linear_cache, activation_cache)

    return A, cache

In [15]:
# 선형 역전파 / 활성함수 역전파 후 사용 
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = (1/m)*np.dot(dZ, A_prev.T)
    db = (1/m)*np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

In [16]:
def sigmoid_backward(dA, activation_cache):
    activation_prame = (sigmoid(activation_cache)*(1-sigmoid(activation_cache)))
    # 시그모이드의 프라임은 sigmoid(1-sigmoid)
    return dA*activation_prame

In [17]:
def linear_activation_backward(dA, cache, activation):
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        # ReLU의 prime함수를 못만들겠어요...
        # dZ = relu_backward(dA, activation_cache)
        # dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db

In [18]:
def compute_cost(AL, Y):
    
    m = Y.shape[1]

    cost = (-1/m)*np.sum((Y*np.log(AL))+(1-Y)*np.log(1-AL))
    
    cost = np.squeeze(cost)    
    
    return cost

In [20]:
def update_parameters(parameters, grads, learning_rate):
    
    L = len(parameters) // 2 

    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*grads["db" + str(l+1)]
    return parameters

In [24]:
def two_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    
    np.random.seed(1)
    grads = {}
    costs = []                  
    m = X.shape[1]                     
    (n_x, n_h, n_y) = layers_dims
    
    parameters = initialize_parameters(n_x, n_h, n_y)
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    for i in range(0, num_iterations):
        A1, cache1 = linear_activation_forward(X, W1, b1, 'sigmoid')
        A2, cache2 = linear_activation_forward(A1, W2, b2, 'sigmoid')

        cost = compute_cost(A2, Y)

        dA2 = - (np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
        
        dA1, dW2, db2 = linear_activation_backward(dA2, cache2, 'sigmoid')
        dA0, dW1, db1 = linear_activation_backward(dA1, cache1, 'sigmoid')

        grads['dW1'] = dW1
        grads['db1'] = db1
        grads['dW2'] = dW2
        grads['db2'] = db2
        
        parameters = update_parameters(parameters, grads, learning_rate)

        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        if (i+1)%100 == 0:
          print(str(i+1)+"번째 학습 완료")
        if print_cost and (i+1)%500 == 0:
          print(str(i+1)+"번째 코스트 : "+str(cost))
          print("W1 : "+str(parameters["W1"]))
          print("b1 : "+str(parameters["b1"][:5]))
          print("W2 : "+str(parameters["W2"]))
          print("b2 : "+str(parameters["b2"][:5]))
    return parameters

In [25]:
parameters = two_layer_model(X_train, Y_train, layers_dims = layer_sizes(X_train, Y_train), num_iterations = 5000, print_cost=True)

100번째 학습 완료
200번째 학습 완료
300번째 학습 완료
400번째 학습 완료
500번째 학습 완료
500번째 코스트 : 1.8674377746030457
W1 : [[ 1.62434536e-02 -6.11756414e-03 -5.28171752e-03 ... -1.10657307e-02
  -3.59224096e-03  5.05381903e-03]
 [ 1.21794090e-02 -1.94068096e-02 -8.06178212e-03 ...  2.07229946e-03
  -1.43403073e-02  6.26906306e-03]
 [ 2.99825202e-03 -1.85664142e-02 -2.15104316e-02 ... -1.95419182e-02
   1.86223746e-03 -3.82994307e-05]
 ...
 [-1.62938256e-03 -7.73707322e-03  6.73184510e-03 ...  7.57211751e-03
  -2.95902222e-04 -1.68317819e-02]
 [ 2.68693851e-03  1.82752831e-02 -2.13568366e-03 ... -7.51133062e-03
  -1.55644795e-03 -9.82549221e-03]
 [-9.27244633e-03  1.48092056e-03  8.25829428e-03 ... -3.89627549e-03
   8.06949100e-03 -1.96658544e-02]]
b1 : [[0.00212782]
 [0.00216024]
 [0.00249707]
 [0.00193762]
 [0.00252317]]
W2 : [[-0.0306399  -0.0271863  -0.03529309 ... -0.03893391 -0.05066794
  -0.02766551]
 [-0.03895999  0.007696   -0.02629852 ... -0.00070647  0.02187259
   0.01228405]
 [-0.02581005 -0.02429906

In [33]:
_X = X_test
_W1 = parameters["W1"]
_b1 = parameters["b1"]
_W2 = parameters["W2"]
_b2 = parameters["b2"]
_A1, _cache1 = linear_activation_forward(_X, _W1, _b1, 'sigmoid')
_A2, _cache2 = linear_activation_forward(_A1, _W2, _b2, 'sigmoid')
# print(_A2.shape)
count = 0
for i in range(len(Y_test[1])):
  if Y_test[:,i].argmax() == _A2[:,i].argmax():
    count += 1
print("Test Set의 정확도는 "+str((count/len(Y_test[1]))*100)+"% 입니다.")

Test Set의 정확도는 93.65806272550653% 입니다.
6749


In [35]:
X = X_train
W1 = parameters["W1"]
b1 = parameters["b1"]
W2 = parameters["W2"]
b2 = parameters["b2"]
A1, cache1 = linear_activation_forward(X, W1, b1, 'sigmoid')
A2, cache2 = linear_activation_forward(A1, W2, b2, 'sigmoid')
count = 0
for i in range(len(Y_train[1])):
  if Y_train[:,i].argmax() == A2[:,i].argmax():
    count += 1
print("Train Set의 정확도는 "+str((count/len(Y_train[1]))*100)+"% 입니다.")

Train Set의 정확도는 93.8101406365655% 입니다.
