# 모델 학습 원리 이해 ( 경사하강법 Gradient Descent )

In [19]:
import numpy as np

In [24]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise."""
    decay = np.exp(-x)
    denominator = 1 + decay
    return 1 / denominator


def relu(x: np.ndarray) -> np.ndarray:
    """Multiply ``x`` element-wise by the mask ``x > 0``.

    Entries that are not greater than zero are multiplied by False (0);
    entries greater than zero are multiplied by True (1).
    """
    positive_mask = x > 0
    return x * positive_mask


def leaky_relu(x: np.ndarray, alpha: float = 0.01) -> np.ndarray:
    """Return ``x`` where ``x > 0`` and ``x * alpha`` everywhere else."""
    is_positive = x > 0
    leaked = x * alpha
    return np.where(is_positive, x, leaked)


def post_processing(predictions: np.ndarray) -> np.ndarray:
    """Threshold predictions: values below 0.5 become 0, all others become 1."""
    below_threshold = predictions < 0.5
    return np.where(below_threshold, 0, 1)


def display_results(inputs: np.ndarray, predictions: np.ndarray) -> None:
    """Print a table pairing each input column with its thresholded prediction.

    ``inputs`` is read as ``inputs[0, col]`` and ``inputs[1, col]`` for every
    column; ``predictions`` is thresholded with ``post_processing`` and read
    as row 0 of the result.
    """
    labels = post_processing(predictions)
    print("Input (A, B) | Predicted Y ")
    print("----------------------------")
    column_count = inputs.shape[1]
    for col in range(column_count):
        first, second = inputs[0, col], inputs[1, col]
        label = labels[0, col]
        print(f"    {first}, {second} | {label}")


def initialize_parameters() -> dict[str, np.ndarray]:
    """Create the initial weights and biases for the 2-2-1 network.

    Weights are drawn with ``np.random.randn`` (W1 first, then W2) and
    biases start at zero.
    """
    hidden_weights = np.random.randn(2, 2)  # input layer (2) -> hidden layer (2)
    output_weights = np.random.randn(1, 2)  # hidden layer (2) -> output layer (1)
    hidden_bias = np.zeros((2, 1))
    output_bias = np.zeros((1, 1))
    return {
        "W1": hidden_weights,
        "b1": hidden_bias,
        "W2": output_weights,
        "b2": output_bias,
    }


def compute_loss(Y: np.ndarray, Y_hat: np.ndarray) -> np.ndarray:
    """Return the mean binary cross-entropy (BCE) between labels and predictions.

    Args:
        Y: Ground-truth labels (0 or 1), one value per sample, in any shape.
        Y_hat: Predicted probabilities holding the same number of values as Y.

    Returns:
        The BCE averaged over all samples, as a NumPy float scalar.

    Raises:
        ValueError: If Y and Y_hat hold a different number of values.
    """
    # Flatten both so that (m,), (1, m) and (m, 1) layouts are all averaged
    # over m samples and cannot broadcast against each other by accident.
    labels = np.asarray(Y, dtype=float).ravel()
    probs = np.asarray(Y_hat, dtype=float).ravel()
    if labels.size != probs.size:
        raise ValueError(
            f"Y has {labels.size} values but Y_hat has {probs.size}"
        )

    # Keep probabilities strictly inside (0, 1) so log() never sees 0.
    eps = 1e-12
    probs = np.clip(probs, eps, 1 - eps)

    m = labels.size
    loss = -np.sum(labels * np.log(probs) + (1 - labels) * np.log(1 - probs)) / m
    return loss


def forward_propagation(
        X: np.ndarray,
        parameters: dict[str, np.ndarray],
) -> tuple[np.ndarray, np.ndarray]:
    """Run the forward pass and return both layer activations.

    Args:
        X: Input array; it is left-multiplied by ``parameters["W1"]``.
        parameters: Mapping with the keys "W1", "b1", "W2" and "b2".

    Returns:
        ``(A1, A2)``: the ReLU activation of the hidden layer and the
        sigmoid activation of the output layer.
    """
    # Pull out weights and biases layer by layer.
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]

    # Input layer -> hidden layer.
    hidden_pre_activation = np.dot(W1, X) + b1
    A1 = relu(hidden_pre_activation)

    # Hidden layer -> output layer.
    output_pre_activation = np.dot(W2, A1) + b2
    A2 = sigmoid(output_pre_activation)

    return A1, A2


def backword_propagation(
        parameters: dict[str, np.ndarray],
        A1: np.ndarray,
        A2: np.ndarray,
        X: np.ndarray,
        Y: np.ndarray,
) -> dict[str, np.ndarray]:
    """Compute gradients of the mean binary cross-entropy loss.

    Assumes the network shape used by the forward pass in this file: a ReLU
    hidden layer followed by a sigmoid output.

    Args:
        parameters: Mapping that provides at least "W2".
        A1: Hidden-layer activations (output of ReLU).
        A2: Output-layer activations (output of sigmoid).
        X: Input array; ``X.shape[1]`` is used as the number of samples.
        Y: Target labels, holding the same number of values as A2.

    Returns:
        A dict with the keys "dW1", "db1", "dW2" and "db2".

    Raises:
        ValueError: If Y cannot be reshaped to A2's shape.
    """
    m = X.shape[1]
    W2 = parameters["W2"]

    # Align the labels with the predictions so that a (m,) or (m, 1) label
    # array cannot broadcast against the (1, m) predictions by accident.
    Y = np.reshape(Y, A2.shape)

    # For a sigmoid output trained with binary cross-entropy the sigmoid
    # derivative cancels, leaving dL/dZ2 = A2 - Y. Multiplying by
    # A2 * (1 - A2) would be the squared-error gradient instead.
    dZ2 = A2 - Y
    dW2 = np.dot(dZ2, A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m
    # ReLU derivative: the gradient only flows through units that were active.
    dZ1 = np.dot(W2.T, dZ2) * (A1 > 0)
    dW1 = np.dot(dZ1, X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    gradients = {
        "dW1": dW1,
        "db1": db1,
        "dW2": dW2,
        "db2": db2,
    }
    return gradients


def update_parameters(
        parameters: dict[str, np.ndarray],
        grads: dict[str, np.ndarray],
        learning_rate: float = 0.01,
) -> dict[str, np.ndarray]:
    """Take one gradient-descent step on every parameter, in place.

    Each array in ``parameters`` is decreased by ``learning_rate`` times the
    gradient stored under the same name prefixed with "d". The same dict
    object is returned.
    """
    for name in ("W1", "b1", "W2", "b2"):
        parameters[name] -= learning_rate * grads["d" + name]

    return parameters


# XOR truth table: after the transpose, each column of `inputs` is one (A, B) pair.
inputs = np.array([[0,0], [0,1], [1,0], [1,1]]).T
# Expected XOR result for each input column, in the same order.
outputs = np.array([0,1,1,0])

# Draw fresh random weights and look at what the untrained network predicts.
parameters = initialize_parameters()
predictions = forward_propagation(inputs, parameters)[1]

display_results(inputs, predictions)



Input (A, B) | Predicted Y 
----------------------------
    0, 0 | 1
    0, 1 | 0
    1, 0 | 1
    1, 1 | 1


In [26]:
# Full-batch gradient descent on the four samples.
num_iterations = 200000
log_interval = 10000

for i in range(num_iterations):
    # One step: forward pass, gradients, then the parameter update.
    A1, A2 = forward_propagation(inputs, parameters)
    grads = backword_propagation(parameters, A1, A2, inputs, outputs)
    parameters = update_parameters(parameters, grads)
    # Loss of the predictions that were made before this step's update.
    loss = compute_loss(outputs, A2)

    # Report periodically, skipping the very first iteration.
    should_report = i > 0 and i % log_interval == 0
    if should_report:
        print(f"{i=}, {loss=}")

# Evaluate the network with the final parameters on the same inputs.
predicted_outputs = forward_propagation(inputs, parameters)[1]
print(predicted_outputs)
display_results(inputs, predicted_outputs)

i=10000, loss=0.6931471805599453
i=20000, loss=0.6931471805599453
i=30000, loss=0.6931471805599453
i=40000, loss=0.6931471805599453
i=50000, loss=0.6931471805599453
i=60000, loss=0.6931471805599453
i=70000, loss=0.6931471805599453
i=80000, loss=0.6931471805599453
i=90000, loss=0.6931471805599453
i=100000, loss=0.6931471805599453
i=110000, loss=0.6931471805599453
i=120000, loss=0.6931471805599453
i=130000, loss=0.6931471805599453
i=140000, loss=0.6931471805599453
i=150000, loss=0.6931471805599453
i=160000, loss=0.6931471805599453
i=170000, loss=0.6931471805599453
i=180000, loss=0.6931471805599453
i=190000, loss=0.6931471805599453
[[0.5 0.5 0.5 0.5]]
Input (A, B) | Predicted Y 
----------------------------
    0, 0 | 0
    0, 1 | 1
    1, 0 | 1
    1, 1 | 1


In [27]:
def forward_propagation_for_debuging(
        X: np.ndarray,
        parameters: dict[str, np.ndarray],
) -> tuple[np.ndarray, np.ndarray]:
    """Run the forward pass while printing every input and intermediate value.

    Performs the same computation as the plain forward pass (ReLU hidden
    layer, sigmoid output) and returns ``(A1, A2)``. The local names are
    part of the printed output because of the ``{name=}`` f-string form.
    """
    # Pull out weights and biases layer by layer.
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]

    # Dump the input and all parameters before computing anything.
    print(f"{X=}")
    print(f"{W1=}")
    print(f"{b1=}")
    print(f"{W2=}")
    print(f"{b2=}")

    # Input layer -> hidden layer.
    Z1 = np.dot(W1, X) + b1
    print(f"{Z1=}")
    A1 = relu(Z1)
    print(f"{A1=}")

    # Hidden layer -> output layer.
    Z2 = np.dot(W2, A1) + b2
    print(f"{Z2=}")
    A2 = sigmoid(Z2)
    print(f"{A2=}")

    return A1, A2

In [28]:
# Re-run the forward pass with verbose printing to inspect the current
# `parameters` and every intermediate value.
predicted_outputs = forward_propagation_for_debuging(inputs, parameters)[1]
# Bare expression: shown as the notebook cell's result.
predicted_outputs

X=array([[0, 0, 1, 1],
       [0, 1, 0, 1]])
W1=array([[0.08204067, 1.27191402],
       [0.18302969, 0.29126595]])
b1=array([[0.00330639],
       [0.06475702]])
W2=array([[-5.49039980e-16,  2.51708376e-15]])
b2=array([[-5.27355937e-16]])
Z1=array([[0.00330639, 1.27522041, 0.08534707, 1.35726108],
       [0.06475702, 0.35602297, 0.24778671, 0.53905266]])
A1=array([[0.00330639, 1.27522041, 0.08534707, 1.35726108],
       [0.06475702, 0.35602297, 0.24778671, 0.53905266]])
Z2=array([[-3.66172432e-16, -3.31363292e-16,  4.94850199e-17,
         8.42941592e-17]])
A2=array([[0.5, 0.5, 0.5, 0.5]])


array([[0.5, 0.5, 0.5, 0.5]])