In [1]:
#### DO NOT MODIFY THIS CELL ####
#################################

import os, gzip, urllib.request, warnings
import numpy as np

# The four gzipped IDX archives that make up MNIST, keyed by file name.
# Each value names the array that archive is loaded into.
FILES = dict([
    ("train-images-idx3-ubyte.gz", "X_train"),
    ("train-labels-idx1-ubyte.gz", "y_train"),
    ("t10k-images-idx3-ubyte.gz", "X_test"),
    ("t10k-labels-idx1-ubyte.gz", "y_test"),
])

# Base URLs that host the archives; a file name is appended to form the full URL.
MIRRORS = [
    "https://storage.googleapis.com/cvdf-datasets/mnist/",
    "https://yann.lecun.com/exdb/mnist/",
]

def download_mnist():
    """
    Download every archive named in FILES into the current working directory.

    Files that already exist on disk are skipped. For each missing file the
    MIRRORS are tried in order and the first successful download wins.

    Raises:
        RuntimeError: if a file could not be fetched from any mirror.
    """
    for fname in FILES:
        if os.path.exists(fname):
            continue
        for base in MIRRORS:
            url = base + fname
            try:
                print(f"Downloading {fname} from {url} ...")
                urllib.request.urlretrieve(url, fname)
                break
            except OSError as err:
                # HTTPError, URLError, ContentTooShortError and socket timeouts
                # are all OSError subclasses, so any network failure moves on
                # to the next mirror instead of aborting the whole download.
                warnings.warn(f"{url} failed ({err}) – trying next mirror…")
                # Drop a partially written file; otherwise the os.path.exists
                # check above would treat it as a complete download next time.
                if os.path.exists(fname):
                    os.remove(fname)
        else:
            # for/else: reached only when no mirror hit `break`.
            raise RuntimeError(f"{fname} could not be downloaded from any mirror.")


def load_idx(filename, offset, shape):
    """
    Read a gzip-compressed file and return its payload as a uint8 array.

    filename: path of the .gz file to read.
    offset: number of leading bytes (after decompression) to skip.
    shape: shape passed to reshape; may contain -1.
    Returns: NumPy uint8 array of the requested shape.
    """
    with gzip.open(filename, "rb") as archive:
        raw_bytes = archive.read()
    # Interpret everything after the skipped prefix as unsigned bytes.
    flat = np.frombuffer(raw_bytes, dtype=np.uint8, offset=offset)
    return flat.reshape(shape)


# Load MNIST as raw 0–255 pixel values. The IDX archives are preferred;
# openml is used only if anything in the primary path fails.
try:
    download_mnist()
    # Flatten each 28×28 image into a 784-dimensional vector.
    # The offsets skip the IDX headers (16 bytes for images, 8 for labels).
    X_train = load_idx("train-images-idx3-ubyte.gz", 16, (-1, 28*28))
    y_train = load_idx("train-labels-idx1-ubyte.gz", 8,  (-1,))
    X_test  = load_idx("t10k-images-idx3-ubyte.gz",  16, (-1, 28*28))
    y_test  = load_idx("t10k-labels-idx1-ubyte.gz",  8,  (-1,))
except Exception as e:
    warnings.warn(f"MNIST download failed ({e}). Falling back to openml…")
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml("mnist_784", version=1, as_frame=False)
    # Keep the raw pixel values here: scaling to [0, 1] happens exactly once
    # below for both code paths. Dividing here as well would scale the
    # fallback data by 1/255 twice.
    X = mnist["data"]
    y = mnist["target"].astype(np.int8)
    X_train, X_test = X[:60000], X[60000:]
    y_train, y_test = y[:60000], y[60000:]

# Scale pixels to [0, 1] as float32 (shared by both loading paths).
X_train = X_train.astype(np.float32) / 255.0
X_test  = X_test.astype(np.float32) / 255.0

# One-hot encode the labels by picking rows of the identity matrix.
num_classes = 10
Y_train = np.eye(num_classes)[y_train]
Y_test  = np.eye(num_classes)[y_test]

print("Training set:", X_train.shape, Y_train.shape)
print("Test set    :", X_test.shape,  Y_test.shape)


Downloading train-images-idx3-ubyte.gz from https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz ...
Downloading train-labels-idx1-ubyte.gz from https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz ...
Downloading t10k-images-idx3-ubyte.gz from https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz ...
Downloading t10k-labels-idx1-ubyte.gz from https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz ...
Training set: (60000, 784) (60000, 10)
Test set    : (10000, 784) (10000, 10)


In [2]:
def initialize_parameters(input_dim, hidden_dim, output_dim, scale=0.01, seed=None):
    """
    Question (a)

    Initialize the neural network's parameters:
    - W1: weight matrix of shape (input_dim, hidden_dim)
    - b1: bias vector of shape (hidden_dim,)
    - W2: weight matrix of shape (hidden_dim, output_dim)
    - b2: bias vector of shape (output_dim,)

    scale: factor applied to the standard-normal weight draws (default 0.01).
    seed: optional seed for reproducible weights. When None (the default) the
          global NumPy random state is used, exactly as before, so an earlier
          np.random.seed(...) call is still honoured.
    Returns a dictionary containing W1, b1, W2, b2.
    """
    ##### YOUR CODE #####

    if seed is None:
        # Legacy global RNG: keeps the default behaviour unchanged.
        draw = np.random.randn
    else:
        # Private generator: reproducible without touching global state.
        rng = np.random.default_rng(seed)
        draw = lambda *shape: rng.standard_normal(shape)

    # W1 is drawn before W2 so the random stream is consumed in a fixed order.
    W1 = draw(input_dim, hidden_dim) * scale
    b1 = np.zeros(hidden_dim)
    W2 = draw(hidden_dim, output_dim) * scale
    b2 = np.zeros(output_dim)

    #####################
    # Tip: When initializing W, use np.random.randn for random initialization and multiply by a small factor (e.g., 0.01)

    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    return parameters


In [3]:
def softmax(logits):
    """
    Question (b)

    Compute the softmax along the last axis of the input array.
    logits: NumPy array of shape (N, K) or (K,) representing raw scores.
            Arrays with more leading dimensions are handled the same way:
            every slice along the last axis is normalized independently.
            The array must have at least one dimension.
    Returns: NumPy array of same shape, with softmax probabilities.
    """
    ##### YOUR CODE #####

    # Working on axis=-1 with keepdims covers the 1-D and 2-D cases with one
    # code path, so no input rank can leave the result undefined.
    # Subtracting the max leaves the result unchanged (softmax is
    # shift-invariant) while keeping np.exp from overflowing.
    z = logits - np.max(logits, axis=-1, keepdims=True)
    exp_z = np.exp(z)
    probs = exp_z / np.sum(exp_z, axis=-1, keepdims=True)

    ####################
    # Tip: for numeric stability, subtract the max logit from each logit before exponentiating.

    return probs


In [4]:
def forward(X, params):
    """
    Question (c)

    Run the network forward: affine -> ReLU -> affine -> softmax.
    X: input batch of shape (N, D)
    params: dictionary containing W1, b1, W2, b2
    Returns:
      Y_hat: softmax output probabilities (shape (N, K))
      cache: dictionary holding X, z1, a1, z2 and Y_hat for use in backprop.
    """
    W1, b1, W2, b2 = (params[key] for key in ("W1", "b1", "W2", "b2"))

    ##### YOUR CODE #####

    # Hidden layer: pre-activation, then ReLU.
    hidden_pre = X @ W1 + b1
    hidden_act = np.maximum(0, hidden_pre)

    # Output layer: class scores turned into probabilities.
    scores = hidden_act @ W2 + b2
    probabilities = softmax(scores)

    ####################

    # Keep the intermediates the backward pass needs.
    cache = dict(X=X, z1=hidden_pre, a1=hidden_act, z2=scores, Y_hat=probabilities)
    return probabilities, cache


In [5]:
def compute_loss(Y_hat, Y_true, eps = 1e-15):
    """
    Question (d)

    Compute the average cross-entropy loss.
    Y_hat: predicted probabilities, shape (N, K)
    Y_true: true labels (one-hot vectors or class indices), shape (N, K) or (N,)
    eps: small constant added to the probabilities before the log to avoid log(0).
    Returns: scalar loss (average cross-entropy over N examples).
    Raises: ValueError if Y_true is neither 1-D nor 2-D.
    """

    # Tip: if Y_true is one-hot, multiply element-wise with log(Y_hat) and sum.
    # If Y_true is label indices, use those to index into Y_hat.
    # Add a small value (e.g., 1e-15) to Y_hat before taking log to avoid log(0).

    ##### YOUR CODE #####
    n = Y_hat.shape[0]

    if Y_true.ndim == 2:
        # One-hot targets: the element-wise product keeps only the log-prob
        # of the true class in each row.
        true_log_probs = np.sum(Y_true * np.log(Y_hat + eps), axis=1)
    elif Y_true.ndim == 1:
        # Index targets: pick the true-class probability of each row directly.
        true_log_probs = np.log(Y_hat[np.arange(n), Y_true] + eps)
    else:
        # Fail with a clear message rather than falling through with no
        # loss value defined.
        raise ValueError(
            f"Y_true must be 1-D class indices or 2-D one-hot vectors, got ndim={Y_true.ndim}"
        )

    loss = -np.mean(true_log_probs)

    #####################

    return loss

In [6]:
def backward(X, Y_true, params, cache):
    """
    Question (e)

    Perform the backward pass to compute gradients.
    X: input batch, shape (N, D)
    Y_true: true labels (one-hot vectors or class indices), shape (N, K) or (N,)
    params: dictionary of parameters (W1, b1, W2, b2); only W2 is read here
    cache: dictionary of intermediate values from forward pass (z1, a1, Y_hat are read)
    Returns: dictionary of gradients dW1, db1, dW2, db2.
    """
    W2 = params["W2"]
    z1, a1, Y_hat = cache["z1"], cache["a1"], cache["Y_hat"]
    N = X.shape[0]

    if Y_true.ndim == 1:  # convert class indices to one-hot rows
        Y_true_onehot = np.zeros_like(Y_hat)
        Y_true_onehot[np.arange(N), Y_true] = 1
    else:
        Y_true_onehot = Y_true

    ##### YOUR CODE #####
    # Gradient of the mean cross-entropy w.r.t. the logits; the 1/N factor is
    # applied here so every downstream gradient is already batch-averaged.
    dZ2 = (Y_hat - Y_true_onehot) / N

    dW2 = a1.T @ dZ2
    db2 = np.sum(dZ2, axis=0)

    # Back-propagate into the hidden layer. The matmul result is a fresh
    # array, so the ReLU mask can be applied in place without a copy.
    dZ1 = dZ2 @ W2.T
    dZ1[z1 <= 0] = 0  # ReLU passes no gradient where its input was <= 0

    dW1 = X.T @ dZ1
    db1 = np.sum(dZ1, axis=0)

    #####################
    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
    return grads

In [7]:
"""
Question (f)
"""

# ---------------- Hyper-parameters ----------------
# Feel free to add or change any hyper-parameters you need.

learning_rate = 0.03   # SGD step size
num_epochs    = 30     # full passes over the training set
hidden_dim    = 64     # number of hidden units

batch_size = 128       # examples per mini-batch
# ---------------------------------------------------

# Initialize parameters
params = initialize_parameters(X_train.shape[1], hidden_dim, 10)

N = X_train.shape[0]   # 60 000 examples

for epoch in range(1, num_epochs + 1):

    ##### YOUR CODE #####
    # Draw a fresh ordering every epoch so the mini-batches differ between passes.
    idx = np.random.permutation(N)
    X_train_shuffled, Y_train_shuffled = X_train[idx], Y_train[idx]

    # Running sum of per-example losses; turned into a mean after the pass.
    epoch_loss = 0

    for start in range(0, N, batch_size):
        stop = start + batch_size
        X_batch = X_train_shuffled[start:stop]
        Y_batch = Y_train_shuffled[start:stop]

        Y_hat, cache = forward(X_batch, params)
        grads = backward(X_batch, Y_batch, params, cache)

        # Weight the batch mean by the batch size: the final batch can be
        # smaller than batch_size.
        epoch_loss += compute_loss(Y_hat, Y_batch) * X_batch.shape[0]

        # Plain SGD step, applied in place to every parameter.
        for key in ("W1", "b1", "W2", "b2"):
            params[key] -= learning_rate * grads["d" + key]

    epoch_loss /= N
    print(f"Epoch {epoch}/{num_epochs}, Loss: {epoch_loss:.4f}")

    #####################


Epoch 1/30, Loss: 1.6103
Epoch 2/30, Loss: 0.5393
Epoch 3/30, Loss: 0.3998
Epoch 4/30, Loss: 0.3511
Epoch 5/30, Loss: 0.3232
Epoch 6/30, Loss: 0.3029
Epoch 7/30, Loss: 0.2869
Epoch 8/30, Loss: 0.2728
Epoch 9/30, Loss: 0.2603
Epoch 10/30, Loss: 0.2491
Epoch 11/30, Loss: 0.2391
Epoch 12/30, Loss: 0.2301
Epoch 13/30, Loss: 0.2216
Epoch 14/30, Loss: 0.2138
Epoch 15/30, Loss: 0.2066
Epoch 16/30, Loss: 0.1998
Epoch 17/30, Loss: 0.1934
Epoch 18/30, Loss: 0.1876
Epoch 19/30, Loss: 0.1819
Epoch 20/30, Loss: 0.1766
Epoch 21/30, Loss: 0.1716
Epoch 22/30, Loss: 0.1668
Epoch 23/30, Loss: 0.1623
Epoch 24/30, Loss: 0.1581
Epoch 25/30, Loss: 0.1540
Epoch 26/30, Loss: 0.1502
Epoch 27/30, Loss: 0.1464
Epoch 28/30, Loss: 0.1429
Epoch 29/30, Loss: 0.1394
Epoch 30/30, Loss: 0.1361


In [8]:
"""
Question (g)
"""
# Evaluate on test set

Y_hat_test, _ = forward(X_test, params)
# The predicted class is the column with the largest probability in each row.
test_predictions = Y_hat_test.argmax(axis=1)
test_targets = y_test  # original labels
# Fraction of matching predictions, expressed as a percentage.
test_accuracy = (test_predictions == test_targets).mean() * 100
print(f"Test Accuracy: {test_accuracy:.2f}%")

###################################################
##### Important: Clearly state the test accuracy values produced by your code.
##### Your test accuracy: 95.92%
###################################################

Test Accuracy: 95.92%
