In [35]:
!unzip train.zip
!unzip test.zip

Archive:  train.zip
replace train/train_0.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: Archive:  test.zip
replace test/test_0.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [37]:
def onehot(y,num_classes):
    """Turn integer class labels into a float32 one-hot matrix.

    Parameters
    ----------
    y : array of integer labels; its first dimension gives the sample count.
    num_classes : number of columns in the result.

    Returns
    -------
    Array of shape (len(y), num_classes) with a 1.0 in column y[i] of row i
    and 0.0 elsewhere.
    """
    n_samples = y.shape[0]
    row_index = np.arange(n_samples)
    encoded = np.zeros(shape=(n_samples, num_classes), dtype=np.float32)
    # Fancy indexing pairs every row with the column named by its label.
    encoded[row_index, y] = 1.0
    return encoded


In [38]:
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating so that large
    scores do not overflow; this shift does not change the result.

    Returns an array of the same shape whose rows sum to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    normaliser = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / normaliser

In [39]:
def cross_entropy(pred,target_onehot):
    """Mean cross-entropy between predicted probabilities and one-hot targets.

    Parameters
    ----------
    pred : predicted probabilities, one row per sample.
    target_onehot : one-hot targets with the same shape as ``pred``.

    Returns
    -------
    The negative log-likelihood averaged over the rows.
    """
    eps = 1e-12
    # Keep probabilities strictly inside (0, 1) so the logarithm stays finite.
    safe_pred = np.clip(pred, eps, 1.0 - eps)
    log_likelihood = np.sum(target_onehot * np.log(safe_pred), axis=1)
    return -log_likelihood.mean()

In [40]:
def accuracy(pred, y_true):
    """Fraction of rows whose highest-scoring column equals the true label.

    ``pred`` holds one row of class scores per sample and ``y_true`` holds the
    matching integer labels.
    """
    is_correct = np.argmax(pred, axis=1) == y_true
    return np.mean(is_correct)


In [41]:
class MLP3layer:
    """Fully connected classifier: two ReLU hidden layers and a softmax output.

    Parameters live in ``W1, b1, W2, b2, W3, b3``. Weights come from
    ``rng.standard_normal`` (NumPy's default float64) scaled by
    ``weight_scale``; biases are created explicitly as float32 zeros.
    Training is plain gradient descent via ``forward`` -> ``backward`` ->
    ``update``.
    """

    def __init__(self, input_dim=784, hidden1=128, hidden2=64, output_dim=10, weight_scale=0.01, seed=42):
        """Create the parameters.

        Parameters
        ----------
        input_dim, hidden1, hidden2, output_dim : layer widths.
        weight_scale : multiplier applied to the standard-normal weight draws.
        seed : seed for the NumPy generator used for the weight draws.
        """
        rng = np.random.default_rng(seed)
        self.W1 = weight_scale * rng.standard_normal((input_dim, hidden1))
        self.b1 = np.zeros(hidden1,dtype="float32")

        self.W2 = weight_scale * rng.standard_normal((hidden1,hidden2))
        self.b2 = np.zeros(hidden2,dtype="float32")

        self.W3 = weight_scale * rng.standard_normal((hidden2,output_dim))
        self.b3 = np.zeros(output_dim,dtype="float32")


    def forward(self,X):
        """Run the network on a batch.

        Parameters
        ----------
        X : array of shape (N, input_dim).

        Returns
        -------
        (y_pred, cache) where ``y_pred`` holds the softmax probabilities and
        ``cache`` is the tuple of intermediates that ``backward`` consumes.
        """
        z1 = X @ self.W1 + self.b1
        a1 = np.maximum(0,z1)

        z2 = a1 @ self.W2 + self.b2
        a2 = np.maximum(0,z2)

        z3 = a2 @ self.W3 + self.b3
        y_pred = softmax(z3)

        cache = (X, z1, a1, z2, a2, z3, y_pred)
        return y_pred, cache

    def backward(self,cache, y_true_onehot):
        """Back-propagate the batch-mean cross-entropy loss.

        Parameters
        ----------
        cache : tuple returned by ``forward``.
        y_true_onehot : one-hot targets with the same shape as the predictions.

        Returns
        -------
        Dict with keys "W1", "b1", "W2", "b2", "W3", "b3" holding the gradient
        of the loss with respect to each parameter.
        """
        X, z1, a1, z2, a2, _, y_pred = cache
        N = X.shape[0]

        # Gradient with respect to the logits for softmax followed by
        # cross-entropy; dividing by N matches the mean taken in the loss.
        dz3 = (y_pred - y_true_onehot) / N

        dW3 = a2.T @ dz3
        db3 = dz3.sum(axis=0)

        # ReLU passes gradient only where its input was positive.
        da2 = dz3 @ self.W3.T
        dz2 = da2 * (z2 > 0)

        # Each weight gradient uses the activations that fed that layer.
        dW2 = a1.T @ dz2
        db2 = dz2.sum(axis=0)

        da1 = dz2 @ self.W2.T
        dz1 = da1 * (z1 > 0)

        dW1 = X.T @ dz1
        db1 = dz1.sum(axis = 0)

        grads = {
            "W1":dW1, "b1":db1,
            "W2":dW2, "b2":db2,
            "W3":dW3, "b3":db3,
        }
        return grads

    def update(self, grads, lr=0.1, weight_decay=0.0):
        """Apply one gradient-descent step to the parameters, in place.

        Parameters
        ----------
        grads : dict as returned by ``backward``. It is only read; the arrays
            the caller passes in are left unmodified.
        lr : learning rate.
        weight_decay : L2 coefficient; when positive, ``weight_decay * W`` is
            added to each weight gradient (biases are not decayed).
        """
        def weight_step(name, weight):
            # Build the regularised gradient in a NEW array so the decay term
            # never leaks into the caller's gradient dict.
            grad = grads[name]
            if weight_decay > 0.0:
                return grad + weight_decay * weight
            return grad

        self.W1 -= lr * weight_step("W1", self.W1)
        self.b1 -= lr * grads["b1"]
        self.W2 -= lr * weight_step("W2", self.W2)
        self.b2 -= lr * grads["b2"]
        self.W3 -= lr * weight_step("W3", self.W3)
        self.b3 -= lr * grads["b3"]

    def predict_proba(self,x):
        """Return the class probabilities for the batch ``x``."""
        y_pred, _ = self.forward(x)
        return y_pred

    def predict(self, x):
        """Return the index of the most probable class for each row of ``x``."""
        return np.argmax(self.predict_proba(x), axis=1)

In [42]:
import glob
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Load every training image into one float32 array.
folder = "./train/*.jpg"
# Sorted so the image order is deterministic.
# NOTE(review): the label cell sorts by "file_name"; both are plain string sorts,
# which line up only if that column holds these same file names - confirm.
paths = sorted(glob.glob(folder))
imgs = []
for path in paths:
    # Image.open is lazy and keeps the file open until the pixels are read.
    # Loading inside a with-block releases each handle immediately instead of
    # holding one open descriptor per image until the array conversion below.
    with Image.open(path) as img:
        img.load()
    imgs.append(img)
X = np.array(imgs).astype(np.float32)
print(X.shape)

(60000, 28, 28)


In [43]:
# Flatten each image to one row; the row count comes from the data itself
# rather than a hard-coded 60000.
X = X.reshape(X.shape[0], -1)
# Scale pixel values to [0, 1]. The guard makes the cell safe to re-run:
# once the data is already scaled, it is not divided a second time.
if np.max(X) > 1.0:
    X = X / 255
np.max(X)

np.float32(1.0)

In [44]:
# Read the label table, order it by file name, and keep the class ids as a
# flat array. NOTE(review): the order is presumably meant to match the sorted
# image paths used to build X - confirm.
y = (
    pd.read_csv("./train_master.csv")
    .sort_values("file_name")["category_id"]
    .values
    .reshape(-1)
)
print(y.shape)

(60000,)


In [None]:
def train_mnist(
    num_epochs = 10,
    batch_size = 128,
    lr = 0.1,
    weight_decay = 1e-4,
    seed = None,
):
    """Train an ``MLP3layer`` on the module-level ``X`` / ``y`` arrays.

    The data is split 80/20 with ``train_test_split(random_state=42)``. Each
    epoch reshuffles the training part and runs mini-batch gradient descent,
    then prints the mean training loss, the accuracy on the first 5000
    training rows, and the accuracy on the held-out part.

    Parameters
    ----------
    num_epochs : number of passes over the training split.
    batch_size : samples per gradient step; must be positive. The last batch
        of an epoch may be smaller when the split size is not a multiple.
    lr : learning rate passed to ``model.update``.
    weight_decay : L2 coefficient passed to ``model.update``.
    seed : optional seed for the per-epoch shuffle. ``None`` (default) draws
        from NumPy's global random state.

    Returns
    -------
    The trained ``MLP3layer``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    num_classes = 10
    model = MLP3layer(input_dim=784, hidden1=128, hidden2=64, output_dim=num_classes)
    num_train = X_train.shape[0]

    # A dedicated generator makes the shuffle reproducible when a seed is
    # given; without one the global np.random state is used.
    shuffler = np.random if seed is None else np.random.default_rng(seed)

    for epoch in range(1,num_epochs+1):

        indices = shuffler.permutation(num_train)
        X_train_shuffled = X_train[indices]
        y_train_shuffled = y_train[indices]

        loss_sum = 0.0

        # Stepping by batch_size visits every sample, including a final short
        # batch, and always runs at least once for a non-empty split.
        for start in range(0, num_train, batch_size):
            end = start + batch_size
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]

            y_batch_onehot = onehot(y_batch, num_classes)

            y_pred, cache = model.forward(X_batch)
            loss = cross_entropy(y_pred, y_batch_onehot)
            # Weight by batch length so a short final batch does not skew the mean.
            loss_sum += loss * X_batch.shape[0]

            grads = model.backward(cache, y_batch_onehot)
            model.update(grads, lr=lr, weight_decay=weight_decay)

        epoch_loss = loss_sum / num_train

        # Training accuracy is estimated on a fixed 5000-row prefix.
        train_pred, _ = model.forward(X_train[:5000])
        train_acc = accuracy(train_pred, y_train[:5000])

        test_pred, _ = model.forward(X_test)
        test_acc = accuracy(test_pred, y_test)

        print(f"Epoch {epoch:2d} | loss={epoch_loss:.4f} | "
              f"train_acc={train_acc*100:.2f}% | test_acc={test_acc*100:.2f}%")

    return model


In [46]:
if __name__ == "__main__":
    # Train with the notebook's reference hyperparameters and keep the fitted model.
    model = train_mnist(num_epochs=10, batch_size=128, lr=0.1, weight_decay=1e-4)

Epoch  1 | loss=2.1786 | train_acc=57.40% | test_acc=57.33%
Epoch  2 | loss=0.7483 | train_acc=83.24% | test_acc=82.56%
Epoch  3 | loss=0.4296 | train_acc=90.60% | test_acc=90.48%
Epoch  4 | loss=0.3090 | train_acc=92.28% | test_acc=91.75%
Epoch  5 | loss=0.2409 | train_acc=94.36% | test_acc=93.87%
Epoch  6 | loss=0.1958 | train_acc=95.02% | test_acc=94.67%
Epoch  7 | loss=0.1648 | train_acc=95.34% | test_acc=94.92%
Epoch  8 | loss=0.1429 | train_acc=96.50% | test_acc=95.77%
Epoch  9 | loss=0.1253 | train_acc=96.38% | test_acc=95.75%
Epoch 10 | loss=0.1102 | train_acc=96.92% | test_acc=96.12%
