<a href="https://colab.research.google.com/github/HuanAII/MLPforMNIST_classification/blob/main/MLPForMNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [None]:
# Using MLP for MNIST Classification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from torchvision import datasets , transforms
from torch.utils.data import DataLoader

In [None]:
# Load dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) # convert PIL IMAGE -> tensor [0 , 1]
train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)


In [82]:
def to_numpy(dataset):
  X , y = [], []
  for img , label in dataset:
    X.append(img.view(-1).numpy())
    y.append(label)
  return np.array(X), np.array(y)

X_train , y_train = to_numpy(train_data)
X_test  , y_test = to_numpy(test_data)

# **Define Activation + Loss**

In [83]:
def relu (x):
  return np.maximum(0 , x)

def relu_derivative(x):
  return ( x > 0).astype(float)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def cross_entropy(y_pred , y):
  m = y.shape[0]
  log_likelihood = -np.log(y_pred[range(m), y] + 1e-9)
  return np.sum(log_likelihood)/m

# **MLP Class  ( 1 hidden layer )**

In [84]:
class MLP :
  # Define MLP layer
  def __init__(self , input_size , hidden_size , output_size) :
    self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2. / input_size)
    self.b1 = np.zeros((1, hidden_size))

    self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2. / hidden_size)
    self.b2 = np.zeros((1, output_size))

  # Define forward
  def forward(self , X) : # [batchsize , inputsize]
    self.Z1 = X @ self.W1 + self.b1  # [batchsize , inputsize] * [input_size, hidden_size] = [batchsize , hidden_size]
    self.A1 = relu(self.Z1)
    self.Z2 = self.Z1 @ self.W2 + self.b2 #[batchsize , hidden_size] * [hidden_size , output_size ] = [batchsize , outputsize ]

    self.A2 = softmax(self.Z2)

    return self.A2 #[batchsize , outputsize ]

  def backward(self, X, y_true, learning_rate):
      m = X.shape[0]
      y_one_hot = np.zeros_like(self.A2)
      y_one_hot[np.arange(m), y_true] = 1

      dZ2 = self.A2 - y_one_hot
      dW2 = self.A1.T @ dZ2 / m
      db2 = np.sum(dZ2, axis=0, keepdims=True) / m

      dA1 = dZ2 @ self.W2.T
      dZ1 = dA1 * relu_derivative(self.Z1)
      dW1 = X.T @ dZ1 / m
      db1 = np.sum(dZ1, axis=0, keepdims=True) / m

      self.W1 -= learning_rate * dW1
      self.b1 -= learning_rate * db1
      self.W2 -= learning_rate * dW2
      self.b2 -= learning_rate * db2

  def predict(self, X):
      probs = self.forward(X)
      return np.argmax(probs, axis=1)



# **Train MLP**

In [85]:
def train_mlp():
    model = MLP(input_size=784, hidden_size=128, output_size=10)

    epochs = 30
    batch_size = 64
    learning_rate = 0.01

    train_acc_list = []
    test_acc_list = []


    best_acc = 0
    best_epoch = 0
    best_train_acc = 0
    best_model = None

    for epoch in range(epochs):
        # Shuffle
        permutation = np.random.permutation(X_train.shape[0])
        X_train_shuffled = X_train[permutation]
        y_train_shuffled = y_train[permutation]

        for i in range(0, X_train.shape[0], batch_size):
            X_batch = X_train_shuffled[i:i+batch_size]
            y_batch = y_train_shuffled[i:i+batch_size]

            y_pred = model.forward(X_batch)
            loss = cross_entropy(y_pred, y_batch)
            model.backward(X_batch, y_batch, learning_rate)

        # Accuracy
        y_pred_train = model.predict(X_train)
        train_acc = np.mean(y_pred_train == y_train)
        train_acc_list.append(train_acc)

        y_pred_test = model.predict(X_test)
        test_acc = np.mean(y_pred_test == y_test)
        test_acc_list.append(test_acc)

        print(f"Epoch {epoch+1:2d} - LR: {learning_rate:.5f} - Train Acc: {train_acc:.4f} - Test Acc: {test_acc:.4f}")


        if test_acc > best_acc:
            best_acc = test_acc
            best_train_acc = train_acc
            best_epoch = epoch + 1
            best_model = model

    # Vẽ biểu đồ
    plt.plot(range(1, epochs+1), train_acc_list, label='Train Accuracy')
    plt.plot(range(1, epochs+1), test_acc_list, label='Test Accuracy')
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Training vs Testing Accuracy (with LR)")
    plt.legend()
    plt.grid(True)
    plt.show()


    print(f"\n Best Epoch: {best_epoch}")
    print(f" Best Train Accuracy: {best_train_acc:.4f}")
    print(f" Best Test Accuracy:  {best_acc:.4f}")

    return best_model

train_mlp()

Epoch  1 - LR: 0.01000 - Train Acc: 0.8923 - Test Acc: 0.8984
Epoch  2 - LR: 0.01000 - Train Acc: 0.9050 - Test Acc: 0.9057
Epoch  3 - LR: 0.01000 - Train Acc: 0.9099 - Test Acc: 0.9103
Epoch  4 - LR: 0.01000 - Train Acc: 0.9127 - Test Acc: 0.9127
Epoch  5 - LR: 0.01000 - Train Acc: 0.9069 - Test Acc: 0.9062
Epoch  6 - LR: 0.01000 - Train Acc: 0.8751 - Test Acc: 0.8735
Epoch  7 - LR: 0.01000 - Train Acc: 0.8644 - Test Acc: 0.8642


KeyboardInterrupt: 