# Numpy implementation

In [19]:
import numpy as np

# ----- utils -----
def one_hot(y, C):
    """Encode integer class labels as one-hot rows.

    Args:
        y: array-like of integer labels in ``[0, C)``. Lists/tuples are
            accepted, and any shape is flattened (so an ``(N, 1)`` column
            vector behaves like a length-``N`` vector).
        C: number of classes, i.e. the width of the output.

    Returns:
        float64 array of shape ``(number_of_labels, C)`` with a single 1
        per row.

    Raises:
        IndexError: if any label lies outside ``[0, C)``.
    """
    labels = np.asarray(y).ravel()
    if labels.size == 0:
        # An empty list converts to a float array, which cannot be used as
        # an index; the result is unambiguous anyway.
        return np.zeros((0, C))

    # NumPy fancy indexing would silently wrap negative labels around
    # (-1 -> last class), so reject out-of-range values explicitly.
    lo, hi = labels.min(), labels.max()
    if lo < 0 or hi >= C:
        raise IndexError(
            f"labels must lie in [0, {C}), got values in [{lo}, {hi}]"
        )

    Y = np.zeros((labels.size, C))
    Y[np.arange(labels.size), labels] = 1
    return Y

def softmax(logits):
    """Row-wise softmax of a 2-D array of logits.

    The per-row maximum is subtracted before exponentiating so that large
    logits cannot overflow ``exp``; the result is mathematically unchanged.

    Args:
        logits: array of shape (B, C).

    Returns:
        Array of shape (B, C) whose rows are non-negative and sum to 1.
    """
    row_peak = np.max(logits, axis=1, keepdims=True)
    exps = np.exp(logits - row_peak)
    normaliser = np.sum(exps, axis=1, keepdims=True)
    return exps / normaliser

def relu(x):
    """Element-wise rectified linear unit: negative entries become 0."""
    return np.maximum(0, x)
def relu_grad(x):
    """Derivative of ReLU at ``x``: 1 where ``x > 0``, else 0, in ``x``'s dtype."""
    is_active = x > 0
    return is_active.astype(x.dtype)

def ce_loss(probs, Y_true_onehot):
    """Mean negative log-likelihood (cross-entropy) over a batch.

    Args:
        probs: (B, C) predicted class probabilities.
        Y_true_onehot: (B, C) one-hot encoded targets.

    Returns:
        Scalar: the average over rows of ``-sum_c y_c * log(p_c)``.
    """
    tiny = 1e-12  # keeps log() finite when a probability is exactly 0
    log_probs = np.log(probs + tiny)
    row_log_lik = np.sum(Y_true_onehot * log_probs, axis=1)
    return -np.mean(row_log_lik)

In [20]:
# ----- tiny dataset (2 Gaussians) -----
def make_blobs(n_per_class=200, dim=2, offset=2.0, seed=0):
    """Sample a shuffled two-class dataset of isotropic Gaussian blobs.

    Class 0 is centred at ``-offset`` and class 1 at ``+offset`` in every
    coordinate; both have unit standard deviation.

    Args:
        n_per_class: number of points drawn for each class.
        dim: feature dimensionality.
        offset: distance of each class mean from the origin, per coordinate.
        seed: seed for the NumPy generator (sampling and shuffling).

    Returns:
        ``(X, y)`` with ``X`` float32 of shape (2*n_per_class, dim) and
        ``y`` int64 of shape (2*n_per_class,), shuffled with one permutation.
    """
    gen = np.random.default_rng(seed)
    shape = (n_per_class, dim)

    # Draw order (class 0, then class 1, then the permutation) fixes the
    # output for a given seed.
    blob_neg = gen.normal(loc=-offset, scale=1.0, size=shape)
    blob_pos = gen.normal(loc=+offset, scale=1.0, size=shape)

    features = np.vstack([blob_neg, blob_pos]).astype(np.float32)
    labels = np.array(
        [cls for cls in (0, 1) for _ in range(n_per_class)],
        dtype=np.int64,
    )

    order = gen.permutation(len(features))
    return features[order], labels[order]

In [21]:
# ----- mini-batch iterator -----
def iterate_minibatches(X, y, batch_size, shuffle=True, rng=None):
    """Yield ``(X_batch, y_batch)`` pairs covering the dataset once.

    Args:
        X: array of samples, indexed along axis 0.
        y: array of targets aligned with ``X``.
        batch_size: maximum number of samples per batch; the final batch
            may be smaller.
        shuffle: if true, visit the samples in a random order.
        rng: generator providing ``shuffle``; a fresh unseeded
            ``np.random.default_rng()`` is used when omitted.

    Yields:
        Tuples ``(X[sel], y[sel])`` for consecutive index chunks ``sel``.
    """
    n_samples = len(X)
    order = np.arange(n_samples)

    if shuffle:
        if rng is None:
            rng = np.random.default_rng()
        rng.shuffle(order)

    for lo in range(0, n_samples, batch_size):
        chosen = order[lo:lo + batch_size]
        yield X[chosen], y[chosen]

In [22]:
# ----- model -----
class TwoLayerMLP:
    """Two-layer perceptron (Linear -> ReLU -> Linear -> softmax) in NumPy.

    Gradients are derived by hand: ``forward`` caches the intermediate
    activations on the instance and ``backward`` consumes that cache, so
    ``backward`` always refers to the most recent ``forward`` call.
    """

    def __init__(self, D_in, H, D_out, seed=0):
        """Create float32 parameters: N(0, 0.02^2) weights and zero biases.

        Args:
            D_in: input feature dimension.
            H: hidden layer width.
            D_out: number of output classes.
            seed: seed for the weight-initialisation generator.
        """
        rng = np.random.default_rng(seed)
        self.W1 = rng.normal(0, 0.02, size=(D_in, H)).astype(np.float32)
        self.b1 = np.zeros(H, dtype=np.float32)
        self.W2 = rng.normal(0, 0.02, size=(H, D_out)).astype(np.float32)
        self.b2 = np.zeros(D_out, dtype=np.float32)

    def forward(self, X):
        """Compute class probabilities for ``X`` and cache activations.

        Args:
            X: input batch of shape (B, D_in).

        Returns:
            Array of shape (B, D_out) with softmax probabilities.
        """
        # cache for backprop
        self.X = X                                  # (B, D_in)
        self.z1 = X @ self.W1 + self.b1             # (B, H)
        self.h  = relu(self.z1)                     # (B, H)
        self.logits = self.h @ self.W2 + self.b2    # (B, D_out)
        self.probs = softmax(self.logits)           # (B, D_out)
        return self.probs

    def backward(self, Y_onehot):
        """Gradients of the mean cross-entropy w.r.t. all parameters.

        Uses the activations cached by the latest ``forward`` call.

        Args:
            Y_onehot: one-hot targets of shape (B, D_out) for that batch.

        Returns:
            Tuple ``(dW1, db1, dW2, db2)`` matching the parameter shapes.
        """
        B = Y_onehot.shape[0]
        # dL/dlogits = (p - y)/B  (softmax + CE)
        dlogits = (self.probs - Y_onehot) / B                      # (B, C)
        dW2 = self.h.T @ dlogits                                   # (H, C)
        db2 = np.sum(dlogits, axis=0)                              # (C,)
        dh  = dlogits @ self.W2.T                                  # (B, H)
        dz1 = dh * relu_grad(self.z1)                              # (B, H)
        dW1 = self.X.T @ dz1                                       # (D, H)
        db1 = np.sum(dz1, axis=0)                                  # (H,)
        return dW1, db1, dW2, db2

    def step(self, grads, lr=1e-1, weight_decay=0.0):
        """Apply one SGD update in place, optionally with L2 weight decay.

        Args:
            grads: tuple ``(dW1, db1, dW2, db2)`` as returned by ``backward``.
                The arrays are only read, never modified.
            lr: learning rate.
            weight_decay: L2 coefficient added to the weight gradients
                (biases are not decayed).
        """
        dW1, db1, dW2, db2 = grads
        # L2 weight decay (classic)
        if weight_decay != 0.0:
            # Build new arrays instead of using `+=`, so the caller's
            # gradient arrays are left untouched.
            dW1 = dW1 + weight_decay * self.W1
            dW2 = dW2 + weight_decay * self.W2
        self.W1 -= lr * dW1
        self.b1 -= lr * db1
        self.W2 -= lr * dW2
        self.b2 -= lr * db2

    def predict(self, X):
        """Return the most probable class index for each row of ``X``.

        Unlike ``forward`` this does not touch the backprop cache.
        """
        hidden = relu(X @ self.W1 + self.b1)
        logits = hidden @ self.W2 + self.b2
        return np.argmax(softmax(logits), axis=1)

In [23]:
# ----- train / eval -----
def accuracy(model, X, y):
    """Fraction of rows in ``X`` for which ``model.predict`` equals ``y``."""
    predicted = model.predict(X)
    hits = predicted == y
    return np.mean(hits)

def train(epochs=50, lr=0.1, weight_decay=1e-4, batch_size=64, shuffle_seed=0):
    """Train the NumPy MLP on the two-blob dataset and print metrics.

    The data (seed 42) and the model initialisation (seed 1) are fixed;
    the mini-batch order is driven by ``shuffle_seed`` so that repeated
    calls print the same numbers.

    Args:
        epochs: number of passes over the training split.
        lr: SGD learning rate.
        weight_decay: L2 coefficient passed to ``model.step``.
        batch_size: mini-batch size.
        shuffle_seed: seed for the generator that shuffles mini-batches.

    Returns:
        None. Train/validation loss and accuracy are printed after the
        first epoch and after every fifth epoch.
    """
    # data: 80/20 train/validation split of the (already shuffled) blobs
    X, y = make_blobs(n_per_class=400, dim=2, offset=2.0, seed=42)
    split = int(0.8 * len(X))
    Xtr, ytr = X[:split], y[:split]
    Xva, yva = X[split:], y[split:]
    Ytr_oh, Yva_oh = one_hot(ytr, 2), one_hot(yva, 2)

    # model
    model = TwoLayerMLP(D_in=2, H=32, D_out=2, seed=1)

    # One generator shared across epochs: every epoch gets a different but
    # reproducible batch order.
    shuffle_rng = np.random.default_rng(shuffle_seed)

    for ep in range(1, epochs+1):

        # training
        for Xb, yb in iterate_minibatches(Xtr, ytr, batch_size,
                                          shuffle=True, rng=shuffle_rng):
            model.forward(Xb)  # caches the activations that backward() reads
            grads = model.backward(one_hot(yb, 2))
            model.step(grads, lr=lr, weight_decay=weight_decay)

        # eval (full splits, no batches needed) only on the epochs we report
        if ep % 5 == 0 or ep == 1:
            loss_tr = ce_loss(model.forward(Xtr), Ytr_oh)
            acc_tr = accuracy(model, Xtr, ytr)

            loss_va = ce_loss(model.forward(Xva), Yva_oh)
            acc_va = accuracy(model, Xva, yva)

            print(f"epoch {ep:02d} | "
                  f"train loss {loss_tr:.4f} acc {acc_tr:.3f} | "
                  f"val loss {loss_va:.4f} acc {acc_va:.3f}")



In [24]:
# Run the NumPy training loop; it prints metrics after epoch 1 and every 5th epoch.
train()

epoch 01 | train loss 0.6631 acc 0.995 | val loss 0.6643 acc 0.994
epoch 05 | train loss 0.0736 acc 0.991 | val loss 0.0807 acc 0.988
epoch 10 | train loss 0.0292 acc 0.991 | val loss 0.0343 acc 0.988
epoch 15 | train loss 0.0218 acc 0.991 | val loss 0.0263 acc 0.988
epoch 20 | train loss 0.0189 acc 0.994 | val loss 0.0232 acc 0.988
epoch 25 | train loss 0.0175 acc 0.994 | val loss 0.0216 acc 0.988
epoch 30 | train loss 0.0167 acc 0.994 | val loss 0.0206 acc 0.988
epoch 35 | train loss 0.0161 acc 0.994 | val loss 0.0200 acc 0.988
epoch 40 | train loss 0.0158 acc 0.994 | val loss 0.0196 acc 0.988
epoch 45 | train loss 0.0155 acc 0.994 | val loss 0.0193 acc 0.988
epoch 50 | train loss 0.0153 acc 0.994 | val loss 0.0191 acc 0.988


# PyTorch implementation

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 2-layer MLP (Linear -> ReLU -> Dropout -> Linear), CE loss, mini-batches

In [26]:
# Seed PyTorch's global RNG so repeated runs draw the same random numbers.
torch.manual_seed(0)

# Prefer a GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [27]:
# ---- tiny synthetic data (binary classification) ----
N, Din, Dout = 800, 2, 2

# Two unit-variance Gaussian clouds, shifted by -2 (class 0) and +2 (class 1)
# in every coordinate. The draw order (X0, X1, perm) fixes the data for a seed.
X0 = torch.randn(N // 2, Din).sub(2.0)
X1 = torch.randn(N // 2, Din).add(2.0)
X = torch.cat((X0, X1), dim=0)

y = torch.cat(
    (
        torch.zeros(N // 2, dtype=torch.long),
        torch.ones(N // 2, dtype=torch.long),
    ),
    dim=0,
)

# Shuffle samples and labels with the same permutation.
perm = torch.randperm(N)
X, y = X[perm], y[perm]

# 80/20 train/validation split at a single cut point.
Xtr, Xva = X.tensor_split([int(0.8 * N)])
ytr, yva = y.tensor_split([int(0.8 * N)])

In [28]:
class TwoLayerMLP(nn.Module):
    """Linear -> ReLU -> Dropout -> Linear classifier with explicit weights.

    The layers are written out as raw ``nn.Parameter`` tensors rather than
    ``nn.Linear`` modules; the forward pass returns unnormalised logits.
    """

    def __init__(self, Din, H, Dout, p=0.3):
        """Initialise weights ~ 0.02 * N(0, 1) and zero biases.

        Args:
            Din: input feature dimension.
            H: hidden layer width.
            Dout: number of output classes.
            p: dropout probability applied to the hidden activations.
        """
        super().__init__()
        init_std = 0.02

        # Wrapping in nn.Parameter registers the tensors with the module,
        # so autograd tracks them and model.parameters() returns them.
        self.W1 = nn.Parameter(init_std * torch.randn(Din, H))
        self.b1 = nn.Parameter(torch.zeros(H))
        self.W2 = nn.Parameter(init_std * torch.randn(H, Dout))
        self.b2 = nn.Parameter(torch.zeros(Dout))
        self.p = p

    def forward(self, x):
        """Map a batch ``x`` of shape (B, Din) to logits of shape (B, Dout)."""
        hidden = torch.relu(torch.matmul(x, self.W1) + self.b1)
        # Dropout is active only while the module is in training mode.
        hidden = F.dropout(hidden, p=self.p, training=self.training)
        return torch.matmul(hidden, self.W2) + self.b2

In [29]:
# Build the network with dropout probability 0.5 and place it on the chosen device.
model = TwoLayerMLP(Din=Din, H=32, Dout=Dout, p=0.5)
model = model.to(device)

# Plain SGD; L2 regularisation comes from the optimizer's weight_decay.
opt = torch.optim.SGD(params=model.parameters(), lr=0.1, weight_decay=1e-4)

# Training schedule.
batch_size = 64
epochs = 30

In [30]:
def iterate_minibatches(X, y, bs, shuffle=False, rng=None):
    """Yield ``(X_batch, y_batch)`` pairs covering the dataset once.

    This definition replaces the NumPy helper of the same name defined
    earlier in the notebook, so it accepts the same ``shuffle`` / ``rng``
    keywords and works on NumPy arrays as well as tensors. Code written
    against the earlier helper therefore keeps running after this cell
    has executed.

    Args:
        X: tensor or array of samples, indexed along dim 0.
        y: targets aligned with ``X``.
        bs: maximum batch size; the final batch may be smaller.
        shuffle: visit samples in random order. Defaults to False, i.e.
            consecutive slices in storage order.
        rng: optional random source used when shuffling: a
            ``torch.Generator`` for tensor inputs, or a NumPy ``Generator``
            for array inputs. A default source is used when omitted.

    Yields:
        ``(X[sel], y[sel])`` for consecutive chunks ``sel`` of the
        (optionally shuffled) sample order.
    """
    n = len(X)

    order = None
    if shuffle:
        if torch.is_tensor(X):
            # Draw on the generator's device, then move to where the data is.
            order = torch.randperm(n, generator=rng).to(X.device)
        else:
            rng = np.random.default_rng() if rng is None else rng
            order = np.arange(n)
            rng.shuffle(order)

    for start in range(0, n, bs):
        if order is None:
            # Plain slicing: no index tensor is built and batches are views.
            yield X[start:start + bs], y[start:start + bs]
        else:
            sel = order[start:start + bs]
            yield X[sel], y[sel]

In [31]:
# ---- training loop ----
# Move the training split to the device once rather than on every epoch.
Xtr_dev, ytr_dev = Xtr.to(device), ytr.to(device)


@torch.no_grad()
def eval_split(Xs, ys):
    """Return ``(mean cross-entropy, accuracy)`` of ``model`` on one split.

    Reads the module-level ``model`` and ``device``. Gradient tracking is
    disabled for the call; the caller is responsible for putting the model
    in eval mode first so that dropout is switched off.

    Args:
        Xs: inputs of the split (moved to ``device`` for the forward pass).
        ys: integer class targets of the split, kept on the CPU for the
            accuracy comparison.
    """
    logits = model(Xs.to(device))
    loss = F.cross_entropy(logits, ys.to(device)).item()
    acc = (logits.argmax(1).cpu() == ys).float().mean().item()
    return loss, acc


for ep in range(1, epochs+1):
    model.train()  # enables dropout

    for xb, yb in iterate_minibatches(Xtr_dev, ytr_dev, batch_size):
        opt.zero_grad()
        logits = model(xb)
        loss = F.cross_entropy(logits, yb)  # softmax+NLL under the hood
        loss.backward()
        opt.step()

    # evaluation
    model.eval()  # disables dropout for the metrics below

    tr_loss, tr_acc = eval_split(Xtr, ytr)
    va_loss, va_acc = eval_split(Xva, yva)

    if ep in {1, 5, 10, 20, 30}:
        print(f"epoch {ep:02d} | train loss {tr_loss:.4f} acc {tr_acc:.3f} | "
              f"val loss {va_loss:.4f} acc {va_acc:.3f}")

epoch 01 | train loss 0.6560 acc 0.983 | val loss 0.6565 acc 1.000
epoch 05 | train loss 0.0813 acc 0.994 | val loss 0.0681 acc 1.000
epoch 10 | train loss 0.0305 acc 0.994 | val loss 0.0162 acc 1.000
epoch 20 | train loss 0.0209 acc 0.994 | val loss 0.0055 acc 1.000
epoch 30 | train loss 0.0188 acc 0.994 | val loss 0.0033 acc 1.000
