# Tiny Training Loop (Manual Dataset + Mini-Batches)

**Goal:** Implement a minimal end-to-end training loop *from scratch* using NumPy:
- Hand-craft a tiny 2-D binary classification dataset
- Logistic regression model: $\hat{y} = \sigma(XW + b)$
- Binary cross-entropy loss (BCE)
- Mini-batch stochastic gradient descent (SGD) with shuffling each epoch
- Print loss/accuracy to verify learning

**Key formulas**

- Sigmoid: $\sigma(z) = 1 / (1 + e^{-z})$
- BCE: $\mathcal{L} = -\frac{1}{B}\sum \big[ y \log \hat{y} + (1-y)\log(1-\hat{y}) \big]$
- Gradients (batch of size B):
  - Error: $e = \hat{y} - y$
  - $\nabla_W = \frac{1}{B} X^\top e$
  - $\nabla_b = \frac{1}{B} \sum e$


**Checklist**
- [ ] Build manual dataset
- [ ] Implement forward pass, loss, grads
- [ ] Write a mini-batch iterator (with shuffling)
- [ ] Train and print metrics
- [ ] (Optional) Try different batch sizes / learning rates


In [1]:
import numpy as np
np.random.seed(42)


# 1) Manual toy dataset (2-D)
# -----------------------------
# One isotropic Gaussian blob per class.
n_per_class = 100
mean0 = np.array([-1.5, -1.0])   # centre of class 0
mean1 = np.array([1.2, 1.3])     # centre of class 1
cov = 0.5 * np.eye(2)            # shared covariance: variance 0.5 per axis, no correlation

# Class 0 is drawn before class 1; this order fixes how the random stream is consumed.
X0 = np.random.multivariate_normal(mean0, cov, size=n_per_class)
X1 = np.random.multivariate_normal(mean1, cov, size=n_per_class)

# Features stacked row-wise; labels are a column vector (zeros first, then ones).
X = np.concatenate((X0, X1), axis=0)
y = np.repeat([0.0, 1.0], n_per_class).reshape(-1, 1)

n_samples, n_features = X.shape

# Standardize each feature column to zero mean / unit variance.
# The small constant guards against division by zero for a constant column.
X_mean = X.mean(axis=0, keepdims=True)
X_std = X.std(axis=0, keepdims=True) + 1e-8
X = (X - X_mean) / X_std


# 2) Model params (logistic regression)
# ---------------------------------------
# Small random weights, zero bias.
W = 0.01 * np.random.randn(n_features, 1)
b = np.zeros(1)


In [2]:
# 3) Helper functions
# -----------------------------
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)).

    The input is clamped to [-20, 20] before exponentiation so np.exp stays
    well inside floating-point range; outputs therefore saturate just short
    of 0 and 1 for extreme inputs.
    """
    clamped = np.clip(z, -20, 20)
    denominator = 1.0 + np.exp(-clamped)
    return 1.0 / denominator

def bce_loss(y_true, y_prob, eps=1e-8):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Probabilities are clipped to [eps, 1 - eps] first so that neither log
    term is evaluated at zero.
    """
    p = np.clip(y_prob, eps, 1 - eps)
    log_likelihood = y_true * np.log(p) + (1 - y_true) * np.log(1 - p)
    return -np.mean(log_likelihood)

def accuracy(y_true, y_prob):
    """Fraction of samples whose thresholded prediction equals the label.

    A probability of 0.5 or more counts as class 1.
    """
    hard_preds = (y_prob >= 0.5).astype(np.float32)
    return np.mean(hard_preds == y_true)

def iter_minibatches(X, y, batch_size, shuffle=True):
    """Yield (X_batch, y_batch) pairs that together cover every row once.

    Rows are visited in a random order when `shuffle` is true (using NumPy's
    global RNG), otherwise in their stored order. The final batch is smaller
    when the row count is not a multiple of `batch_size`.
    """
    order = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(order)
    for lo in range(0, order.size, batch_size):
        rows = order[lo:lo + batch_size]
        yield X[rows], y[rows]

In [3]:
# 4) Training loop (mini-batch SGD)
# ---------------------------------
lr = 0.1          # SGD step size
batch_size = 16   # samples per parameter update
epochs = 30       # full passes over the data

for epoch in range(1, epochs + 1):
    # One epoch: every sample is visited once, in a freshly shuffled order.
    for Xb, yb in iter_minibatches(X, y, batch_size=batch_size, shuffle=True):
        # Forward pass: logits -> probabilities.
        logits = Xb @ W + b
        yhat = sigmoid(logits)

        # Per-batch loss, kept for inspection only; the update below uses
        # the analytic gradient rather than this value.
        loss = bce_loss(yb, yhat)

        # Backward pass. For sigmoid + BCE, dL/dlogits simplifies to (yhat - y).
        error = yhat - yb
        grad_W = (Xb.T @ error) / Xb.shape[0]
        grad_b = error.mean(axis=0)

        # SGD update, in place on the shared parameter arrays.
        W -= lr * grad_W
        b -= lr * grad_b

    # End of epoch: report metrics on the full data set.
    # No np.errstate guard is needed here: sigmoid() clips its input before
    # calling np.exp, so the exponential cannot overflow.
    yhat_all = sigmoid(X @ W + b)
    train_loss = bce_loss(y, yhat_all)
    train_acc = accuracy(y, yhat_all)
    if epoch % 5 == 0 or epoch == 1 or epoch == epochs:
        print(f"Epoch {epoch:02d} | loss={train_loss:.4f} | acc={train_acc*100:.1f}%")

# Quick check
# b has shape (1,); .item() extracts the Python scalar. Calling float() on a
# 1-element array is deprecated since NumPy 1.25 and is slated to become an error.
print("\nW:", W.ravel(), " b:", b.item())

Epoch 01 | loss=0.3857 | acc=99.5%
Epoch 05 | loss=0.1447 | acc=99.5%
Epoch 10 | loss=0.0905 | acc=99.5%
Epoch 15 | loss=0.0697 | acc=99.5%
Epoch 20 | loss=0.0584 | acc=99.5%
Epoch 25 | loss=0.0512 | acc=99.5%
Epoch 30 | loss=0.0462 | acc=99.5%

W: [2.47450664 2.20518569]  b: -0.02075615803662676


# Tiny Training Loop with PyTorch — Custom `Dataset` + Mini-batches

**Goals**
- Create a manual toy dataset and wrap it in a PyTorch `Dataset`
- Use `DataLoader` for shuffling + mini-batches
- Implement a clean, reusable training loop (works for any model)
- Baseline: Logistic Regression on 2-D points
- (Optional) Tiny Transformer for sequence classification on dummy tokens

**Key Ideas**
- `Dataset.__len__` and `Dataset.__getitem__` make your data iterable.
- `DataLoader(..., batch_size, shuffle=True)` gives mini-batches each epoch.
- Use `model.train()`/`model.eval()` and disable grads during evaluation.
- Keep the loop generic: `train_one_epoch`, `evaluate`, reused across models.

**What to try next**
- Change `batch_size`, `lr`, and `epochs`
- Swap the baseline for the Transformer block and train on toy sequences
- Add early stopping / LR scheduler tomorrow


In [4]:
# PyTorch tiny training loop with a custom Dataset and two model options
import math
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Seed every RNG this notebook draws from (stdlib, NumPy, PyTorch) with the
# same value so that repeated runs produce the same data and initial weights.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [5]:
# 1) DATASETS
class PointsDataset(Dataset):
    """
    Binary classification dataset: two isotropic Gaussian blobs (manual data).

    The defaults produce 2-D points, but the dimensionality follows the
    length of the mean vectors, so higher-dimensional blobs work as well.

    Args:
        n_per_class: number of points drawn for each class.
        mean0: centre of the class-0 blob (label 0.0).
        mean1: centre of the class-1 blob (label 1.0); same length as mean0.
        cov: scalar variance used on every axis (off-diagonals are zero).
        normalize: if true, standardize each feature to zero mean and unit
            variance using statistics of the generated data.

    Attributes:
        X: float32 tensor of shape (2 * n_per_class, dim).
        y: float32 tensor of shape (2 * n_per_class,) holding 0.0 / 1.0.

    Sampling uses NumPy's global RNG (np.random); seed it beforehand for
    reproducible data.
    """
    def __init__(self, n_per_class=200, mean0=(-1.5, -1.0), mean1=(1.2, 1.3), cov=0.5, normalize=True):
        self.X, self.y = self._make_blobs(n_per_class, mean0, mean1, cov)
        if normalize:
            m = self.X.mean(axis=0, keepdims=True)
            # Small constant avoids division by zero for a constant feature.
            s = self.X.std(axis=0, keepdims=True) + 1e-8
            self.X = (self.X - m) / s
        self.X = torch.from_numpy(self.X).float()
        self.y = torch.from_numpy(self.y).float()

    def _make_blobs(self, n, mean0, mean1, cov):
        """Draw n points per class and return (X, y) as shuffled NumPy arrays.

        Raises:
            ValueError: if the two means are not 1-D vectors of equal length.
        """
        mean0 = np.asarray(mean0, dtype=float)
        mean1 = np.asarray(mean1, dtype=float)
        if mean0.ndim != 1 or mean0.shape != mean1.shape:
            raise ValueError(
                f"mean0 and mean1 must be 1-D vectors of equal length, "
                f"got shapes {mean0.shape} and {mean1.shape}"
            )
        # Size the covariance from the means instead of hard-coding 2x2, so
        # the dataset is not limited to 2-D points.
        cov_mat = cov * np.eye(mean0.shape[0])
        X0 = np.random.multivariate_normal(mean0, cov_mat, size=n)
        X1 = np.random.multivariate_normal(mean1, cov_mat, size=n)
        X = np.vstack([X0, X1])
        y = np.concatenate([np.zeros(n), np.ones(n)])
        # Shuffle so the two classes are interleaved rather than stored back to back.
        idx = np.random.permutation(len(X))
        return X[idx], y[idx]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class ToySequenceDataset(Dataset):
    """
    Synthetic token sequences for exercising a tiny Transformer classifier.

    Every sample holds `seq_len` token ids drawn at random from
    [1, vocab_size) -- id 0 is never produced -- and is labelled with the
    parity (0 or 1) of the sum of its ids. Sampling uses NumPy's global RNG.
    """
    def __init__(self, n_samples=2000, seq_len=16, vocab_size=50):
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        # One randint call per sample, so the random stream is consumed
        # sequence by sequence.
        rows = [
            np.random.randint(1, vocab_size, size=seq_len, dtype=np.int64)
            for _ in range(n_samples)
        ]
        token_matrix = np.stack(rows)
        # Label = parity of each row's token sum.
        parity = token_matrix.sum(axis=1) % 2
        self.X = torch.from_numpy(token_matrix)
        self.y = torch.from_numpy(parity)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample, label = self.X[idx], self.y[idx]
        return sample, label

In [6]:
# 2) MODELS
class LogisticRegressor(nn.Module):
    """Single linear layer emitting one raw logit per sample.

    No sigmoid is applied here; pair the output with BCEWithLogitsLoss.
    """
    def __init__(self, in_dim=2):
        super().__init__()
        self.linear = nn.Linear(in_features=in_dim, out_features=1)

    def forward(self, x):
        # (B, in_dim) -> (B, 1) -> (B,): drop the trailing singleton so the
        # output lines up with a 1-D target vector.
        return self.linear(x).squeeze(dim=-1)


class TinyTransformerClassifier(nn.Module):
    """
    Small Transformer encoder that classifies token sequences.

    Pipeline:
    - token embedding + learned positional embedding
    - a stack of TransformerEncoder layers (batch-first)
    - linear head applied to the hidden state at position 0
      (no dedicated CLS token is inserted; the first input token plays that role)

    The positional table has `max_len` rows, so longer sequences cannot be embedded.
    """
    def __init__(self, vocab_size=50, d_model=64, nhead=4, num_layers=2, dim_ff=128, max_len=256, num_classes=2):
        super().__init__()
        # Submodules are created in a fixed order, since that order determines
        # how the random initial weights are drawn.
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_len, d_model)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_ff,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        self.cls = nn.Linear(d_model, num_classes)

    def forward(self, tokens):
        # tokens: (batch, seq_len) integer ids
        batch, seq_len = tokens.shape
        # Position ids 0..seq_len-1, repeated for every sequence in the batch.
        positions = torch.arange(seq_len, device=tokens.device).expand(batch, seq_len)
        embedded = self.tok_emb(tokens) + self.pos_emb(positions)
        hidden = self.encoder(embedded)
        # Classify from the first position's hidden state.
        return self.cls(hidden[:, 0, :])

In [7]:
# 3) TRAIN / EVAL UTILITIES
def accuracy_from_logits_binary(logits, targets):
    """Return, as a Python float, the share of binary predictions matching `targets`.

    Logits are passed through a sigmoid and thresholded at 0.5 (inclusive)
    to obtain 0/1 predictions.
    """
    hard_preds = torch.sigmoid(logits).ge(0.5).float()
    hits = hard_preds.eq(targets)
    return hits.float().mean().item()

def accuracy_from_logits_multiclass(logits, targets):
    """Return, as a Python float, the share of argmax predictions matching `targets`.

    The predicted class is the index of the largest logit along the last axis.
    """
    predicted = torch.argmax(logits, dim=-1)
    return predicted.eq(targets).float().mean().item()

def train_one_epoch(model, loader, criterion, optimizer, device, task="binary"):
    """Run one optimisation pass over `loader` and return (mean loss, mean accuracy).

    Args:
        model: module being trained; put into train mode here.
        loader: iterable yielding (inputs, targets) batches.
        criterion: loss callable taking (logits, targets).
        optimizer: optimizer stepping the model's parameters.
        device: device each batch is moved to before the forward pass.
        task: "binary" scores with accuracy_from_logits_binary; any other
            value scores with accuracy_from_logits_multiclass.

    Returns:
        (loss, accuracy) averaged per sample over the epoch. Each batch's
        values are weighted by its size, so a smaller final batch does not
        distort the averages.
        NOTE(review): the loss weighting assumes `criterion` returns a batch
        mean (the default reduction of the built-in losses) -- confirm if a
        different reduction is used.

    Raises:
        ValueError: if `loader` yields no samples.
    """
    model.train()
    acc_fn = accuracy_from_logits_binary if task == "binary" else accuracy_from_logits_multiclass
    loss_sum, acc_sum, n_seen = 0.0, 0.0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        # Weight per-batch means by batch size to get exact per-sample averages.
        n_batch = yb.size(0)
        loss_sum += loss.item() * n_batch
        acc_sum += acc_fn(logits.detach(), yb.detach()) * n_batch
        n_seen += n_batch
    if n_seen == 0:
        raise ValueError("train_one_epoch: loader yielded no samples")
    return loss_sum / n_seen, acc_sum / n_seen

@torch.no_grad()
def evaluate(model, loader, criterion, device, task="binary"):
    """Score `model` on `loader` without gradients; return (mean loss, mean accuracy).

    Args:
        model: module being evaluated; put into eval mode here.
        loader: iterable yielding (inputs, targets) batches.
        criterion: loss callable taking (logits, targets).
        device: device each batch is moved to before the forward pass.
        task: "binary" scores with accuracy_from_logits_binary; any other
            value scores with accuracy_from_logits_multiclass.

    Returns:
        (loss, accuracy) averaged per sample. Each batch's values are
        weighted by its size, so a smaller final batch does not distort the
        averages.
        NOTE(review): the loss weighting assumes `criterion` returns a batch
        mean (the default reduction of the built-in losses) -- confirm if a
        different reduction is used.

    Raises:
        ValueError: if `loader` yields no samples.
    """
    model.eval()
    acc_fn = accuracy_from_logits_binary if task == "binary" else accuracy_from_logits_multiclass
    loss_sum, acc_sum, n_seen = 0.0, 0.0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)

        # Weight per-batch means by batch size to get exact per-sample averages.
        n_batch = yb.size(0)
        loss_sum += loss.item() * n_batch
        acc_sum += acc_fn(logits, yb) * n_batch
        n_seen += n_batch
    if n_seen == 0:
        raise ValueError("evaluate: loader yielded no samples")
    return loss_sum / n_seen, acc_sum / n_seen

In [8]:
# 4) EXPERIMENT A — Points + Logistic Regression
# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Data: 80/20 train/validation split, drawn with its own fixed-seed generator.
points_ds = PointsDataset(n_per_class=200, normalize=True)
n_total = len(points_ds)
n_train = int(0.8 * n_total)
n_val = n_total - n_train
split_rng = torch.Generator().manual_seed(42)
train_ds, val_ds = torch.utils.data.random_split(
    points_ds,
    [n_train, n_val],
    generator=split_rng,
)

# Only the training loader reshuffles each epoch.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False)

# Model, loss and optimizer.
model_lr = LogisticRegressor(in_dim=2).to(device)
criterion_bce = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model_lr.parameters(), lr=0.1)

# Epochs at which a progress line is printed.
report_epochs = {1, 5, 10, 20, 30}

print("\n=== Training: Logistic Regression on 2-D Points ===")
for epoch in range(1, 31):
    tr_loss, tr_acc = train_one_epoch(model_lr, train_loader, criterion_bce, optimizer, device, task="binary")
    va_loss, va_acc = evaluate(model_lr, val_loader, criterion_bce, device, task="binary")
    if epoch not in report_epochs:
        continue
    print(f"Epoch {epoch:02d} | train loss {tr_loss:.4f} acc {tr_acc*100:.1f}% | val loss {va_loss:.4f} acc {va_acc*100:.1f}%")



=== Training: Logistic Regression on 2-D Points ===
Epoch 01 | train loss 0.2953 acc 99.4% | val loss 0.2815 acc 98.8%
Epoch 05 | train loss 0.1522 acc 100.0% | val loss 0.1683 acc 98.8%
Epoch 10 | train loss 0.1017 acc 100.0% | val loss 0.1227 acc 98.8%
Epoch 20 | train loss 0.0664 acc 100.0% | val loss 0.0890 acc 98.8%
Epoch 30 | train loss 0.0517 acc 100.0% | val loss 0.0747 acc 98.8%
