In [1]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [9]:
# Load all the words from the '.txt' file
# (the context manager closes the file handle as soon as the names are read)
with open('names.txt', mode = 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

# Encoder and Decoder
# stoi maps every distinct character to an integer id starting at 1; id 0 is
# reserved for '.', the token generate_dataset uses both as left padding and as
# the end-of-word marker. itos is the inverse mapping (id -> character).
chars = sorted(set(''.join(words)))
stoi = {c:i+1 for i, c in enumerate(chars)}
stoi['.'] = 0
itos = {i:c for c, i in stoi.items()}

# Generate train, test and validation Dataset
def generate_dataset(words, block_size):
    """Build (context, next-character) pairs from a list of words.

    Every word is terminated with '.', and its context window starts as
    ``block_size`` zeros (the id of '.'). For each character one example is
    emitted: the current window as input and the character's id as target;
    the window then slides one position to the right.

    Args:
        words: iterable of strings whose characters are all keys of the
            module-level ``stoi`` mapping.
        block_size: number of previous character ids used as context.

    Returns:
        ``(x, y)`` where ``x`` is an int64 tensor of shape
        ``(num_examples, block_size)`` and ``y`` is an int64 tensor of shape
        ``(num_examples,)``. An empty ``words`` yields empty tensors with these
        same dtypes and ranks instead of a float tensor of shape ``(0,)``.

    Raises:
        KeyError: if a character is missing from ``stoi``.
    """
    x, y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            idx = stoi[ch]
            x.append(context)
            y.append(idx)
            # Slide the window: drop the oldest id, append the newest one.
            context = context[1:] + [idx]

    # Explicit dtype and shape keep the result well-formed when no examples
    # were produced (torch.tensor([]) would otherwise be float32 of shape (0,)).
    x = torch.tensor(x, dtype=torch.long).reshape(len(x), block_size)
    y = torch.tensor(y, dtype=torch.long)
    return x, y

def get_split(data, train_split: float, test_split: float, val_split: float, block_size: int):
    """Shuffle ``data`` deterministically and build train/val/test tensors.

    The shuffled words are cut at ``int(train_split * n)`` and
    ``int((train_split + val_split) * n)``; the three slices are passed to
    ``generate_dataset`` in the order train, validation, test. ``test_split``
    is only used to validate that the fractions add up; the test set is
    whatever remains after the second cut.

    Args:
        data: list of words. It is NOT modified; a shuffled copy is used.
        train_split: fraction of words for training.
        test_split: fraction of words for testing.
        val_split: fraction of words for validation.
        block_size: context length forwarded to ``generate_dataset``.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``.

    Raises:
        ValueError: if the three fractions do not sum to 1 (within a small
            floating-point tolerance).
    """
    import random

    # Compare with a tolerance: an exact `!= 1` check rejects valid inputs such
    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in binary
    # floating point.
    total = train_split + test_split + val_split
    if abs(total - 1.0) > 1e-9:
        raise ValueError("All splits must sum to 100% of the data")

    # Shuffle a copy so the caller's list keeps its original order. Seeding
    # right before the shuffle yields the same permutation on every call.
    shuffled = list(data)
    random.seed(42)
    random.shuffle(shuffled)

    n1 = int(train_split * len(shuffled))
    n2 = int((train_split + val_split) * len(shuffled))

    x_train, y_train = generate_dataset(shuffled[:n1], block_size)
    x_val, y_val = generate_dataset(shuffled[n1:n2], block_size)
    x_test, y_test = generate_dataset(shuffled[n2:], block_size)

    return x_train, y_train, x_val, y_val, x_test, y_test

# Materialise the three dataset splits (80% train / 10% validation / 10% test)
# with a context window of 3 characters.
x_train, y_train, x_val, y_val, x_test, y_test = get_split(
    data = words,
    train_split = 0.8,
    test_split = 0.1,
    val_split = 0.1,
    block_size = 3,
)

In [10]:
class Linear:
    """Fully connected layer computing ``x @ weight`` plus an optional bias.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output.
        bias: when True (default) a zero-initialised bias vector is learned;
            when False no bias is created or applied.

    Attributes are plain ``torch.nn.Parameter`` objects: ``weight`` has shape
    ``(in_features, out_features)`` and is Xavier-uniform initialised; ``bias``
    has shape ``(out_features,)`` or is ``None``.
    """

    def __init__(self, in_features, out_features, bias: bool = True):
        self.weight = torch.nn.Parameter(torch.empty(in_features, out_features))
        torch.nn.init.xavier_uniform_(self.weight)
        self.bias = torch.nn.Parameter(torch.zeros(out_features)) if bias else None

    def forward(self, x):
        """Return ``x @ weight`` (+ ``bias`` when the layer has one)."""
        out = x @ self.weight
        # A bias-free layer stores None; adding it would raise a TypeError.
        if self.bias is not None:
            out = out + self.bias
        return out

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the trainable tensors: ``[weight]`` or ``[weight, bias]``."""
        return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:
    """Batch normalisation computed over dimension 0 of the input.

    In training mode the input is normalised with the statistics of the
    current batch (biased variance) and the running statistics are updated as
    an exponential moving average. Otherwise the stored running statistics are
    used and left untouched. The normalised values are then scaled by ``gamma``
    and shifted by ``beta``.

    Args:
        in_features: number of features; all statistics and parameters have
            shape ``(1, in_features)``.
        training: initial mode flag (callers toggle ``self.training``).
        momentum: weight of the newest batch statistic in the running average.
        eps: constant added to the variance before the square root.
    """

    def __init__(self, in_features, training: bool = True, momentum = 0.1, eps = 1e-05):
        self.in_features = in_features
        self.training = training
        self.momentum = momentum
        self.eps = eps

        stat_shape = (1, self.in_features)
        # Learnable scale (gamma) and shift (beta).
        self.gamma = torch.nn.Parameter(torch.ones(*stat_shape), requires_grad = True)
        self.beta = torch.nn.Parameter(torch.zeros(*stat_shape), requires_grad = True)
        # Running statistics: plain tensors, not returned by parameters().
        self.running_mean = torch.zeros(*stat_shape)
        self.running_var = torch.ones(*stat_shape)

    def _scale_shift(self, x, mean, var):
        """Normalise ``x`` with the given statistics, then apply gamma/beta."""
        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return x_hat * self.gamma + self.beta

    def forward(self, x):
        """Normalise ``x``; assumes ``x`` is ``(batch, in_features)`` — TODO confirm."""
        if not self.training:
            return self._scale_shift(x, self.running_mean, self.running_var)

        mu = x.mean(dim = 0, keepdim = True)
        sigma2 = x.var(dim = 0, keepdim = True, unbiased = False)
        result = self._scale_shift(x, mu, sigma2)

        # The moving-average update must not become part of the autograd graph.
        with torch.no_grad():
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mu
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * sigma2

        return result

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the trainable tensors ``[gamma, beta]``."""
        return [self.gamma, self.beta]


class Tanh:
    """Stateless activation layer applying ``torch.tanh`` element-wise."""

    def __call__(self, input):
        activated = torch.tanh(input)
        return activated

    def parameters(self):
        """This layer has nothing to train, so the list is always empty."""
        return list()

class Embeddings:
    """Lookup table that embeds integer ids and concatenates them per row.

    Args:
        in_features: number of rows in the table (one per id).
        out_features: length of each embedding vector.

    ``weight`` is a ``torch.nn.Parameter`` of shape
    ``(in_features, out_features)`` drawn from ``torch.randn``.
    """

    def __init__(self, in_features, out_features):
        self.in_features, self.out_features = in_features, out_features
        table = torch.randn(in_features, out_features)
        self.weight = torch.nn.Parameter(table)

    def forward(self, x):
        """Index the table with ``x`` and flatten each row's vectors.

        The result has ``x.shape[1] * out_features`` columns.
        """
        row_width = x.shape[1] * self.out_features
        return self.weight[x].view(-1, row_width)

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the trainable tensors: just the embedding table."""
        return [self.weight]

class Flatten:
    """Parameter-free layer that collapses every dimension after the first."""

    def forward(self, x):
        """Return a view of ``x`` with shape ``(x.shape[0], -1)``."""
        leading = x.shape[0]
        return x.view(leading, -1)

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Nothing to train here."""
        return list()

class Sequential:
    """Container that applies a list of layers one after another.

    Args:
        layers: list of callables; each must also expose ``parameters()``
            returning a list of trainable tensors.
    """

    def __init__(self, layers):
        self.layers = layers

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output.

        With an empty layer list the input is returned unchanged.
        """
        for layer in self.layers:
            x = layer(x)
        # Without this return the container silently produced None.
        return x

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the parameters of all layers as one flat list, in layer order."""
        return [p for layer in self.layers for p in layer.parameters()]

In [11]:
class MLPModel:
    """Training, sampling and evaluation wrapper around a layer container.

    Besides the wrapped model, the methods read these notebook-level names:
    ``x_train``/``y_train``, ``x_val``/``y_val``, ``x_test``/``y_test`` (dataset
    tensors), ``itos`` (id -> character mapping) and the ``BatchNorm1d`` class.

    Args:
        model: object exposing ``layers`` (list of callables) and
            ``parameters()`` (list of trainable tensors).

    Attributes:
        parameters: list of trainable tensors captured at construction time.
        n_parameters: total number of trainable scalars.
        model_type: ``"BatchNorm_MLP_Model"`` if any layer is a ``BatchNorm1d``,
            otherwise ``"Plain_MLP_Model"``.
    """

    def __init__(self, model):
        self.model = model
        self.layers = self.model.layers
        self.parameters = model.parameters()
        self.n_parameters = sum(p.nelement() for p in self.parameters)
        self.model_type = self.check_model()
        print(f"{self.model_type} registered with Learnable Parameters: {self.n_parameters}")

    def _set_training(self, mode: bool) -> None:
        """Set the ``training`` flag of EVERY BatchNorm1d layer to ``mode``."""
        for layer in self.layers:
            if isinstance(layer, BatchNorm1d):
                layer.training = mode

    def forward(self, x):
        """Run ``x`` through all layers in order and return the final output."""
        for layer in self.layers:
            x = layer(x)
        return x

    def __call__(self, x):
        return self.forward(x)

    def generate_names(self, num_names: int = 5, block_size: int = 3):
        """Sample and print ``num_names`` names, one character at a time.

        Sampling starts from a context of ``block_size`` zeros and stops when
        id 0 is drawn. A generator seeded with 42 drives the sampling.
        BatchNorm layers are switched to (and left in) inference mode.
        """
        print("-" * 40)
        print(f"Generating names from {self.model_type}")
        g = torch.Generator().manual_seed(42)

        # Switch once up front instead of on every sampled character.
        self._set_training(False)

        # Sampling needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for _ in range(num_names):
                out = []
                context = [0] * block_size
                while True:
                    logits = self.forward(torch.tensor([context]))
                    probs = F.softmax(logits, dim = 1)
                    idx = torch.multinomial(probs, num_samples=1, generator=g).item()
                    context = context[1:] + [idx]
                    out.append(idx)
                    if idx == 0:
                        break

                print(''.join(itos[i] for i in out))

    def train_model(self, lr: float = 0.01, epochs: int = 200000, batch_size: int = 32):
        """Train with plain mini-batch SGD on ``x_train`` / ``y_train``.

        Args:
            lr: learning rate of the SGD update.
            epochs: number of mini-batch steps (one random batch per step).
            batch_size: number of examples drawn, with replacement, per step.

        The loss of every 10000th step is printed.
        """
        print("-" * 40)
        print(f"Training {self.model_type} | Epochs: {epochs} | lr: {lr}")
        print("-" * 40)

        # generate_names / eval_loss leave BatchNorm layers in inference mode;
        # make sure batch statistics are used (and tracked) while training.
        self._set_training(True)

        for i in range(epochs):
            # mini-batch processing
            rand_idx = torch.randint(0, x_train.shape[0], (batch_size,))

            logits = self.forward(x_train[rand_idx])
            loss = F.cross_entropy(logits, y_train[rand_idx])

            # Backward pass
            for p in self.parameters:
                p.grad = None

            loss.backward()

            # SGD step; parameters that received no gradient are left as-is.
            with torch.no_grad():
                for p in self.parameters:
                    if p.grad is not None:
                        p -= lr * p.grad

            if i % 10000 == 0:
                print(f"{i} / {epochs} Loss: {loss.item()}")

    def check_model(self):
        """Classify the model by whether it contains a BatchNorm1d layer.

        Stores the result in ``self.model_type`` and returns it.
        """
        has_batchnorm = any(isinstance(layer, BatchNorm1d) for layer in self.layers)
        self.model_type = "BatchNorm_MLP_Model" if has_batchnorm else "Plain_MLP_Model"
        return self.model_type

    # Evaluate the loss on a full dataset split
    def eval_loss(self, split):
        """Print and return the cross-entropy loss on one whole split.

        Args:
            split: ``"train"``, ``"test"`` or ``"val"``.

        Returns:
            A scalar tensor detached from autograd (computed under ``no_grad``).

        Raises:
            ValueError: for any other ``split`` value.

        BatchNorm layers are switched to (and left in) inference mode.
        """
        if split == "train":
            x_data, y_data = x_train, y_train
        elif split == "test":
            x_data, y_data = x_test, y_test
        elif split == "val":
            x_data, y_data = x_val, y_val
        else:
            raise ValueError("split must be 'train', 'test', or 'val'")

        # Every BatchNorm layer must use its running statistics, not only the
        # first one; otherwise evaluation also perturbs those statistics.
        self._set_training(False)

        with torch.no_grad():
            loss_val = F.cross_entropy(self.forward(x_data), y_data)

        print(f"Loss on {split} split = {loss_val}")
        return loss_val

### Defining the models' architectures (with and without batch normalization)

In [5]:
# NOTE: this generator is created but not handed to any layer below
# (Embeddings and Linear draw their initial weights from the global RNG).
g = torch.Generator().manual_seed(42)
n_embedings = 10
vocab_size = len(chars) + 1
block_size = 3
in_features = n_embedings * block_size
out_features = 200


def _make_mlp(use_batchnorm):
    """Assemble embedding -> flatten -> 6 hidden blocks -> output projection.

    Each hidden block is Linear (+ BatchNorm1d when ``use_batchnorm``) + Tanh.
    Layers are constructed strictly in network order.
    """
    stack = [Embeddings(vocab_size, n_embedings), Flatten()]
    fan_in = in_features
    for _ in range(6):
        stack.append(Linear(fan_in, out_features))
        if use_batchnorm:
            stack.append(BatchNorm1d(out_features))
        stack.append(Tanh())
        fan_in = out_features
    stack.append(Linear(out_features, vocab_size))
    return Sequential(stack)


# Plain deep MLP first, then the same network with batch normalization.
model_1 = _make_mlp(use_batchnorm = False)
model_2 = _make_mlp(use_batchnorm = True)


# Register the Model for further tracking
model_1 = MLPModel(model = model_1)
model_2 = MLPModel(model = model_2)

Plain_MLP_Model registered with Learnable Parameters: 212897
BatchNorm_MLP_Model registered with Learnable Parameters: 215297


### Training both models (losses on the train, test and validation splits are evaluated further below)

In [6]:
# Train the Models
# The plain MLP uses the default learning rate; the BatchNorm MLP a 5x higher one.
model_1.train_model(lr = 0.01)
model_2.train_model(lr = 0.05)

----------------------------------------
Training Plain_MLP_Model | Epochs: 200000 | lr: 0.01
----------------------------------------
0 / 200000 Loss: 3.3732125759124756
10000 / 200000 Loss: 2.1990609169006348
20000 / 200000 Loss: 1.8497204780578613
30000 / 200000 Loss: 2.3937573432922363
40000 / 200000 Loss: 2.212120771408081
50000 / 200000 Loss: 2.159064769744873
60000 / 200000 Loss: 1.6576730012893677
70000 / 200000 Loss: 1.8349465131759644
80000 / 200000 Loss: 1.96515953540802
90000 / 200000 Loss: 1.7478300333023071
100000 / 200000 Loss: 2.0020923614501953
110000 / 200000 Loss: 2.100942611694336
120000 / 200000 Loss: 2.4784464836120605
130000 / 200000 Loss: 1.7515544891357422
140000 / 200000 Loss: 2.178866147994995
150000 / 200000 Loss: 2.39397931098938
160000 / 200000 Loss: 2.372723340988159
170000 / 200000 Loss: 1.6358914375305176
180000 / 200000 Loss: 1.974360466003418
190000 / 200000 Loss: 2.4713988304138184
----------------------------------------
Training BatchNorm_MLP_Model

### Generating names from our trained models

In [None]:
# generate some words from different Models
# block_size must be passed by keyword: the first positional parameter of
# generate_names is num_names, so a positional block_size would silently be
# interpreted as the number of names to sample.
model_1.generate_names(block_size = block_size)
model_2.generate_names(block_size = block_size)

----------------------------------------
Generating names from Plain_MLP_Model
yeosyah.
malin.
dloey.
skylinny.
nicyannachel.
----------------------------------------
Generating names from BatchNorm_MLP_Model
yessy.
theodor.
dece.
khalei.
nya.


In [8]:
# Report the loss of both trained models on the train, test and validation splits.
# eval_loss prints each value itself; looping (instead of ending the cell on a
# bare call) keeps the returned tensor from being echoed as extra cell output.
print("Plain_MLP_Model")
for split in ("train", "test", "val"):
    model_1.eval_loss(split)
print("-" * 40)
print("BatchNorm_MLP_Model")
for split in ("train", "test", "val"):
    model_2.eval_loss(split)

Plain_MLP_Model
Loss on train split = 2.0000603199005127
Loss on test split = 2.0938944816589355
Loss on val split = 2.100276231765747
----------------------------------------
BatchNorm_MLP_Model
Loss on train split = 2.0005698204040527
Loss on test split = 2.103998899459839
Loss on val split = 2.1080944538116455


tensor(2.1081, grad_fn=<NllLossBackward0>)

## **Summary of 3\_MLP\_v4.ipynb**

### **Objective**

To train **deep MLPs** for character-level name generation using **from-scratch implementations** of the neural network layers, explicitly handling parameter initialization, the forward pass, and the SGD parameter update, while backpropagation itself is delegated to PyTorch autograd (`loss.backward()`).

### **Implemented Components**

* **Linear layer (custom)**: with Xavier initialization and optional bias.
* **BatchNorm1d (custom)**: with running mean/variance tracking and learnable `γ` (scale) and `β` (shift).
* **Tanh (custom)**: simple wrapper around `torch.tanh`.
* **MLPModel (custom)**:

  * Tracks parameters.
  * Defines `forward`, `train_model`, and `generate_names`.
  * Differentiates automatically between `Plain_MLP_Model` and `BatchNorm_MLP_Model`.

### **Experiment Setup**

* **Embedding table** (the `weight` matrix of the custom `Embeddings` layer) initialized with `torch.randn` as trainable parameters.
* Two MLP variants trained:

  1. **Plain\_MLP\_Model** → deep stack of Linear + Tanh.
  2. **BatchNorm\_MLP\_Model** → same, but with BatchNorm1d between every Linear and Tanh.
* Both models trained with **mini-batch gradient descent** and tested for **name generation quality**.

---

## **Conclusion**

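* `v4` represents the **first full from-scratch deep learning framework** built in the project (Embeddings, Linear, BatchNorm, Tanh activation, a `Sequential` container, and a manual SGD training loop); the loss (`F.cross_entropy`) and the gradients (autograd) still come from PyTorch.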
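* **Batch Normalization integration** works even in this low-level implementation:

  * It is intended to give more stable gradient flow through the six hidden layers (gradient statistics are not measured directly in this notebook),
  * It trains stably with a 5× higher learning rate (`lr = 0.05` vs `0.01` for the plain MLP) and reaches a comparable loss in this run (train ≈ 2.00 for both models; test 2.104 vs 2.094 and validation 2.108 vs 2.100 for the plain MLP).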
* By abstracting layers into composable modules, this notebook sets the stage for:

  * Easier **extension into RNNs, GRUs, Transformers**,
  * Explicit experimentation with **weight initialization, activations, and normalization strategies** at the code level.
* Compared to `v3`, this notebook transitions from **“experimenting with PyTorch MLPs”** → to **“building a neural network framework by hand.”**
