### Target: 30 mins | Completed: 90 mins

In [1]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch

In [2]:
with open("data.txt", 'r') as f:
    word = f.read()
words = word.split()

In [3]:
vocab = list(sorted(set("".join(words))))

In [4]:
stoi = {}
stoi["."] = 0
for i, x in enumerate(vocab):
    stoi[x] = i + 1

itos = {}
itos[0] = "."
for i, x in enumerate(vocab):
    itos[i+1] = x

In [5]:
words[0]

'emma'

In [6]:
context_length = 3
x, y = [0] * context_length, [0] * context_length
actual_x = []
for w in words[0]:
    w = w + "."
    for ch1, ch2 in zip(w, w[1:]):
        actual_x.append(x[1:] + [ch1])
        y.append(ch2)
actual_x

[[0, 0, 'e'], [0, 0, 'm'], [0, 0, 'm'], [0, 0, 'a']]

In [7]:
# x.shape = [N, num_features]
import torch
embd_size = 3
x = torch.randint(low = 0, high = 26, size = (10, 3))
y_true = torch.randint(low = 1, high = 27, size = (10, ))
embeddings = torch.rand((len(vocab)+1), embd_size)
xenc = embeddings[x]

In [8]:
xenc.shape

torch.Size([10, 3, 3])

In [9]:
class LayerNorm(nn.Module):
    def __init__(self, in_features, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(in_features))
        self.beta = nn.Parameter(torch.zeros(in_features))
    
    def forward(self, x):
        x_norm = (x - x.mean(dim = 0)) / (x.std() + self.eps)
        return x_norm * self.gamma + self.beta

In [10]:
import torch.nn as nn
class LinearLayer(nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.weight = nn.Parameter(torch.rand(self.in_features, self.out_features))
        self.bias = nn.Parameter(torch.rand(self.out_features))

    def forward(self, x):
        return x @ self.weight + self.bias

In [11]:
xenc.shape

torch.Size([10, 3, 3])

In [12]:
class MLP(nn.Module):
    def __init__(self, LinearLayer: LinearLayer, LayerNorm: LayerNorm, eps: float, in_features: int, out_features: int, hidden_features: int):
        super().__init__()
        self.in_features = in_features
        self.layer_1 = LinearLayer(in_features, hidden_features)
        self.layer_2 = LinearLayer(hidden_features, hidden_features)
        self.layer_3 = LinearLayer(hidden_features, hidden_features)
        self.output_4 = LinearLayer(hidden_features, out_features)
        self.layer_norm = LayerNorm(in_features = hidden_features, eps = eps)

        self.model = nn.Sequential(
            LinearLayer(in_features, hidden_features),
            LayerNorm(in_features = hidden_features, eps = eps),
            nn.Tanh(),

            LinearLayer(hidden_features, hidden_features),
            LayerNorm(in_features = hidden_features, eps = eps),
            nn.Tanh(),

            LinearLayer(hidden_features, hidden_features),
            LayerNorm(in_features = hidden_features, eps = eps),
            nn.Tanh(),

            LinearLayer(hidden_features, out_features)
        )

    
    def forward(self, x):
        x = x.view(-1, self.in_features)
        logits = self.model(x)
        return logits

In [13]:
model = MLP(LinearLayer = LinearLayer, LayerNorm = LayerNorm, eps = 1e-6, in_features = x.shape[1], out_features = len(vocab) + 1, hidden_features = 10)
logits = model(xenc)

In [14]:
logits.shape

torch.Size([30, 27])

In [15]:
## Completed till here in 30 mins

In [16]:
# ----- Before 30 mins ----
# Completed: LayerNorm, LinearLayer, Network-design and connection of Layers, forward pass

# ----- After 30 mins ----
# Remaining: Loss calculation + Probs + Logits + Training

In [17]:
# Probs
exp_logits = torch.exp(logits)
probs = exp_logits / exp_logits.sum(dim = 1, keepdim = True)

# Loss Calculation
loss = -torch.log(probs[torch.arange(0, len(y_true)), y_true]).sum() / len(y_true)
print(f"Loss Before Training: {loss:.3f}")

Loss Before Training: 3.343


In [18]:
## Optimization
optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.01)

In [19]:
epochs = 1000
for epoch in range(epochs):
    logits = model(xenc)

    exp_logits = torch.exp(logits)
    probs = exp_logits / exp_logits.sum(dim = 1, keepdim = True)
    loss = -torch.log(probs[torch.arange(0, len(y_true)), y_true]).sum() / len(y_true)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"Epoch: {epoch} / {epochs} | loss = {loss:.3f}")

Epoch: 0 / 1000 | loss = 3.343
Epoch: 100 / 1000 | loss = 0.437
Epoch: 200 / 1000 | loss = 0.163
Epoch: 300 / 1000 | loss = 0.150
Epoch: 400 / 1000 | loss = 0.146
Epoch: 500 / 1000 | loss = 0.143
Epoch: 600 / 1000 | loss = 0.142
Epoch: 700 / 1000 | loss = 0.141
Epoch: 800 / 1000 | loss = 0.141
Epoch: 900 / 1000 | loss = 0.140


In [None]:
probs[torch.arange(0, len(y_true))]

tensor([7.8806e-06, 9.7376e-06, 9.7376e-06, 2.9362e-06, 4.3183e-06, 1.1187e-05,
        2.6178e-06, 4.3870e-06, 7.2243e-06, 6.4997e-06],
       grad_fn=<IndexBackward0>)