# Improving MLP model

In [1]:
import torch
import torch.nn.functional as F
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
names = open('./data/indian_names/indian_names.csv').read().split('\n')
names = [name for name in names if len(name) > 0]
words = []
for n in names:
    words += n.split(' ')
words = [w for w in words if len(w) > 3] 
words[:10]

['barjraj',
 'ramdin',
 'verma',
 'sharat',
 'chandran',
 'birender',
 'mandal',
 'amit',
 'kushal',
 'kasid']

In [3]:
from collections import Counter
k = Counter(words).items()
sorted(Counter(words).items(), key=lambda x: x[1], reverse=True)[:10]

[('kumar', 1779),
 ('singh', 1159),
 ('devi', 968),
 ('kumari', 551),
 ('pooja', 455),
 ('sharma', 431),
 ('jyoti', 249),
 ('deepak', 215),
 ('sunita', 202),
 ('rahul', 188)]

In [4]:
sorted(k, key=lambda x: (x[1], len(x[0])))[:10]

[('abhi', 1),
 ('paal', 1),
 ('axat', 1),
 ('jony', 1),
 ('azaz', 1),
 ('molu', 1),
 ('jang', 1),
 ('naag', 1),
 ('vude', 1),
 ('shsi', 1)]

In [5]:
voc = '.' + ''.join(sorted(set(''.join(words))))
voc_size = len(voc)
voc , voc_size

('.abcdefghijklmnopqrstuvwxyz', 27)

In [6]:
def enc(c):
    return voc.index(c)

def dec(i):
    return voc[i]

def make_dataset(words, block_size = 3):
    X , y = [], []
    for word in words:
        word = '.'*block_size + word + '.'
        for i in range(len(word) - block_size):
            X.append([enc(k) for k in word[i:i+block_size]])
            y.append(enc(word[i+block_size]))
    return torch.tensor(X), torch.tensor(y)

In [7]:
data_folder = './data/indian_names/'

In [8]:
import random

random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# Xtr,  ytr  = make_dataset(words[:n1] , block_size=3)     # 80%
# Xdev, ydev = make_dataset(words[n1:n2] , block_size=3)   # 10%
# Xte,  yte  = make_dataset(words[n2:] , block_size=3)     # 10%

# torch.save(Xtr, data_folder + 'Xtr_bs_3.pt')
# torch.save(ytr, data_folder + 'ytr_bs_3.pt')
# torch.save(Xdev, data_folder + 'Xdev_bs_3.pt')
# torch.save(ydev, data_folder + 'ydev_bs_3.pt')
# torch.save(Xte, data_folder + 'Xte_bs_3.pt')
# torch.save(yte, data_folder + 'yte_bs_3.pt')

In [22]:
Xtr = torch.load( data_folder + 'Xtr_bs_3.pt')
ytr = torch.load( data_folder + 'ytr_bs_3.pt')
Xdev = torch.load( data_folder + 'Xdev_bs_3.pt')
ydev = torch.load( data_folder + 'ydev_bs_3.pt')
Xte = torch.load( data_folder + 'Xte_bs_3.pt')
yte = torch.load( data_folder + 'yte_bs_3.pt')

In [23]:
Xtr.shape, Xdev.shape, Xte.shape

(torch.Size([217230, 3]), torch.Size([27277, 3]), torch.Size([27270, 3]))

## MLP 

In [79]:
class MLP:
    def __init__(self, inputs, dimensions, block_size, hidden, outputs):
        self.C  = torch.randn(inputs, dimensions, requires_grad=True) 
        self.W1 = torch.randn(dimensions * block_size, hidden, requires_grad=True)
        self.b1 = torch.randn(hidden, requires_grad=True)
        self.W2 = torch.randn(hidden, outputs, requires_grad=True)
        self.b2 = torch.randn(outputs, requires_grad=True)
        self.parameters =  [self.C, self.W1, self.b1, self.W2, self.b2]
        self.dimensions = dimensions
        self.block_size = block_size
        self.hidden = hidden
        self.outputs = outputs
        
    def forward(self, X):
        emb = self.C[X]
        h = torch.tanh(emb.view(-1, self.dimensions*self.block_size) @ self.W1 + self.b1)
        logits = h @ self.W2 + self.b2
        return logits
    
    def backward(self,logits, y):
        loss = F.cross_entropy(logits, y)
        loss.backward()
        return loss
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            logits = self.forward(X[mini_batch])
            loss = self.backward(logits,y[mini_batch])

            with torch.no_grad():
                for p in self.parameters:
                    p -= p.grad * alpha
                    p.grad = None
            
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = }, loss = {loss.item():.5f}')
        
        return loss
    
    @torch.no_grad()
    def get_loss(self, X, y):
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        return loss
    
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])


In [80]:
block_size = 3
dimensions = 10

mlp100 = MLP(
    inputs = voc_size, 
    dimensions = dimensions,
    block_size = block_size,
    hidden = 100,
    outputs = voc_size
)
f'no of parameters : {mlp100.get_params_count()}' 

'no of parameters : 6097'

In [81]:
mlp100.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)

iter = 0, loss = 20.96150
iter = 10000, loss = 2.17068
iter = 20000, loss = 1.69685
iter = 30000, loss = 1.76626
iter = 40000, loss = 1.46695
iter = 50000, loss = 1.51726
iter = 60000, loss = 1.39342
iter = 70000, loss = 1.27919
iter = 80000, loss = 1.52634
iter = 90000, loss = 1.54958
iter = 100000, loss = 1.89278


tensor(1.8928, grad_fn=<NllLossBackward0>)

In [82]:
print(f'train loss = \t\t{mlp100.get_loss(Xtr, ytr).item():.5f}\nvalidation loss = \t{mlp100.get_loss(Xdev, ydev).item():.5f}')

train loss = 		1.61364
validation loss = 	1.64401


In [83]:
mlp100.get_loss(Xte, yte).item()

1.64191734790802

## Problem 1:  
Initial loss is 19.44517, but if every character is uniformly random, the loss should be 1/voc_size = 1/27 = 0.0370370370 

### Cause :
softmax confidently wrong

the softmax function is squashing down slighlty higher values to +1/-1 's making the most of the losses going up, hence higher loss initally and takes unnecessarily long time to bring it down

### Sol: W2 init
decrease the weights by multiplying them with say $\lambda = 0.1$, which may keep most of the values in the range of tanh and keeps the outputs in -1 to 1 and reduce the loss for outler layer weights $W_2$ and $b_2$


In [84]:
class MLP:
    def __init__(self, inputs, dimensions, block_size, hidden, outputs, delta = 0.1):
        self.C  = torch.randn(inputs, dimensions) 
        self.W1 = torch.randn(dimensions * block_size, hidden)
        self.b1 = torch.randn(hidden)
        self.W2 = torch.randn(hidden, outputs) * delta
        self.b2 = torch.randn(outputs) * delta
        self.parameters =  [self.C, self.W1, self.b1, self.W2, self.b2]
        self.dimensions = dimensions
        self.block_size = block_size
        self.hidden = hidden
        self.outputs = outputs
        
        for p in self.parameters:
            p.requires_grad = True
        
    def forward(self, X):
        emb = self.C[X]
        h = torch.tanh(emb.view(-1, self.dimensions*self.block_size) @ self.W1 + self.b1)
        logits = h @ self.W2 + self.b2
        return logits
    
    def backward(self,logits, y):
        loss = F.cross_entropy(logits, y)
        loss.backward()
        return loss
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            logits = self.forward(X[mini_batch])
            loss = self.backward(logits,y[mini_batch])

            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
            
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = }, loss = {loss.item():.5f}')
        
        return loss
    
    @torch.no_grad()
    def get_loss(self, X, y):
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        return loss
    
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])


In [85]:
mlp100 = MLP(
    inputs = voc_size, 
    dimensions = dimensions,
    block_size = block_size,
    hidden = 100,
    outputs = voc_size,
    delta = 0.1
)

In [86]:
mlp100.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)
print(f'train loss = \t\t{mlp100.get_loss(Xtr, ytr).item():.5f}\nvalidation loss = \t{mlp100.get_loss(Xdev, ydev).item():.5f}')

iter = 0, loss = 3.87594
iter = 10000, loss = 1.69253
iter = 20000, loss = 1.73381
iter = 30000, loss = 1.58649
iter = 40000, loss = 1.65464
iter = 50000, loss = 1.51102
iter = 60000, loss = 2.01614
iter = 70000, loss = 1.99297
iter = 80000, loss = 1.76886
iter = 90000, loss = 1.28250
iter = 100000, loss = 1.78882


tensor(1.7888, grad_fn=<NllLossBackward0>)

train loss = 		1.60924
validation loss = 	1.64067


In [87]:
mlp100.get_loss(Xte, yte).item()

1.6404290199279785

### Report
some what solved as loss started at 3.57

## Problem 2:  
saturated tanh

### Cause :
the tanh function is squashing down slighlty higher values to +1/-1 's making the most of the losses going up, hence higher loss initally and takes unnecessarily long time to bring it down

### Sol: W1 init
decrease the weights by multiplying them with $\lambda = 0.1$, which may keep most of the values in the range of tanh and keeps the outputs in -1 to 1 and reduce the loss for $W_1$ and $b_2$


## Problem 3:  
But can we decide or derive an proper number to multiply with rather than guessing ?

### Sol: Kamming init 

multiply weights with :$\frac{gain}{fan mode}$

* for tanh : gain is $\frac{5}{3}$  hence $\frac{\frac{5}{3}}{\sqrt{fan_{in}}} $

* for ReLU : gain is $\sqrt{2}$  hence $\frac{\sqrt{2}}{\sqrt{fan_{in}}} $

In [88]:
class MLP:
    def __init__(self, inputs, dimensions, block_size, hidden, outputs, delta = 0.1):
        kammin_init = (5/3) / ((dimensions * block_size) ** 0.5) 
        self.C  = torch.randn(inputs, dimensions) 
        self.W1 = torch.randn(dimensions * block_size, hidden) * kammin_init
        self.b1 = torch.randn(hidden) * kammin_init
        self.W2 = torch.randn(hidden, outputs) * delta
        self.b2 = torch.randn(outputs) * delta
        self.parameters =  [self.C, self.W1, self.b1, self.W2, self.b2]
        self.dimensions = dimensions
        self.block_size = block_size
        self.hidden = hidden
        self.outputs = outputs
        
        for p in self.parameters:
            p.requires_grad = True
        
    def forward(self, X):
        emb = self.C[X]
        h = torch.tanh(emb.view(-1, self.dimensions*self.block_size) @ self.W1 + self.b1)
        logits = h @ self.W2 + self.b2
        return logits
    
    def backward(self,logits, y):
        loss = F.cross_entropy(logits, y)
        loss.backward()
        return loss
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            logits = self.forward(X[mini_batch])
            loss = self.backward(logits,y[mini_batch])

            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
            
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = }, loss = {loss.item():.5f}')
        
        return loss
    
    @torch.no_grad()
    def get_loss(self, X, y):
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        return loss
    
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])


In [89]:
mlp100 = MLP(
    inputs = voc_size, 
    dimensions = dimensions,
    block_size = block_size,
    hidden = 100,
    outputs = voc_size,
    delta = 0.1
)
mlp100.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)
print(f'train loss = \t\t{mlp100.get_loss(Xtr, ytr).item():.5f}\nvalidation loss = \t{mlp100.get_loss(Xdev, ydev).item():.5f}')

iter = 0, loss = 3.53868
iter = 10000, loss = 1.21541
iter = 20000, loss = 1.51428
iter = 30000, loss = 1.53383
iter = 40000, loss = 1.35891
iter = 50000, loss = 1.83586
iter = 60000, loss = 1.34913
iter = 70000, loss = 1.23212
iter = 80000, loss = 1.03981
iter = 90000, loss = 1.63102
iter = 100000, loss = 1.11431


tensor(1.1143, grad_fn=<NllLossBackward0>)

train loss = 		1.58817
validation loss = 	1.62430


In [90]:
mlp100.get_loss(Xte, yte).item()

1.6205750703811646

### Report
We had a better minimized loss 


## Problem 4:  
should we init these weights **precisely always** ?

No, As Modren innovations doesn't require these



### Sol 4.1: Batch Normalisation 
paper : [arXiv:1502.03167](https://arxiv.org/abs/1502.03167)


In [98]:
class MLP:
    def __init__(self, inputs, dimensions, block_size, hidden, outputs, delta = 0.1):
        self.inputs = inputs
        self.dimensions = dimensions
        self.block_size = block_size
        self.hidden = hidden
        self.outputs = outputs

        self.epsilon = 1e-6

        kammin_init = (5/3) / ((dimensions * block_size) ** 0.5) 

        self.C  = torch.randn(self.inputs, self.dimensions) 
        self.W1 = torch.randn(self.dimensions * self.block_size, hidden) * kammin_init
        # we no need b1 here as it gets canceled in batch normalization and grad is zero
        # also we can use gamma and beta to scale and shift the output of the batch normalization
        # self.b1 = torch.randn(self.hidden) * kammin_init 
        self.W2 = torch.randn(self.hidden, self.outputs) * delta
        self.b2 = torch.randn(self.outputs) * delta
        self.gamma = torch.ones((1,self.hidden))
        self.beta = torch.zeros((1,self.hidden))

        self.parameters =  [
            self.C, 
            self.W1, 
            # self.b1, 
            self.W2, 
            self.b2 , 
            self.gamma, 
            self.beta
        ]
        

        for p in self.parameters:
            p.requires_grad = True
        
    def forward(self, X):
        emb = self.C[X]
        h = emb.view(-1, self.dimensions*self.block_size) @ self.W1 # + self.b1) # batch normalization will cancel this anyway
        
        # batch NORMALIZATION
        h = (h - h.mean(dim=0 , keepdim=True)) / (h.std(dim=0 , keepdim=True) + self.epsilon) 
        
        # scale and shift
        h = h * self.gamma + self.beta
        h = torch.tanh(h)
        logits = h @ self.W2 + self.b2
        return logits
    
    def backward(self,logits, y):
        loss = F.cross_entropy(logits, y)
        loss.backward()
        return loss
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            logits = self.forward(X[mini_batch])
            loss = self.backward(logits,y[mini_batch])

            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
            
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = }, loss = {loss.item():.5f}')
        
        return loss
    
    @torch.no_grad()
    def get_loss(self, X, y):
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        return loss
    
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])


mlp100 = MLP(
    inputs = voc_size, 
    dimensions = dimensions,
    block_size = block_size,
    hidden = 100,
    outputs = voc_size,
    delta = 0.1
)
mlp100.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)
print(f'train loss = \t\t{mlp100.get_loss(Xtr, ytr).item():.5f}\nvalidation loss = \t{mlp100.get_loss(Xdev, ydev).item():.5f}')

iter = 0, loss = 3.39818
iter = 10000, loss = 1.56852
iter = 20000, loss = 1.83559
iter = 30000, loss = 2.00934
iter = 40000, loss = 1.44431
iter = 50000, loss = 1.60379
iter = 60000, loss = 1.83953
iter = 70000, loss = 1.31751
iter = 80000, loss = 2.00082
iter = 90000, loss = 1.45018
iter = 100000, loss = 1.88489


tensor(1.8849, grad_fn=<NllLossBackward0>)

train loss = 		1.59163
validation loss = 	1.61244


In [99]:
mlp100.get_loss(Xte, yte).item()

1.606314778327942


### Sol 4.2 : Batch Norm with buffers
paper : [arXiv:1502.03167](https://arxiv.org/abs/1502.03167)


In [100]:
class MLP:
    def __init__(self, inputs, dimensions, block_size, hidden, outputs, delta = 0.1):
        self.inputs = inputs
        self.dimensions = dimensions
        self.block_size = block_size
        self.hidden = hidden
        self.outputs = outputs

        self.epsilon = 1e-6

        kammin_init = (5/3) / ((dimensions * block_size) ** 0.5) 

        self.C  = torch.randn(self.inputs, self.dimensions) 
        self.W1 = torch.randn(self.dimensions * self.block_size, hidden) * kammin_init
        # we no need b1 here as it gets canceled in batch normalization and grad is zero
        # also we can use bn_mean and bn_std to scale and shift the output of the batch normalization
        # self.b1 = torch.randn(self.hidden) * kammin_init 
        self.W2 = torch.randn(self.hidden, self.outputs) * delta
        self.b2 = torch.randn(self.outputs) * delta
        
        self.bn_mean = torch.ones((1,self.hidden))
        self.bn_std = torch.zeros((1,self.hidden))

        self.parameters =  [
            self.C, 
            self.W1, 
            # self.b1, 
            self.W2, 
            self.b2 , 
            self.bn_mean, 
            self.bn_std
        ]
        
        for p in self.parameters:
            p.requires_grad = True
        
        self.mean_running = torch.zeros((1,self.hidden))
        self.std_running = torch.ones((1,self.hidden))

    def forward(self, X):
        '''linear layer'''
        emb = self.C[X]
        h = emb.view(-1, self.dimensions*self.block_size) @ self.W1 # + self.b1) # batch normalization will cancel this anyway
        
        '''batch NORMALIZATION layer'''
        h_mean = h.mean(dim=0 , keepdim=True)
        h_std = h.std(dim=0 , keepdim=True)
        h = ( (h - h_mean ) / (h_std + self.epsilon) ) * self.bn_mean + self.bn_std
        
        with torch.no_grad():
            retain_rate = 0.999
            self.mean_running = retain_rate * self.mean_running + (1-retain_rate) * h_mean
            self.std_running = retain_rate * self.std_running + (1-retain_rate) * h_std
        
        '''Non linear layer'''
        h = torch.tanh(h)

        logits = h @ self.W2 + self.b2
        return logits
    
    def backward(self,logits, y):
        loss = F.cross_entropy(logits, y)
        loss.backward()
        return loss
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            logits = self.forward(X[mini_batch])
            loss = self.backward(logits,y[mini_batch])

            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
            
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = }, loss = {loss.item():.5f}')
        
        return loss
    
    @torch.no_grad()
    def get_loss(self, X, y):
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        return loss
    
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])


mlp100 = MLP(
    inputs = voc_size, 
    dimensions = dimensions,
    block_size = block_size,
    hidden = 100,
    outputs = voc_size,
    delta = 0.1
)
mlp100.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)
print(f'train loss = \t\t{mlp100.get_loss(Xtr, ytr).item():.5f}\nvalidation loss = \t{mlp100.get_loss(Xdev, ydev).item():.5f}')

iter = 0, loss = 3.48807
iter = 10000, loss = 1.91980
iter = 20000, loss = 1.43032
iter = 30000, loss = 1.91190
iter = 40000, loss = 1.53877
iter = 50000, loss = 1.70825
iter = 60000, loss = 1.47717
iter = 70000, loss = 1.44520
iter = 80000, loss = 1.80084
iter = 90000, loss = 1.96703
iter = 100000, loss = 1.32094


tensor(1.3209, grad_fn=<NllLossBackward0>)

train loss = 		1.59625
validation loss = 	1.61634


In [116]:
mlp100.get_loss(Xte, yte).item() 

1.6165716648101807

### Report
decrease in loss and we have also got mean and std as buffers to compute for a single example rather than a batch for generation 

In [122]:
Xtr[:5]

tensor([[ 0,  0,  0],
        [ 0,  0, 18],
        [ 0, 18,  1],
        [18,  1, 22],
        [ 1, 22,  9]])

In [123]:
mlp100.forward(torch.tensor(Xtr[:5]))

  mlp100.forward(torch.tensor(Xtr[:5]))


tensor([[-1.4881,  2.5852,  0.8258, -0.1989,  1.8182, -2.7224,  0.2606,  0.9202,
          1.2704, -2.2601, -0.6772,  1.3893,  2.3911,  2.2940,  2.1166, -2.3313,
          1.5539, -5.3507,  1.8201,  4.9462, -0.7500, -1.9031,  0.5241, -2.1375,
         -4.6880, -2.0004, -3.2685],
        [-1.1312,  6.3215, -1.1254, -1.6310, -0.5798,  3.8771,  0.3257, -1.1302,
         -3.4537,  4.1822,  1.5650,  0.7608, -2.2792, -0.0647, -2.1940,  4.8553,
          0.1314, -0.3759, -0.8690,  0.2150, -0.5535,  3.7717,  1.1698,  0.0871,
         -3.9299, -1.2475, -4.5861],
        [-2.4576, -0.9401, -0.1559, -0.3427,  2.5042, -3.3280, -2.1423,  0.9801,
          2.6135, -1.6980,  3.2226,  0.7955,  0.4424,  2.7477,  1.6422, -4.2669,
         -1.9138, -4.7681,  0.2883,  1.4695,  0.6522, -1.5704,  2.0444, -0.0095,
         -2.5141, -1.5463, -1.4503],
        [ 6.2657,  3.4087, -0.6471, -1.1638, -0.1980,  5.5320, -4.3471, -1.9057,
          1.2012,  8.4168,  0.9023, -0.5622, -2.5121, -0.7826,  2.4607,  1.8017

# TORCH-IFYING
----

## TORCH.NN.LINEAR

In [10]:
class Linear:
    def __init__(self, fin, fout, bias = True):
        self.weight = torch.randn(fin, fout) / fin**0.5
        self.bias_exist = bias
        self.bias = torch.randn(fout) / fin**0.5 if bias else None 

    def parameters(self):
        return [self.weight, self.bias] if self.bias_exist is not None else [self.weight]
    
    def __call__(self, x):
        self.out  = x @self.weight 
        if self.bias_exist:
            self.out += self.bias
        return self.out

class BatchNormal1D:
    def __init__(self, dim , epsilon = 1e-5, momentum = 0.01 ):
        self.dim = dim 
        self.momentum = momentum
        self.epsilon = epsilon

        self.training = True
        
        self.gamma = torch.ones((1,self.dim))
        self.beta = torch.zeros((1,self.dim))

        self.mean_running = torch.zeros((1,self.dim))
        self.std_running = torch.ones((1,self.dim))

    def __call__(self, x):
        if self.training: 
            xmean = x.mean(dim=0 , keepdim=True)
            xstd = x.std(dim=0 , keepdim=True)
            with torch.no_grad():
                self.mean_running = (1-self.momentum) * self.mean_running + self.momentum * xmean
                self.std_running = (1-self.momentum) * self.std_running + self.momentum * xstd
        else :
            xmean = self.mean_running
            xstd = self.std_running
        
        self.out = ( (x - xmean ) / (xstd + self.epsilon) ) * self.gamma + self.beta
        return self.out
        
    def parameters(self):
        return [self.gamma, self.beta]
    
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []

In [263]:
inputs = voc_size
dims = 10
b_size =3
hidden = 100
outputs = voc_size

C = torch.randn(voc_size, dims) 
layers = [
    Linear(dims*b_size, hidden),    
    # BatchNormal1D(hidden),          
    Tanh(),

    Linear(hidden, hidden),         
    # BatchNormal1D(hidden),          
    Tanh(),

    Linear(hidden, hidden),         
    # BatchNormal1D(hidden),          
    Tanh(),

    Linear(hidden, outputs),
]

with torch.no_grad():
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weight *= (5/3) # all inside (embedding) layers weight kammin init (solution 3)
            if layer.bias is not None:
                layer.bias *= (5/3)

    layers[-1].weight *= 0.1   # last layer weight init (solution 2)

# init weights 
parameters = [C] + [p for layer in layers for p in layer.parameters()]
for p in parameters:
    p.requires_grad = True

In [264]:
sum(p.nelement() for p in parameters)

26297

In [265]:
def sgd( X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
    for iter in range(max_iters+1):
        mini_batch = torch.randint(0, X.shape[0], (batching_size,))

        emb = C[X[mini_batch]]
        h = emb.view(-1, dims*b_size)
        for layer in layers:
            h = layer(h)
        loss = F.cross_entropy(h, y[mini_batch])
        
        for layer in layers:
            layer.out.retain_grad()
        loss.backward()

        with torch.no_grad():
            for p in parameters:
                p.retain_grad()
                p -= p.grad * alpha
                p.grad = None
        
        if iter > max_iters *0.95:
            alpha = alpha / 100
        elif iter > max_iters *0.9:
            alpha = alpha / 10
        
        if verbose and iter % (max_iters/10) == 0:
            print(f'{iter = }, loss = {loss.item():.5f}')
    
    return loss

sgd(Xtr, ytr, alpha = 0.1, max_iters = 10000)

iter = 0, loss = 3.31261
iter = 1000, loss = 2.32022
iter = 2000, loss = 1.96970
iter = 3000, loss = 1.84116
iter = 4000, loss = 2.02868
iter = 5000, loss = 2.27159
iter = 6000, loss = 1.82919
iter = 7000, loss = 1.67293
iter = 8000, loss = 2.09506
iter = 9000, loss = 1.63244
iter = 10000, loss = 1.83713


tensor(1.8371, grad_fn=<NllLossBackward0>)

## BN Class for MLP

In [13]:
class BnMLP:
    def __init__(self,inputs, dims , b_size, hidden, outputs, n_layers):
        self.inputs = inputs
        self.dims = dims
        self.b_size = b_size
        self.hidden = hidden
        self.outputs = outputs
        self.n_layers = n_layers
        
        self.C = torch.randn(self.inputs, self.dims) 
        self.layers = []
        self.layers.append(Linear(self.dims*self.b_size, self.hidden))
        # self.layers.append(BatchNormal1D(self.hidden))
        self.layers.append(Tanh()) 
        for i in range(self.n_layers - 1):
            self.layers.append(Linear(self.hidden, self.hidden))
            # self.layers.append(BatchNormal1D(self.hidden))
            self.layers.append(Tanh()) 
        self.layers.append(Linear(self.hidden, self.outputs))

        with torch.no_grad():
            for layer in self.layers[:-1]:
                if isinstance(layer, Linear):
                    layer.weight *= (5/3) # all inside (embedding) layers weight kammin init (solution 3)
                    if layer.bias is not None:
                        layer.bias *= (5/3)

            self.layers[-1].weight *= 0.1   # last layer weight init (solution 2)

        self.parameters = [self.C] + [p for layer in self.layers for p in layer.parameters()]
        for p in self.parameters:
            p.requires_grad = True
    
    def forward(self, X):
        emb = self.C[X]
        h = emb.view(-1, self.dims*self.b_size)
        for layer in self.layers:
            h = layer(h)
        # h.requires_grad = True
        return h 

    def backward(self, logits, y):
        loss = F.cross_entropy(logits, y)
        # for layer in self.layers:
        #     layer.out.retain_grad()
        loss.backward()
        return loss
    
    # @torch.no_grad()
    def get_loss(self, X, y):
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        return loss
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            # logits = self.forward(X[mini_batch])
            # loss = self.backward(logits,y[mini_batch])
            emb = self.C[X[mini_batch]]
            h = emb.view(-1, self.dims*self.b_size)
            for layer in self.layers:
                h = layer(h)
            loss = F.cross_entropy(h, y[mini_batch])
            for layer in self.layers:
                layer.out.retain_grad()
            loss.backward()
            
            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
                
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = }, loss = {loss.item():.5f}')
        
        return loss

    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])
    


In [14]:
bnmlp = BnMLP(
    inputs = voc_size, 
    dims = 10,
    b_size = 3,
    hidden = 100,
    outputs = voc_size,
    n_layers = 2
)

bnmlp.get_params_count()
bnmlp.get_loss(Xtr, ytr).item()
bnmlp.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)

16197

3.3195345401763916

iter = 0, loss = 3.32930
iter = 10000, loss = 1.54642
iter = 20000, loss = 1.28443
iter = 30000, loss = 1.26273
iter = 40000, loss = 1.78174
iter = 50000, loss = 1.47434
iter = 60000, loss = 1.47202
iter = 70000, loss = 0.88945
iter = 80000, loss = 1.27649
iter = 90000, loss = 1.24785
iter = 100000, loss = 1.63052


tensor(1.6305, grad_fn=<NllLossBackward0>)

In [15]:
bnmlp.get_loss(Xtr, ytr).item()
bnmlp.get_loss(Xdev, ydev).item()
bnmlp.get_loss(Xte, yte).item()

1.5304666757583618

1.571450114250183

1.576159119606018

In [17]:
def generate(model, start = '', max_len = 50):
    word = '.'*model.b_size + start
    for i in range(max_len):
        x = [enc(k) for k in word[-model.b_size:]]
        x = torch.tensor([x])
        logits = model.forward(x)
        p = F.softmax(logits, dim=1)
        word += dec(torch.multinomial(p[0], 1).item())
        if word[-1] == '.':
            break
    return word[model.b_size : -1]

generate(bnmlp)

'nuatri'

In [18]:
for i in range(10):
    print(generate(bnmlp))

azurav
manil
shad
shya
jyoti
ajay
sima
ahmeena
shu
deep


In [19]:
for i in range(10):
    print(generate(bnmlp))

surav
shandania
ala
anjal
jan
budhardeepike
kumar
aaram
shakunti
taush


In [20]:
for i in range(10):
    print(generate(bnmlp))

abhaa
prya
sonika
radav
vipinku
ari
chansrat
shi
suraju
deep


## MLP with block size = 4

In [25]:
import random

random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# Xtr,  ytr  = make_dataset(words[:n1] , block_size=4)     # 80%
# Xdev, ydev = make_dataset(words[n1:n2] , block_size=4)   # 10%
# Xte,  yte  = make_dataset(words[n2:] , block_size=4)     # 10%

# torch.save(Xtr, data_folder + 'Xtr_bs_4.pt')
# torch.save(ytr, data_folder + 'ytr_bs_4.pt')
# torch.save(Xdev, data_folder + 'Xdev_bs_4.pt')
# torch.save(ydev, data_folder + 'ydev_bs_4.pt')
# torch.save(Xte, data_folder + 'Xte_bs_4.pt')
# torch.save(yte, data_folder + 'yte_bs_4.pt')

(torch.Size([217526, 4]), torch.Size([27115, 4]), torch.Size([27136, 4]))

In [26]:
Xtr = torch.load( data_folder + 'Xtr_bs_4.pt')
ytr = torch.load( data_folder + 'ytr_bs_4.pt')
Xdev = torch.load( data_folder + 'Xdev_bs_4.pt')
ydev = torch.load( data_folder + 'ydev_bs_4.pt')
Xte = torch.load( data_folder + 'Xte_bs_4.pt')
yte = torch.load( data_folder + 'yte_bs_4.pt')

In [27]:
Xtr.shape, Xdev.shape, Xte.shape

(torch.Size([217526, 4]), torch.Size([27115, 4]), torch.Size([27136, 4]))

In [28]:
bnmlp = BnMLP(
    inputs = voc_size, 
    dims = 10,
    b_size = 4,
    hidden = 100,
    outputs = voc_size,
    n_layers = 2
)

bnmlp.get_params_count()


17197

In [29]:
bnmlp.get_loss(Xtr, ytr).item()


3.2590951919555664

In [30]:
bnmlp.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)

iter = 0, loss = 3.31006
iter = 10000, loss = 1.24934
iter = 20000, loss = 1.14485
iter = 30000, loss = 1.46663
iter = 40000, loss = 1.70179
iter = 50000, loss = 1.58840
iter = 60000, loss = 1.47759
iter = 70000, loss = 1.26664
iter = 80000, loss = 1.60378
iter = 90000, loss = 1.56357
iter = 100000, loss = 1.40011


tensor(1.4001, grad_fn=<NllLossBackward0>)

In [31]:
bnmlp.get_loss(Xtr, ytr).item()
bnmlp.get_loss(Xdev, ydev).item()
bnmlp.get_loss(Xte, yte).item()

1.394792079925537

1.4625132083892822

1.4424569606781006

In [32]:
for i in range(10):
    print(generate(bnmlp))

kumari
pooja
kanchal
irti
tinkar
sureshma
sharma
dikshankaj
nitipal
ashok


In [33]:
for i in range(10):
    print(generate(bnmlp))

singh
sunder
sahni
radhan
deep
tanve
rout
kumarkash
devi
aashan


In [34]:
for i in range(10):
    print(generate(bnmlp))

kumar
pandeepak
salma
deepak
dibashida
kanchika
sunita
bharamjeet
ajay
deen


In [35]:
bnmlp2 = BnMLP(
    inputs = voc_size, 
    dims = 10,
    b_size = 4,
    hidden = 100,
    outputs = voc_size,
    n_layers = 3
)

bnmlp2.get_params_count()

27297

In [36]:
bnmlp2.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)

iter = 0, loss = 3.34225
iter = 10000, loss = 1.76189
iter = 20000, loss = 1.30926
iter = 30000, loss = 1.53168
iter = 40000, loss = 1.75249
iter = 50000, loss = 1.29392
iter = 60000, loss = 1.07405
iter = 70000, loss = 1.58285
iter = 80000, loss = 1.69981
iter = 90000, loss = 1.49916
iter = 100000, loss = 1.44929


tensor(1.4493, grad_fn=<NllLossBackward0>)

In [37]:
bnmlp2.get_loss(Xtr, ytr).item()
bnmlp2.get_loss(Xdev, ydev).item()
bnmlp2.get_loss(Xte, yte).item()

1.3560278415679932

1.4276996850967407

1.42019522190094

In [38]:
for i in range(10):
    print(generate(bnmlp2))

deeeender
singh
kumar
kumar
resailasha
dhary
kalurvna
amit
mavaoo
kavita


In [39]:
for i in range(10):
    print(generate(bnmlp2))

pinki
simran
sunita
tosh
swagtiktru
varshadiya
arora
saima
ravinashok
savita


In [40]:
for i in range(10):
    print(generate(bnmlp2))

sibharti
sartka
kumar
devi
singh
kallu
pinki
chandey
sureshrrer
deepali


In [49]:
len(set(names))
wasted_trails = 0
for i in range(5):
    gens = generate(bnmlp2)
    while gens in names:
        wasted_trails += 1
        gens = generate(bnmlp2)
    print(gens)

13139

gayam
ravind
taramnesh
rajputida
kuldeepa
