# BN - MLP - Torch_fied 

In [148]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np 
import matplotlib.pyplot as plt

In [152]:
data_folder = 'data/indian_names/'

Xtr = torch.load(data_folder + 'Xtr_bs_3.pt')
ytr = torch.load(data_folder + 'ytr_bs_3.pt')
Xdev = torch.load(data_folder + 'Xdev_bs_3.pt')
ydev = torch.load(data_folder + 'ydev_bs_3.pt')
Xte = torch.load(data_folder + 'Xte_bs_3.pt')
yte = torch.load(data_folder + 'yte_bs_3.pt')

BLOCK_SIZE = 3

Xtr.shape , ytr.shape, Xdev.shape, ydev.shape, Xte.shape, yte.shape

(torch.Size([217230, 3]),
 torch.Size([217230]),
 torch.Size([27277, 3]),
 torch.Size([27277]),
 torch.Size([27270, 3]),
 torch.Size([27270]))

In [153]:
names = open(data_folder + 'indian_names.csv').read().split('\n')
words = []
for n in names:
    words += n.split(' ')
words = [w for w in words if len(w) > 3] 
voc = '.' + ''.join(sorted(set(''.join(words))))
VOC_SIZE = len(voc)
voc , VOC_SIZE

('.abcdefghijklmnopqrstuvwxyz', 27)

In [154]:
def enc(c):
    return voc.index(c)

def dec(i):
    return voc[i]

In [187]:
class Linear:
    def __init__(self, fin, fout, bias = True):
        self.weight = torch.randn(fin, fout) / fin**0.5
        self.bias_exist = bias
        self.bias = torch.randn(fout) / fin**0.5 if bias else None 

    def parameters(self):
        return [self.weight, self.bias] if self.bias_exist is not None else [self.weight]
    
    def __call__(self, x):
        self.out  = x @self.weight 
        if self.bias_exist:
            self.out += self.bias
        return self.out

class BatchNormal1D:
    def __init__(self, dim , epsilon = 1e-5, momentum = 0.01 ):
        self.dim = dim 
        self.momentum = momentum
        self.epsilon = epsilon

        self.training = True
        
        self.gamma = torch.ones((1,self.dim))
        self.beta = torch.zeros((1,self.dim))

        self.mean_running = torch.zeros((1,self.dim))
        self.std_running = torch.ones((1,self.dim))

    def __call__(self, x):
        if self.training: 
            xmean = x.mean(dim=0 , keepdim=True)
            xstd = x.std(dim=0 , keepdim=True)
            with torch.no_grad():
                self.mean_running = (1-self.momentum) * self.mean_running + self.momentum * xmean
                self.std_running = (1-self.momentum) * self.std_running + self.momentum * xstd
        else :
            xmean = self.mean_running
            xstd = self.std_running
        
        self.out = ( (x - xmean ) / (xstd + self.epsilon) ) * self.gamma + self.beta
        return self.out
        
    def parameters(self):
        return [self.gamma, self.beta]
    
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []
    
class Embedding:
    def __init__(self, num_emb, dim_emb):
        self.weight = torch.randn(num_emb, dim_emb)
        
    def __call__(self, x):
        self.out = self.weight[x]
        return self.out
    
    def parameters(self):
        return [self.weight]
    
class Flatten:
    def __call__(self, x):
        self.out = x.view(x.shape[0], -1)
        return self.out
    
    def parameters(self):
        return []
    
class Sequential:
    def __init__(self, layers):
        self.layers = layers
        
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    
    def add_layers(self, layers):
        self.layers += layers
    
    def parameters(self):
        params = []
        for layer in self.layers:
            params += layer.parameters()
        return params

In [175]:
class BnMLP:
    def __init__(self,inputs, dims , b_size, hidden, outputs, n_layers):
        self.inputs = inputs
        self.dims = dims
        self.b_size = b_size
        self.hidden = hidden
        self.outputs = outputs
        self.n_layers = n_layers
        self.iterations_trained = 0
        self.current_train_loss = float('inf')
        
        self.model = Sequential([
            Embedding(self.inputs, self.dims),
            Flatten(),
            Linear(self.dims*self.b_size, self.hidden),
            BatchNormal1D(self.hidden),
            Tanh(),
        ])
        for i in range(self.n_layers - 1):
            self.model.add_layers(
                [
                    Linear(self.hidden, self.hidden),
                    BatchNormal1D(self.hidden),
                    Tanh(),
                ]
            )
        self.model.add_layers([Linear(self.hidden, self.outputs)])

        with torch.no_grad():
            for layer in self.model.layers[:-1]:
                if isinstance(layer, Linear):
                    layer.weight *= (5/3) # all inside (embedding) layers weight kammin init (solution 3)
                    if layer.bias is not None:
                        layer.bias *= (5/3)

            self.model.layers[-1].weight *= 0.1   # last layer weight init (solution 2)

        self.parameters = self.model.parameters()
        for p in self.parameters:
            p.requires_grad = True
    
    def forward(self, X):
        logits = self.model(X)
        return logits

    def backward(self, logits, y):
        loss = F.cross_entropy(logits, y)
        loss.backward()
        return loss
    
    @torch.no_grad()
    def evaluate(self, X, y):
        for layer in self.model.layers:
            layer.training = False
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        for layer in self.model.layers:
            layer.training = True
        return loss.item()
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            logits = self.forward(X[mini_batch])
            loss = self.backward(logits,y[mini_batch])
            
            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
                
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = :7}, loss = {loss.item():.5f}')
            
            # break 
        
        self.iterations_trained += max_iters
        self.current_train_loss = self.evaluate(X, y)
        return loss

    @torch.no_grad()
    def generate(self, start = '', max_len = 50):
        for layer in self.model.layers:
            layer.training = False
        word = '.'* self.b_size + start

        context = [0] * self.b_size + [enc(c) for c in start]
        for i in range(max_len):
            logits = self.model(torch.tensor([context]))
            p = F.softmax(logits, dim=1)
            word += dec(torch.multinomial(p[0], 1).item())
            if word[-1] == '.':
                break
            context = context[1:] + [enc(word[-1])]
        for layer in self.model.layers:
            layer.training = True
        return word[ self.b_size : -1]
    
    @torch.no_grad()
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])
    
    def __repr__(self):
        return f'BnMLP(Parm= {self.get_params_count()}, trained iter = {self.iterations_trained} , train loss = {self.current_train_loss:.5f})'


In [156]:
bnmlp = BnMLP( 
    inputs = VOC_SIZE, 
    dims = 10,
    b_size = BLOCK_SIZE,
    hidden = 100,
    outputs = VOC_SIZE,
    n_layers = 2
)

bnmlp.get_params_count()

16597

In [157]:
bnmlp.evaluate(Xtr, ytr)

3.299921989440918

In [162]:
bnmlp.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000) 

iter =       0, loss = 2.20014
iter =   10000, loss = 1.53865
iter =   20000, loss = 1.59183
iter =   30000, loss = 1.34117
iter =   40000, loss = 1.19861
iter =   50000, loss = 1.60139
iter =   60000, loss = 1.85033
iter =   70000, loss = 1.51381
iter =   80000, loss = 1.50735
iter =   90000, loss = 1.32940
iter =  100000, loss = 1.66106


tensor(1.6611, grad_fn=<NllLossBackward0>)

In [163]:
bnmlp
bnmlp.evaluate(Xdev, ydev)

BnMLP(Parm= 16597, trained iter = 101000 , train loss = 1.66106)

1.6003475189208984

In [164]:
for i in range(10):
    gens = bnmlp.generate('')
    while gens in words:
        gens = bnmlp.generate('')
    print(gens)

anjarayanku
zad
sagat
kumar
kumari
kumar
sundeeparshahampal
devi
kumar
shahul


In [165]:
bnmlp.evaluate(Xte, yte)

1.5996315479278564

In [166]:
def make_dataset(words, block_size = 3):
    X , y = [], []
    for word in words:
        word = '.'*block_size + word + '.'
        for i in range(len(word) - block_size):
            X.append([enc(k) for k in word[i:i+block_size]])
            y.append(enc(word[i+block_size]))
    return torch.tensor(X), torch.tensor(y)

import random

random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# Xtr,  ytr  = make_dataset(words[:n1] , block_size=8)     # 80%
# Xdev, ydev = make_dataset(words[n1:n2] , block_size=8)   # 10%
# Xte,  yte  = make_dataset(words[n2:] , block_size=8)     # 10%

# torch.save(Xtr, data_folder + 'Xtr_bs_8.pt')
# torch.save(ytr, data_folder + 'ytr_bs_8.pt')
# torch.save(Xdev, data_folder + 'Xdev_bs_8.pt')
# torch.save(ydev, data_folder + 'ydev_bs_8.pt')
# torch.save(Xte, data_folder + 'Xte_bs_8.pt')
# torch.save(yte, data_folder + 'yte_bs_8.pt')

Xtr = torch.load( data_folder + 'Xtr_bs_8.pt')
ytr = torch.load( data_folder + 'ytr_bs_8.pt')
Xdev = torch.load( data_folder + 'Xdev_bs_8.pt')
ydev = torch.load( data_folder + 'ydev_bs_8.pt')
Xte = torch.load( data_folder + 'Xte_bs_8.pt')
yte = torch.load( data_folder + 'yte_bs_8.pt')

BLOCK_SIZE = 8

In [167]:
bnmlp = BnMLP( 
    inputs = VOC_SIZE, 
    dims = 10,
    b_size = BLOCK_SIZE,
    hidden = 100,
    outputs = VOC_SIZE,
    n_layers = 2
)

bnmlp.get_params_count()

21597

In [168]:
bnmlp.evaluate(Xtr, ytr)

3.3852407932281494

In [169]:
bnmlp.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000) 

iter =       0, loss = 3.39185
iter =   10000, loss = 1.43268
iter =   20000, loss = 1.58156
iter =   30000, loss = 0.92972
iter =   40000, loss = 1.63019
iter =   50000, loss = 0.89624
iter =   60000, loss = 1.75053
iter =   70000, loss = 1.68838
iter =   80000, loss = 1.19775
iter =   90000, loss = 1.09068
iter =  100000, loss = 1.43582


tensor(1.4358, grad_fn=<NllLossBackward0>)

In [170]:
bnmlp
bnmlp.evaluate(Xdev, ydev)

BnMLP(Parm= 21597, trained iter = 100000 , train loss = 1.43582)

1.3510346412658691

1.4010320901870728

In [174]:
for i in range(10):
    gens = bnmlp.generate('')
    while gens in words:
        gens = bnmlp.generate('')
    print(gens)

bahata
deepal
haishat
parimat
halid
sugayal
maldis
gari
upandeep
kirtik


In [172]:
bnmlp.evaluate(Xte, yte)

1.4277682304382324

In [176]:
bnmlp3 = BnMLP( 
    inputs = VOC_SIZE, 
    dims = 10,
    b_size = BLOCK_SIZE,
    hidden = 100,
    outputs = VOC_SIZE,
    n_layers = 3
)

bnmlp3.get_params_count()
bnmlp3.evaluate(Xtr, ytr)
bnmlp3.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000) 
bnmlp3
bnmlp3.evaluate(Xdev, ydev)

31897

3.323640823364258

iter =       0, loss = 3.28724
iter =   10000, loss = 1.76120
iter =   20000, loss = 1.22678
iter =   30000, loss = 1.13796
iter =   40000, loss = 1.72559
iter =   50000, loss = 1.29263
iter =   60000, loss = 1.38998
iter =   70000, loss = 1.06027
iter =   80000, loss = 1.01515
iter =   90000, loss = 1.55126
iter =  100000, loss = 1.44673


tensor(1.4467, grad_fn=<NllLossBackward0>)

BnMLP(Parm= 31897, trained iter = 100000 , train loss = 1.32197)

1.3920589685440063

In [177]:
for i in range(10):
    gens = bnmlp3.generate('')
    while gens in words:
        gens = bnmlp3.generate('')
    print(gens)

nikkika
bhatham
sand
niik
amat
binti
yogke
narsijal
devy
vens


In [178]:
bnmlp3.evaluate(Xte, yte)

1.4093538522720337

# Wave Net

wave net  -> dilated convolution layers

In [230]:

class BatchNormal2D:
    def __init__(self, dim , epsilon = 1e-5, momentum = 0.01 ):
        self.dim = dim 
        self.momentum = momentum
        self.epsilon = epsilon

        self.training = True
        
        self.gamma = torch.ones((1,self.dim))
        self.beta = torch.zeros((1,self.dim))

        self.mean_running = torch.zeros((1,self.dim))
        self.std_running = torch.ones((1,self.dim))

    def __call__(self, x):
        if self.training: 
            xmean = x.mean(dim=(0 ,1 ), keepdim=True)
            xstd = x.std(dim=(0 ,1 ), keepdim=True)
            with torch.no_grad():
                self.mean_running = (1-self.momentum) * self.mean_running + self.momentum * xmean
                self.std_running = (1-self.momentum) * self.std_running + self.momentum * xstd
        else :
            xmean = self.mean_running
            xstd = self.std_running
        
        self.out = ( (x - xmean ) / (xstd + self.epsilon) ) * self.gamma + self.beta
        return self.out
        
    def parameters(self):
        return [self.gamma, self.beta]

class FlattenCons:
    def __init__(self,n):
        self.n = n

    def __call__(self, x):
        B, T, C = x.shape 
        self.out = x.view(B, T//self.n, C*self.n)
        if self.out.shape[1] == 1:
            self.out = self.out.squeeze(1) 
        return self.out 
    
    def parameters(self):
        return []

In [231]:
class WaveNet:
    def __init__(self,inputs, dims , b_size, hidden, outputs, n_layers):
        self.inputs = inputs
        self.dims = dims
        self.b_size = b_size
        self.hidden = hidden
        self.outputs = outputs
        self.n_layers = n_layers
        self.iterations_trained = 0
        self.current_train_loss = float('inf')
        
        self.model = Sequential([
            Embedding(self.inputs, self.dims),
            FlattenCons(2),
            Linear(self.dims*2, self.hidden),
            BatchNormal2D(self.hidden),
            Tanh(),
        ])
        for i in range(self.n_layers - 1):
            self.model.add_layers(
                [
                    FlattenCons(2),
                    Linear(self.hidden * 2 , self.hidden),
                    BatchNormal2D(self.hidden),
                    Tanh(),
                ]
            )
        self.model.add_layers([Linear(self.hidden, self.outputs)])

        with torch.no_grad():
            for layer in self.model.layers[:-1]:
                if isinstance(layer, Linear):
                    layer.weight *= (5/3) # all inside (embedding) layers weight kammin init (solution 3)
                    if layer.bias is not None:
                        layer.bias *= (5/3)

            self.model.layers[-1].weight *= 0.1   # last layer weight init (solution 2)

        self.parameters = self.model.parameters()
        for p in self.parameters:
            p.requires_grad = True
    
    def forward(self, X):
        logits = self.model(X)
        return logits

    def backward(self, logits, y):
        loss = F.cross_entropy(logits, y)
        loss.backward()
        return loss
    
    @torch.no_grad()
    def evaluate(self, X, y):
        for layer in self.model.layers:
            layer.training = False
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        for layer in self.model.layers:
            layer.training = True
        return loss.item()
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            logits = self.forward(X[mini_batch])
            loss = self.backward(logits,y[mini_batch])
            
            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
                
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = :7}, loss = {loss.item():.5f}')
            
            # break 
        
        self.iterations_trained += max_iters
        self.current_train_loss = self.evaluate(X, y)
        return loss

    @torch.no_grad()
    def generate(self, start = '', max_len = 50):
        for layer in self.model.layers:
            layer.training = False
        word = '.'* self.b_size + start

        context = [0] * self.b_size + [enc(c) for c in start]
        for i in range(max_len):
            logits = self.model(torch.tensor([context]))
            p = F.softmax(logits, dim=1)
            word += dec(torch.multinomial(p[0], 1).item())
            if word[-1] == '.':
                break
            context = context[1:] + [enc(word[-1])]
        for layer in self.model.layers:
            layer.training = True
        return word[ self.b_size : -1]
    
    @torch.no_grad()
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])
    
    def __repr__(self):
        return f'WaveNet(Parm= {self.get_params_count()}, trained iter = {self.iterations_trained} , train loss = {self.current_train_loss:.5f})'


In [222]:
wn = WaveNet( 
    inputs = VOC_SIZE, 
    dims = 10,
    b_size = BLOCK_SIZE,
    hidden = 64,
    outputs = VOC_SIZE,
    n_layers = 3
)

In [223]:
wn.get_params_count()

20265

In [224]:
logits = wn.model(Xtr[:4])
logits.shape

torch.Size([4, 27])

In [225]:
for layer in wn.model.layers:
    print(f'{layer.__class__.__name__:20}:{layer.out.shape}')
        

Embedding           :torch.Size([4, 8, 10])
FlattenCons         :torch.Size([4, 4, 20])
Linear              :torch.Size([4, 4, 64])
BatchNormal1D       :torch.Size([4, 4, 64])
Tanh                :torch.Size([4, 4, 64])
FlattenCons         :torch.Size([4, 2, 128])
Linear              :torch.Size([4, 2, 64])
BatchNormal1D       :torch.Size([4, 2, 64])
Tanh                :torch.Size([4, 2, 64])
FlattenCons         :torch.Size([4, 128])
Linear              :torch.Size([4, 64])
BatchNormal1D       :torch.Size([4, 64])
Tanh                :torch.Size([4, 64])
Linear              :torch.Size([4, 27])


In [227]:
wn.evaluate(Xtr, ytr)
wn.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)
wn

3.277979612350464

iter =       0, loss = 3.27792
iter =   10000, loss = 1.79500
iter =   20000, loss = 2.11098
iter =   30000, loss = 2.54206
iter =   40000, loss = 2.24428
iter =   50000, loss = 2.92413
iter =   60000, loss = 2.20618
iter =   70000, loss = 2.60189
iter =   80000, loss = 2.26698
iter =   90000, loss = 2.13544
iter =  100000, loss = 2.41999


tensor(2.4200, grad_fn=<NllLossBackward0>)

WaveNet(Parm= 20265, trained iter = 100001 , train loss = 2.45198)

In [228]:
wn.evaluate(Xdev, ydev)

for i in range(10):
    gens = wn.generate('')
    while gens in words:
        gens = wn.generate('')
    print(gens)

2.4466404914855957

kiryn
sumth
aartal
kulah
nemuh
tomusss
aaiyi
bnypidi
samga
kagga


In [229]:
wn.evaluate(Xte, yte)

2.4499893188476562

#### Bug fix : changed `BatchNorm1D` to `BatchNorm2D`

In [236]:
wn = WaveNet( 
    inputs = VOC_SIZE, 
    dims = 10,
    b_size = BLOCK_SIZE,
    hidden = 64,
    outputs = VOC_SIZE,
    n_layers = 3
)
wn.get_params_count()
logits = wn.model(Xtr[:4])
logits.shape
for layer in wn.model.layers:
    print(f'{layer.__class__.__name__:20}:{layer.out.shape}')

20265

torch.Size([4, 27])

Embedding           :torch.Size([4, 8, 10])
FlattenCons         :torch.Size([4, 4, 20])
Linear              :torch.Size([4, 4, 64])
BatchNormal2D       :torch.Size([4, 4, 64])
Tanh                :torch.Size([4, 4, 64])
FlattenCons         :torch.Size([4, 2, 128])
Linear              :torch.Size([4, 2, 64])
BatchNormal2D       :torch.Size([4, 2, 64])
Tanh                :torch.Size([4, 2, 64])
FlattenCons         :torch.Size([4, 128])
Linear              :torch.Size([4, 64])
BatchNormal2D       :torch.Size([4, 64])
Tanh                :torch.Size([4, 64])
Linear              :torch.Size([4, 27])


In [237]:
wn.evaluate(Xtr, ytr)
wn.sgd(Xtr, ytr, alpha = 0.1, max_iters = 100000)
wn

3.3284873962402344

iter =       0, loss = 3.29971
iter =   10000, loss = 1.53806
iter =   20000, loss = 1.54805
iter =   30000, loss = 1.66632
iter =   40000, loss = 1.27079
iter =   50000, loss = 1.31223
iter =   60000, loss = 1.38087
iter =   70000, loss = 1.42312
iter =   80000, loss = 1.35262
iter =   90000, loss = 1.28782
iter =  100000, loss = 1.34296


tensor(1.3430, grad_fn=<NllLossBackward0>)

WaveNet(Parm= 20265, trained iter = 100000 , train loss = 1.34357)

In [238]:
wn.evaluate(Xdev, ydev)

for i in range(10):
    gens = wn.generate('')
    while gens in words:
        gens = wn.generate('')
    print(gens)

1.3983938694000244

neerka
mahesad
hardarn
veerya
bhiloe
bhab
rekhar
heman
rajti
agad


In [239]:
wn.evaluate(Xte, yte)

1.4159326553344727