# WAVE NET

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np 
import matplotlib.pyplot as plt

In [2]:
data_folder = 'data/indian_names/'

Xtr = torch.load(data_folder + 'Xtr_bs_3.pt')
ytr = torch.load(data_folder + 'ytr_bs_3.pt')
Xdev = torch.load(data_folder + 'Xdev_bs_3.pt')
ydev = torch.load(data_folder + 'ydev_bs_3.pt')
Xte = torch.load(data_folder + 'Xte_bs_3.pt')
yte = torch.load(data_folder + 'yte_bs_3.pt')

Xtr.shape , ytr.shape, Xdev.shape, ydev.shape, Xte.shape, yte.shape

(torch.Size([217230, 3]),
 torch.Size([217230]),
 torch.Size([27277, 3]),
 torch.Size([27277]),
 torch.Size([27270, 3]),
 torch.Size([27270]))

In [3]:
names = open(data_folder + 'indian_names.csv').read().split('\n')
words = []
for n in names:
    words += n.split(' ')
words = [w for w in words if len(w) > 3] 
voc = '.' + ''.join(sorted(set(''.join(words))))
voc_size = len(voc)
voc , voc_size

('.abcdefghijklmnopqrstuvwxyz', 27)

In [4]:
def enc(c):
    return voc.index(c)

def dec(i):
    return voc[i]

In [77]:
class Linear:
    def __init__(self, fin, fout, bias = True):
        self.weight = torch.randn(fin, fout) / fin**0.5
        self.bias_exist = bias
        self.bias = torch.randn(fout) / fin**0.5 if bias else None 

    def parameters(self):
        return [self.weight, self.bias] if self.bias_exist is not None else [self.weight]
    
    def __call__(self, x):
        self.out  = x @self.weight 
        if self.bias_exist:
            self.out += self.bias
        return self.out

class BatchNormal1D:
    def __init__(self, dim , epsilon = 1e-5, momentum = 0.01 ):
        self.dim = dim 
        self.momentum = momentum
        self.epsilon = epsilon

        self.training = True
        
        self.gamma = torch.ones((1,self.dim))
        self.beta = torch.zeros((1,self.dim))

        self.mean_running = torch.zeros((1,self.dim))
        self.std_running = torch.ones((1,self.dim))

    def __call__(self, x):
        if self.training: 
            xmean = x.mean(dim=0 , keepdim=True)
            xstd = x.std(dim=0 , keepdim=True)
            with torch.no_grad():
                self.mean_running = (1-self.momentum) * self.mean_running + self.momentum * xmean
                self.std_running = (1-self.momentum) * self.std_running + self.momentum * xstd
        else :
            xmean = self.mean_running
            xstd = self.std_running
        
        self.out = ( (x - xmean ) / (xstd + self.epsilon) ) * self.gamma + self.beta
        return self.out
        
    def parameters(self):
        return [self.gamma, self.beta]
    
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []
    
class Embedding:
    def __init__(self, num_emb, dim_emb):
        self.weight = torch.randn(num_emb, dim_emb)
        
    def __call__(self, x):
        self.out = self.weight[x]
        return self.out
    
    def parameters(self):
        return [self.weight]
    
class Flatten:
    def __call__(self, x):
        self.out = x.view(x.shape[0], -1)
        return self.out
    
    def parameters(self):
        return []

In [82]:
class BnMLP:
    def __init__(self,inputs, dims , b_size, hidden, outputs, n_layers):
        self.inputs = inputs
        self.dims = dims
        self.b_size = b_size
        self.hidden = hidden
        self.outputs = outputs
        self.n_layers = n_layers
        self.iterations_trained = 0
        self.current_train_loss = float('inf')
        
        self.layers = [
            Embedding(self.inputs, self.dims),
            Flatten(),
            Linear(self.dims*self.b_size, self.hidden),
            BatchNormal1D(self.hidden),
            Tanh(),
        ]
        for i in range(self.n_layers - 1):
            self.layers.append(Linear(self.hidden, self.hidden))
            self.layers.append(BatchNormal1D(self.hidden))
            self.layers.append(Tanh()) 
        self.layers.append(Linear(self.hidden, self.outputs))

        with torch.no_grad():
            for layer in self.layers[:-1]:
                if isinstance(layer, Linear):
                    layer.weight *= (5/3) # all inside (embedding) layers weight kammin init (solution 3)
                    if layer.bias is not None:
                        layer.bias *= (5/3)

            self.layers[-1].weight *= 0.1   # last layer weight init (solution 2)

        # self.parameters = [self.C] + [p for layer in self.layers for p in layer.parameters()]
        self.parameters = [p for layer in self.layers for p in layer.parameters()]
        for p in self.parameters:
            p.requires_grad = True
    
    def forward(self, X):
        h = X 
        for layer in self.layers:
            h = layer(h)
        # h.requires_grad = True
        return h 

    def backward(self, logits, y):
        loss = F.cross_entropy(logits, y)
        # for layer in self.layers:
        #     layer.out.retain_grad()
        loss.backward()
        return loss
    
    @torch.no_grad()
    def evaluate(self, X, y):
        for layer in self.layers:
            layer.training = False
        logits = self.forward(X)
        loss = F.cross_entropy(logits, y)
        for layer in self.layers:
            layer.training = True
        return loss.item()
    
    def sgd(self, X, y, alpha = 0.1 , batching_size = 32, max_iters = 1000, verbose = True):
        for iter in range(max_iters+1):
            mini_batch = torch.randint(0, X.shape[0], (batching_size,))
            # logits = self.forward(X[mini_batch])
            # loss = self.backward(logits,y[mini_batch])
            h = X[mini_batch]
            for layer in self.layers:
                h = layer(h)
            loss = F.cross_entropy(h, y[mini_batch])
            for layer in self.layers:
                layer.out.retain_grad()
            loss.backward()
            
            with torch.no_grad():
                for p in self.parameters:
                    p.retain_grad()
                    p -= p.grad * alpha
                    p.grad = None
                
            if iter > max_iters *0.95:
                alpha = alpha / 100
            elif iter > max_iters *0.9:
                alpha = alpha / 10
            
            if verbose and iter % (max_iters/10) == 0:
                print(f'{iter = :7}, loss = {loss.item():.5f}')
            
            # break 
        
        self.iterations_trained += max_iters
        self.current_train_loss = loss.item()
        return loss

    @torch.no_grad()
    def generate(self, start = '', max_len = 50):
        for layer in self.layers:
            layer.training = False
        word = '.'* self.b_size + start

        for i in range(max_len):
            x = [enc(k) for k in word[- self.b_size:]]
            x = torch.tensor([x])
            logits =  self.forward(x)
            p = F.softmax(logits, dim=1)
            word += dec(torch.multinomial(p[0], 1).item())
            if word[-1] == '.':
                break
        for layer in self.layers:
            layer.training = True
        return word[ self.b_size : -1]
    
    @torch.no_grad()
    def get_params_count(self):
        return sum([p.numel() for p in self.parameters])
    
    def __repr__(self):
        return f'BnMLP(trained iter = {self.iterations_trained} , train loss = {self.current_train_loss:.5f})'


In [83]:
bnmlp = BnMLP(
    inputs = voc_size, 
    dims = 10,
    b_size = 8,
    hidden = 100,
    outputs = voc_size,
    n_layers = 2
)

bnmlp.get_params_count()

21597

In [84]:
bnmlp.evaluate(Xtr, ytr)

3.331407308578491

In [85]:
bnmlp.sgd(Xtr, ytr, alpha = 0.1, max_iters = 1000) 

iter =       0, loss = 3.35256
iter =     100, loss = 2.15884
iter =     200, loss = 2.27844
iter =     300, loss = 2.04755
iter =     400, loss = 2.32729
iter =     500, loss = 2.21938
iter =     600, loss = 2.55699
iter =     700, loss = 1.48905
iter =     800, loss = 1.83579
iter =     900, loss = 1.77245
iter =    1000, loss = 2.28033


tensor(2.2803, grad_fn=<NllLossBackward0>)

In [86]:
bnmlp
bnmlp.evaluate(Xdev, ydev)

BnMLP(trained iter = 1000 , train loss = 2.28033)

1.9217188358306885

In [87]:
for i in range(10):
    print(bnmlp.generate(''))

mands
racumt
cretat
roonu
par
sanamy
andhoxarta
sabhaf
devit
nourak


In [88]:
bnmlp.evaluate(Xte, yte)

1.923720121383667

# Wave Net

In [49]:
def make_dataset(words, block_size = 3):
    X , y = [], []
    for word in words:
        word = '.'*block_size + word + '.'
        for i in range(len(word) - block_size):
            X.append([enc(k) for k in word[i:i+block_size]])
            y.append(enc(word[i+block_size]))
    return torch.tensor(X), torch.tensor(y)

import random

random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# Xtr,  ytr  = make_dataset(words[:n1] , block_size=8)     # 80%
# Xdev, ydev = make_dataset(words[n1:n2] , block_size=8)   # 10%
# Xte,  yte  = make_dataset(words[n2:] , block_size=8)     # 10%

# torch.save(Xtr, data_folder + 'Xtr_bs_8.pt')
# torch.save(ytr, data_folder + 'ytr_bs_8.pt')
# torch.save(Xdev, data_folder + 'Xdev_bs_8.pt')
# torch.save(ydev, data_folder + 'ydev_bs_8.pt')
# torch.save(Xte, data_folder + 'Xte_bs_8.pt')
# torch.save(yte, data_folder + 'yte_bs_8.pt')

Xtr = torch.load( data_folder + 'Xtr_bs_8.pt')
ytr = torch.load( data_folder + 'ytr_bs_8.pt')
Xdev = torch.load( data_folder + 'Xdev_bs_8.pt')
ydev = torch.load( data_folder + 'ydev_bs_8.pt')
Xte = torch.load( data_folder + 'Xte_bs_8.pt')
yte = torch.load( data_folder + 'yte_bs_8.pt')