In [72]:
import torch 
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [73]:
# Load the two word sources and merge them into one list of training words.
new_words = pd.read_csv("data/amharic_names.csv")

# Read the plain-text list inside a context manager so the handle is closed
# deterministically, and decode it explicitly as UTF-8 instead of relying on the
# platform default (pd.read_csv above already defaults to UTF-8).
# NOTE(review): assumes cleaned_words.txt is UTF-8 encoded -- confirm.
with open("data/cleaned_words.txt", "r", encoding="utf-8") as words_file:
    old_words = words_file.read().splitlines()

# The "in_am" column of the CSV holds the words taken from that file.
new_amh_words = new_words["in_am"].tolist()
am_words = new_amh_words + old_words

In [74]:
# Character vocabulary: every distinct character that occurs in any word, sorted.
chrs = sorted(set("".join(am_words)))
# Characters are numbered from 1 upwards; index 0 is reserved for the "." token
# that build_dataset uses as padding and end-of-word marker.
stoi = {ch: idx for idx, ch in enumerate(chrs, start=1)}
stoi["."] = 0
# Reverse lookup: integer index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [75]:
block_size = 3  # number of preceding characters used as context for each prediction


def build_dataset(am_words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with "." and scanned left to right. For every
    character, the `block_size` indices that precede it (left-padded with 0)
    form one row of X and the character's own index is the matching entry of Y.

    Args:
        am_words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): X is a tensor of context rows, Y a tensor of target indices.
    """
    contexts, targets = [], []
    for word in am_words:
        tokens = [stoi[ch] for ch in word + "."]
        window = [0] * block_size  # start from an all-padding context
        for token in tokens:
            contexts.append(window)
            targets.append(token)
            # slide the window: drop the oldest index, append the newest
            window = window[1:] + [token]
    return torch.tensor(contexts), torch.tensor(targets)

import random

# Shuffle a *copy* of the word list. Shuffling am_words in place made this cell
# non-idempotent: re-running it reshuffled the already-shuffled list, silently
# changing which words land in train/val/test. With a copy, every run of this
# cell produces the same split from the same am_words.
random.seed(42)
shuffled_words = list(am_words)
random.shuffle(shuffled_words)

n1 = int(0.8 * len(shuffled_words))  # end of the training slice
n2 = int(0.9 * len(shuffled_words))  # end of the validation slice

Xtr, Ytr = build_dataset(shuffled_words[:n1])      # first 80%: training
Xval, Yval = build_dataset(shuffled_words[n1:n2])  # next 10%: validation
Xte, Yte = build_dataset(shuffled_words[n2:])      # last 10%: test

In [76]:
class Linear:
    """Affine layer computing ``x @ weight (+ bias)``.

    The weight is drawn from a standard normal using the notebook-level
    generator ``g`` and divided by sqrt(fan_in); the bias, when enabled,
    starts at zero. The most recent output is kept on ``self.out``.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        scale = fan_in**0.5
        self.weight = torch.randn((fan_in, fan_out), generator=g) / scale
        self.bias = None
        if bias:
            self.bias = torch.zeros(fan_out)

    def __call__(self, x):
        result = x @ self.weight
        self.out = result
        if self.bias is not None:
            # in-place add, so self.out (the same tensor object) includes the bias
            result += self.bias
        return result

    def parameters(self):
        """Return the layer's tensors: the weight, plus the bias when present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

class BatchNorm1d:
    """Batch normalisation that reduces over dim 0 of its input.

    In training mode the input is normalised with the statistics of the current
    batch, and those statistics are folded into ``running_mean`` /
    ``running_var`` with an exponential moving average. In evaluation mode
    (``self.training = False``) the stored running statistics are used instead.

    Args:
        dim: number of features (size of gamma, beta and the running buffers).
        eps: constant added to the variance before the square root.
        momentum: weight given to the current batch in the running-stat update.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (no gradient; maintained with a running "momentum update")
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalise ``x``, store the result on ``self.out`` and return it."""
        if self.training:
            xmean = x.mean(0, keepdim=True)  # batch mean
            xvar = x.var(0, keepdim=True)    # batch variance (torch default estimator)
        else:
            xmean = self.running_mean
            # Fix: this previously read running_mean, so evaluation divided by
            # sqrt(mean + eps) instead of sqrt(var + eps).
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalise to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        # Fix: the return used to sit inside the `if self.training:` branch, so
        # evaluation-mode calls returned None.
        return self.out

    def parameters(self):
        """Return the tensors trained by backprop: ``[gamma, beta]``."""
        return [self.gamma, self.beta]

class Tanh:
    """Element-wise tanh activation; keeps its latest output on ``self.out``."""

    def __call__(self, x):
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """This layer has nothing to train, so the list is always empty."""
        return list()

In [77]:
# Model hyper-parameters.
n_embed = 10    # size of each character embedding vector (columns of C)
n_hidden = 100  # width of every hidden Linear layer
# block_size is intentionally NOT re-declared here: it is defined once next to
# build_dataset, so the dataset's context length and the model's input width
# (n_embed * block_size) cannot drift apart.
vocab_size = len(itos)

# Single seeded generator used for weight init and minibatch sampling.
g = torch.Generator().manual_seed(2147483647)
# Embedding table: one n_embed-dimensional row per vocabulary index.
C = torch.randn((vocab_size, n_embed), generator=g)

In [78]:
# 6 Linear layers in total: five Linear+Tanh pairs, then the output projection.
# The first pair consumes the flattened embeddings (n_embed * block_size inputs),
# the remaining four map n_hidden -> n_hidden. Construction order is preserved so
# the shared generator is consumed in the same sequence.
layers = [
    layer
    for fan_in in [n_embed * block_size] + [n_hidden] * 4
    for layer in (Linear(fan_in, n_hidden), Tanh())
]
layers.append(Linear(n_hidden, vocab_size))

In [79]:
# NOTE: the rescaling below happens in place, so running this cell twice
# compounds the factors -- rebuild `layers` before re-running it.
with torch.no_grad():
    # shrink the output projection so the initial predictions are less confident
    layers[-1].weight *= 0.1
    # give every Linear layer before the output projection a 5/3 gain
    for layer in layers[:-1]:
        if not isinstance(layer, Linear):
            continue  # Tanh layers carry no weight
        layer.weight *= 5 / 3

# All trainable tensors: the embedding table first, then each layer's parameters.
parameters = [C, *(p for layer in layers for p in layer.parameters())]
print(sum(p.nelement() for p in parameters))  # total number of scalar parameters
for p in parameters:
    p.requires_grad = True

66477


In [80]:
# Minibatch SGD training loop for the embedding table C and the `layers` stack.
# max_steps: number of optimisation steps; batch_size: examples drawn per step.
max_steps=200000
batch_size=32
# log10 of the minibatch loss, one entry appended per step
lossi=[]

for i in range(max_steps):
    # draw batch_size random row indices into the training set using generator g
    ix=torch.randint(0,Xtr.shape[0],(batch_size,),generator=g)
    Xb,Yb=Xtr[ix],Ytr[ix]

    # forward pass: look up the embedding of every context index in C
    embed=C[Xb]
    x=embed.view(embed.shape[0],-1) # flatten each example's embeddings into a single row
    for layer in layers:
        x=layer(x)
    # x now holds the output of the last layer, used as logits against targets Yb
    loss=F.cross_entropy(x,Yb)

    # backward pass
    # keep .grad on every layer's stored output (a non-leaf tensor) after backward();
    # presumably for later inspection of activation gradients
    for layer in layers:
        layer.out.retain_grad()
    # clear gradients left over from the previous step
    for p in parameters:
        p.grad=None
    loss.backward()

    # step learning-rate decay: 0.1 for the first 100000 steps, 0.01 afterwards
    lr=0.1 if i < 100000 else 0.01
    # plain SGD update applied through .data so autograd does not record it
    for p in parameters:
        p.data+=-lr * p.grad

    # report the current minibatch loss every 10000 steps
    # NOTE(review): the spec ": 4f" means space-sign, width 4, default 6 decimals;
    # ".4f" may have been intended -- confirm before changing the output format.
    if i%10000==0:
        print(f"{i: 7d}/{max_steps: 6d}  {loss.item(): 4f}")
    lossi.append(loss.log10().item())

      0/ 200000   5.320193
  10000/ 200000   1.916625
  20000/ 200000   1.544427
  30000/ 200000   1.540077
  40000/ 200000   2.057544
  50000/ 200000   1.603221
  60000/ 200000   1.301067
  70000/ 200000   1.272900
  80000/ 200000   1.254606
  90000/ 200000   1.386509
 100000/ 200000   1.301937
 110000/ 200000   1.096691
 120000/ 200000   2.025812
 130000/ 200000   1.207332
 140000/ 200000   1.520185
 150000/ 200000   0.991740
 160000/ 200000   1.972781
 170000/ 200000   1.916506
 180000/ 200000   1.134367
 190000/ 200000   1.227143
