In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
device = torch.device('cuda:0')
%matplotlib inline 

In [2]:
# import names
words = open('../names.txt', 'r').read().splitlines()

In [3]:
# create a relation from unique caracther to integer e viceversa
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [4]:
# create the training set and dev set
block_size = 3
Xtr,Ytr = [], []
Xdev, Ydev = [], []

def build_database(words):
    X,Y = [], []
    for w in words:
    #     print(w)
        context = [0] * block_size
        for c in w + '.':
            ix = stoi[c]
            X.append(context)
            Y.append(ix)
    #         print(f"{''.join(itos[i] for i in context)} ---> {c}")
            context = context[1:] + [ix]
    X = torch.tensor(X, device = device)
    Y = torch.tensor(Y, device = device)
    print(X.shape,Y.shape)
    return X,Y

import random
random.seed(2147483647)
random.shuffle(words)
n1 = int(0.8*len(words)) # 80% will be training set
Xtr,Ytr = build_database(words[:n1])
Xdev, Ydev = build_database(words[n1+1:])

torch.Size([182546, 3]) torch.Size([182546])
torch.Size([45592, 3]) torch.Size([45592])


In [5]:
# create the layers
g = torch.Generator(device = device).manual_seed(2147483647)
C = torch.randn(27,10, generator = g, device = device)
W1 = torch.randn(30,600, generator = g, device = device)
b1 = torch.randn(600, generator = g, device = device)
W2 = torch.randn(600,27, generator = g, device = device)
b2 = torch.randn(27, generator = g, device = device)
parameters = [C,W1,b1,W2,b2]
# required
for p in parameters:
    p.requires_grad = True

In [6]:
sum(p.nelement() for p in parameters)

35097

In [7]:
# stats
lossi = []
cycles = []

In [8]:
# training
epoch = 500000
for i in range(epoch):
    # minibatch
    ix = torch.randint(0,Xtr.shape[0], ((32 if i < int(0.5*epoch) else 200),)) # prevela 32 esempi rand da X
    
    # forward pass
    emb = C[Xtr[ix]]
    h = torch.tanh(emb.view(-1,30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])
    loss
    
    #bacward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    #update
    for p in parameters:
        p.data += -(0.1 if i < int(0.6*epoch) else 0.05) * p.grad # how to determind learning rate
    
    # stats
    cycles.append(i)
    lossi.append(loss.log10().item())

plt.plot(cycles,lossi)
print("loss of just the minibacth: ",loss.item())

KeyboardInterrupt: 

In [None]:
# loss on tr set
emb = C[Xtr]
h = torch.tanh(emb.view(-1,30) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr)
print("loss of all tr set:", loss.item())

In [None]:
h.device

In [None]:
loss.device

In [None]:
# loss on dev set
emb = C[Xdev]
h = torch.tanh(emb.view(-1,30) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ydev)
print("loss of dev set:", loss.item())

In [None]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 12)

for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))
    