In [81]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import random
random.seed(42)
%matplotlib inline

# Check if GPU is available
output_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Import data
names = open('names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(names))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i+1:s for i,s in enumerate(chars)}
itos[0] = '.'

In [82]:
# build the dataset
block_size = 5 # context length: how many characters do we take to predict the next one?

def build_dataset(names, context_len=None, vocab=None):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.', and a sliding window over the preceding
  characters (left-padded with index 0) is paired with the index of the
  character that follows it.

  Args:
    names: iterable of strings to encode.
    context_len: window length; defaults to the module-level `block_size`.
    vocab: character -> integer index mapping; defaults to the module-level `stoi`.

  Returns:
    (X, Y): X is an int64 tensor of shape (N, context_len) holding the context
    indices, Y is an int64 tensor of shape (N,) holding the target indices.

  Raises:
    KeyError: if a character of a word (or '.') is missing from the mapping.
  """
  if context_len is None:
    context_len = block_size
  if vocab is None:
    vocab = stoi

  X, Y = [], []
  for w in names:
    context = [0] * context_len
    for ch in w + '.':
      ix = vocab[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  # Explicit dtype + reshape keep the (N, context_len) int64 layout even when
  # there are no examples (torch.tensor([]) alone is a 1-D float tensor).
  X = torch.tensor(X, dtype=torch.long).reshape(len(Y), context_len)
  Y = torch.tensor(Y, dtype=torch.long)
  print(X.shape, Y.shape)
  return X, Y


random.shuffle(names) # in-place shuffle; the slices below depend on this order
n1, n2 = (int(frac * len(names)) for frac in (0.8, 0.9)) # cut points at 80% and 90%

# first 80% -> training set, next 10% -> dev set, last 10% -> test set
Xtr, Ytr = build_dataset(names[:n1])
Xdev, Ydev = build_dataset(names[n1:n2])
Xte, Yte = build_dataset(names[n2:])

torch.Size([182625, 5]) torch.Size([182625])
torch.Size([22655, 5]) torch.Size([22655])
torch.Size([22866, 5]) torch.Size([22866])


In [83]:
# initialize the model
lyr_sz = 200 # size of the hidden layer
emb_dim = 15 # dimension of the character embeddings
vocab_size = len(stoi) # number of distinct tokens incl. '.'; replaces the previously hard-coded 27

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, emb_dim), generator=g).requires_grad_(True) # one emb_dim-dimensional embedding per token
W1 = torch.randn((block_size*emb_dim, lyr_sz), generator=g).requires_grad_(True) # block_size context chars, emb_dim dimensions per char, lyr_sz hidden weights
b1 = torch.randn(lyr_sz, generator=g).requires_grad_(True) # lyr_sz hidden biases
W2 = torch.randn((lyr_sz, vocab_size), generator=g).requires_grad_(True) # lyr_sz hidden weights, one logit per token
b2 = torch.randn(vocab_size, generator=g).requires_grad_(True) # one output bias per token
parameters = [C, W1, b1, W2, b2] # list of all model parameters

print("total # elements:", sum(p.nelement() for p in parameters)) # number of parameters

total # elements: 103432


In [89]:
itr = 250000 # number of iterations; defined first because the schedule below is sized by it
lre = torch.linspace(-3,0,itr) # learning-rate exponents, evenly spaced from -3 to 0
lrs = 10**lre # candidate learning rates, one per iteration
# per-step logs: learning-rate exponent, log10 loss, step index
lri = []
lossi = []
stepi = []

In [113]:
# Gradient Descent
batch_size = 100 # examples per mini-batch (loop-invariant, so set once)
lr = 0.001 # constant step size (loop-invariant, so set once)
for i in range(itr):
    # mini-batch construction
    ix = torch.randint(0, Xtr.shape[0], (batch_size,)) # batch_size random indices into the training set

    # Forward pass
    emb = C[Xtr[ix]] # look up embeddings
    h = torch.tanh(emb.view(-1, block_size*emb_dim) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward pass
    loss.backward()

    # Update weights
    # lr = lrs[i] # uncomment to sweep the learning rate over lrs instead
    with torch.no_grad(): # in-place update without recording it in autograd (replaces the .data access)
        for p in parameters:
            p -= lr * p.grad
            p.grad = None # drop the gradient so the next backward() starts fresh

    #track stats
    #lri.append(lre[i])
    #stepi.append(i)
    #lossi.append(loss.log10().item())

#print('mini-batch loss:', loss.item())

In [114]:
@torch.no_grad() # evaluation only: no autograd graph is built over the full splits
def split_loss(X, Y):
    """Return the cross-entropy loss of the current parameters on (X, Y) as a Python float.

    X: integer tensor of contexts; each row is flattened to block_size*emb_dim
       embedding values before the hidden layer.
    Y: integer tensor of target indices, one per row of X.
    """
    emb = C[X] # look up embeddings
    h = torch.tanh(emb.view(-1, block_size*emb_dim) @ W1 + b1)
    logits = h @ W2 + b2
    return F.cross_entropy(logits, Y).item()

# same forward pass on each split; only the data differs
print('training set loss:', split_loss(Xtr, Ytr))
print('dev set loss:', split_loss(Xdev, Ydev))
print('test set loss:', split_loss(Xte, Yte))

training set loss: 1.648150086402893
dev set loss: 2.518132209777832
test set loss: 2.5112078189849854


In [115]:
#plt.plot(stepi, lossi)

In [116]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

with torch.no_grad(): # pure inference: no autograd graph needed
    for _ in range(20):
        out = []
        context = [0] * block_size # initialize with all ...
        while True:
            emb = C[torch.tensor([context])] # (1,block_size,d)
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix] # slide the window forward by one character
            if ix == 0: # index 0 is '.', which ends the name
                break
            out.append(ix)

        print(''.join(itos[i] for i in out))

carmah
amelia
khyrin
xithik
skanden
jazhauna
perric
kaeli
nella
archer
vivan
jeph
bronsey
quintin
lilta
jadis
wane
madia
yoniel
euphoriya
