In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [4]:
# build the vocabulary of characters and mappings to/from integers
# Read one name per line. The context manager closes the file handle as soon
# as the contents are read, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open('names.txt', 'r', encoding='utf-8') as names_file:
  words = names_file.read().splitlines()

chars = sorted(set(''.join(words)))
stoi = {s:i+1 for i,s in enumerate(chars)}  # characters are numbered from 1 ...
stoi['.'] = 0                               # ... index 0 is reserved for the '.' marker
itos = {i:s for s,i in stoi.items()}        # inverse mapping: index -> character
print(itos)

# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?

# Full (unsplit) dataset over every word: each row of X holds the indices of
# the block_size characters preceding the target character stored in Y.
X, Y = [], []
for w in words:
  context = [0] * block_size  # every word starts from an all-'.' context
  for ch in w + '.':          # the trailing '.' marks the end of the word
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

def build_dataset(words):
  """Convert words into (context, next-character) index tensors.

  Every word is extended with a trailing '.' and scanned character by
  character. For each character, the indices of the preceding ``block_size``
  characters form one input row and the character's own index is the
  matching label. The context of each word starts as ``block_size`` zeros
  (index 0 is '.' in ``stoi``).

  Relies on the module-level ``stoi`` mapping and ``block_size``.

  Args:
    words: iterable of strings whose characters are all keys of ``stoi``.

  Returns:
    A pair ``(X, Y)`` of tensors: ``X`` has one row of ``block_size``
    context indices per example and ``Y`` holds the target index of each
    example. Both shapes are printed before returning.
  """
  contexts = []
  targets = []
  for word in words:
    # Sliding window over the most recent block_size character indices.
    window = [0] * block_size
    for symbol in word + '.':
      target = stoi[symbol]
      contexts.append(window)
      targets.append(target)
      # Build a new list (no in-place mutation) so rows already stored in
      # `contexts` are never altered.
      window = [*window[1:], target]

  inputs = torch.tensor(contexts)
  labels = torch.tensor(targets)
  print(inputs.shape, labels.shape)
  return inputs, labels

import random

# Shuffle the words once with a fixed seed so the split below is the same on
# every run of this cell.
random.seed(42)
random.shuffle(words)

# Split boundaries: first 80% -> training, next 10% -> dev, last 10% -> test.
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# build_dataset runs (and prints its shapes) in train, dev, test order.
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
    build_dataset(subset)
    for subset in (words[:n1], words[n1:n2], words[n2:])
)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [16]:
def train_model(embedding_size=10, neurons_amount=200, runtime=200000, minibatch_size=32, learning_decay_division=2):
    """Train a one-hidden-layer character MLP and return its train/dev losses.

    The model looks up an embedding for each of the ``block_size`` context
    characters, concatenates them, applies a tanh hidden layer and a linear
    output layer over the 27 vocabulary entries. It is trained with minibatch
    SGD on the module-level ``Xtr``/``Ytr`` tensors and evaluated on the full
    ``Xtr``/``Ytr`` and ``Xdev``/``Ydev`` sets.

    Args:
        embedding_size: dimensionality of each character embedding.
        neurons_amount: number of units in the hidden layer.
        runtime: number of SGD steps.
        minibatch_size: number of examples sampled (with replacement) per step.
        learning_decay_division: the learning rate is 0.1 while
            ``i < runtime / learning_decay_division`` and 0.01 afterwards.

    Returns:
        ``(trloss, devloss)``: 0-dim tensors holding the cross-entropy over the
        whole training set and the whole dev set. They are computed without
        gradient tracking, so they do not keep an autograd graph alive.

    Note:
        Parameter initialisation uses a generator with a fixed seed and is
        therefore identical on every call. Minibatch indices are drawn from
        torch's global RNG, so repeated calls generally give different results.
    """
    vocab_size = 27  # one entry per index in the character mapping

    g = torch.Generator().manual_seed(2147483647) # for reproducibility
    C = torch.randn((vocab_size, embedding_size), generator=g)
    W1 = torch.randn((block_size * embedding_size, neurons_amount), generator=g)
    b1 = torch.randn(neurons_amount, generator=g)
    W2 = torch.randn((neurons_amount, vocab_size), generator=g)
    b2 = torch.randn(vocab_size, generator=g)
    parameters = [C, W1, b1, W2, b2]
    for p in parameters:
        p.requires_grad = True

    def _logits(contexts):
        """Forward pass: context indices (N, block_size) -> logits (N, vocab_size)."""
        emb = C[contexts]  # (N, block_size, embedding_size)
        h = torch.tanh(emb.view(-1, block_size * embedding_size) @ W1 + b1)  # (N, neurons_amount)
        return h @ W2 + b2

    # Step index at which the learning rate drops from 0.1 to 0.01.
    decay_step = runtime / learning_decay_division

    for i in range(runtime):
        # minibatch construct
        ix = torch.randint(0, Xtr.shape[0], (minibatch_size,))

        # forward pass
        loss = F.cross_entropy(_logits(Xtr[ix]), Ytr[ix])

        # backward pass
        for p in parameters:
            p.grad = None
        loss.backward()

        # update: in-place SGD step, excluded from the autograd graph
        lr = 0.1 if i < decay_step else 0.01
        with torch.no_grad():
            for p in parameters:
                p -= lr * p.grad

    # Evaluate on the full training and dev sets. No gradients are needed
    # here, so skip graph construction over the entire dataset.
    with torch.no_grad():
        trloss = F.cross_entropy(_logits(Xtr), Ytr)
        devloss = F.cross_entropy(_logits(Xdev), Ydev)
    return trloss, devloss

In [8]:
# Baseline run with the default hyperparameters.
# NOTE(review): train_model now returns (trloss, devloss) and its print
# statements are commented out, so the "Training loss is / Dev loss is" text
# recorded below appears to come from an earlier version of the function.
train_model()

Training loss is: 2.115941047668457
Dev loss is: 2.16377592086792


In [17]:
def test_attribute(embedding_size=10, neurons_amount=200, runtime=200000, minibatch_size=32, learning_decay_division=2):
    trlosses = []
    devlosses = []
    for i in range(5):
        trloss, devloss = train_model(embedding_size, neurons_amount, runtime, minibatch_size, learning_decay_division)
        trlosses.append(trloss.item())
        devlosses.append(devloss.item())
        
    average_trloss = sum(trlosses) / len(trlosses) 
    average_devloss = sum(devlosses) / len(devlosses)
    print(f"Average Training loss is: {average_trloss}")
    print(f"Average Dev loss is: {average_devloss}")

In [19]:
# Baseline: average losses over repeated runs with the default hyperparameters.
test_attribute()

Average Training loss is: 2.1226444244384766
Average Dev loss is: 2.1651772975921633


In [18]:
# Each entry is (label printed before the run, keyword overrides passed to
# test_attribute). Runs happen in list order, with a separator line printed
# between consecutive runs.
for position, (title, overrides) in enumerate([
    ("Embedding Size of 20", dict(embedding_size=20)),
    ("Neurons amount of 300", dict(neurons_amount=300)),
    ("Runtime of 300.000", dict(runtime=300000)),
    ("Minibatch size of 50", dict(minibatch_size=50)),
    ("Runtime of 300.000 with /3 Learning Decay", dict(runtime=300000, learning_decay_division=3)),
    ("Embedding Size of 5 with 300 Neurons", dict(embedding_size=5, neurons_amount=300)),
    ("Embedding Size of 20 with 300 Neurons", dict(embedding_size=20, neurons_amount=300)),
]):
    if position > 0:
        print("-=-=-=-=-=-=-=-=-=-")
    print(title)
    test_attribute(**overrides)

Embedding Size of 20
Average Training loss is: 2.0694377422332764
Average Dev loss is: 2.1441296577453612
-=-=-=-=-=-=-=-=-=-
Neurons amount of 300
Average Training loss is: 2.1129473209381104
Average Dev loss is: 2.175904893875122
-=-=-=-=-=-=-=-=-=-
Runtime of 300.000
Average Training loss is: 2.0991374015808106
Average Dev loss is: 2.147870683670044
-=-=-=-=-=-=-=-=-=-
Minibatch size of 50
Average Training loss is: 2.118647813796997
Average Dev loss is: 2.1614403247833254
-=-=-=-=-=-=-=-=-=-
Runtime of 300.000 with /3 Learning Decay
Average Training loss is: 2.1194077014923094
Average Dev loss is: 2.161695051193237
-=-=-=-=-=-=-=-=-=-
Embedding Size of 5 with 300 Neurons
Average Training loss is: 2.1654847145080565
Average Dev loss is: 2.1876537799835205
-=-=-=-=-=-=-=-=-=-
Embedding Size of 20 with 300 Neurons
Average Training loss is: 2.0419331073760985
Average Dev loss is: 2.1453221321105955


In [20]:
# Second sweep: larger embeddings and wider hidden layers. Each entry is
# (label, keyword overrides for test_attribute); a separator line is printed
# after every run, including the last one.
for title, overrides in [
    ("Embedding Size of 50", dict(embedding_size=50)),
    ("Neurons amount of 600", dict(neurons_amount=600)),
    ("Embedding Size of 100", dict(embedding_size=100)),
    ("Neurons amount of 1000", dict(neurons_amount=1000)),
]:
    print(title)
    test_attribute(**overrides)
    print("-=-=-=-=-=-=-=-=-=-")

Embedding Size of 50
Average Training loss is: 2.047200918197632
Average Dev loss is: 2.140800142288208
-=-=-=-=-=-=-=-=-=-
Neurons amount of 600
Average Training loss is: 2.120588445663452
Average Dev loss is: 2.2023672103881835
-=-=-=-=-=-=-=-=-=-
Embedding Size of 100
Average Training loss is: 2.0443153858184813
Average Dev loss is: 2.1395382404327394
-=-=-=-=-=-=-=-=-=-
Neurons amount of 1000
Average Training loss is: 2.105420637130737
Average Dev loss is: 2.244928312301636
-=-=-=-=-=-=-=-=-=-


In [None]:
# Longer experiment: embedding size 50 and 500000 SGD steps per run, with the
# learning rate dropping from 0.1 to 0.01 once the step index reaches
# runtime / 1.5.
print("Embedding size of 50, Runtime of 500000, Learning decay of /1.5")
test_attribute(embedding_size=50, runtime=500000, learning_decay_division=1.5)

Embedding size of 50, Runtime of 500000, Learning decay of /1.5
