In [4]:
words = open('names.txt', 'r').read().splitlines()

In [5]:
words

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn',
 'abigail',
 'emily',
 'elizabeth',
 'mila',
 'ella',
 'avery',
 'sofia',
 'camila',
 'aria',
 'scarlett',
 'victoria',
 'madison',
 'luna',
 'grace',
 'chloe',
 'penelope',
 'layla',
 'riley',
 'zoey',
 'nora',
 'lily',
 'eleanor',
 'hannah',
 'lillian',
 'addison',
 'aubrey',
 'ellie',
 'stella',
 'natalie',
 'zoe',
 'leah',
 'hazel',
 'violet',
 'aurora',
 'savannah',
 'audrey',
 'brooklyn',
 'bella',
 'claire',
 'skylar',
 'lucy',
 'paisley',
 'everly',
 'anna',
 'caroline',
 'nova',
 'genesis',
 'emilia',
 'kennedy',
 'samantha',
 'maya',
 'willow',
 'kinsley',
 'naomi',
 'aaliyah',
 'elena',
 'sarah',
 'ariana',
 'allison',
 'gabriella',
 'alice',
 'madelyn',
 'cora',
 'ruby',
 'eva',
 'serenity',
 'autumn',
 'adeline',
 'hailey',
 'gianna',
 'valentina',
 'isla',
 'eliana',
 'quinn',
 'nevaeh',
 'ivy',
 'sadie',
 'piper',
 'lydia',
 'alexa',
 'josephine',
 'emery',
 'julia'

In [43]:
import torch
import torch.nn.functional as F

In [10]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [11]:
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [78]:
# create the training set of trigrams (x,y)
xs, ys = [], []

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
    
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = len(xs)

In [97]:
xs

tensor([[ 0,  5],
        [ 5, 13],
        [13, 13],
        ...,
        [26, 25],
        [25, 26],
        [26, 24]])

In [79]:
print(f"number of examples: {num}")

number of examples: 196113


In [87]:
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [99]:
xenc = F.one_hot(xs, num_classes=27).float().view(-1, 27*2)
xenc

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 1., 0., 0.]])

In [95]:
# gradient descent
for k in range(100):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float().view(-1, 27*2)
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims=True)
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean() # the + ... is regularization
  print(loss.item())
  
  #backward pass
  W.grad = None
  loss.backward()
  
  #update
  W.data += -50 * W.grad

2.2593140602111816
2.2592546939849854
2.2591967582702637
2.259139060974121
2.259082317352295
2.259025812149048
2.258970260620117
2.258915424346924
2.2588608264923096
2.2588071823120117
2.258754253387451
2.258701801300049
2.258650064468384
2.258598804473877
2.25854754447937
2.258497714996338
2.2584481239318848
2.25839900970459
2.258350372314453
2.2583024501800537
2.2582550048828125
2.2582080364227295
2.2581615447998047
2.258115768432617
2.258070230484009
2.2580254077911377
2.2579808235168457
2.257936716079712
2.2578933238983154
2.257850170135498
2.257807731628418
2.257765531539917
2.257723808288574
2.2576828002929688
2.2576417922973633
2.257601499557495
2.257561445236206
2.257521867752075
2.2574825286865234
2.25744366645813
2.2574055194854736
2.2573676109313965
2.2573301792144775
2.2572929859161377
2.257255792617798
2.2572197914123535
2.25718355178833
2.257148027420044
2.257112741470337
2.25707745552063
2.2570431232452393
2.2570087909698486
2.256974935531616
2.256941318511963
2.25690817

In [117]:
g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    ix1 = 0
    ix2 = torch.multinomial(torch.ones(27) / 27, 1, replacement=True, generator=g).item()  # Randomly choose the second character
    out = [itos[ix2]]
    while True:
      xenc = F.one_hot(torch.tensor([ix1, ix2]), num_classes=27).float().view(-1, 27*2)
      logits = xenc @ W
      counts = logits.exp()
      p = counts / counts.sum(1, keepdims=True)
      
      ix1 = ix2 
      ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix2])
      if ix2 == 0:
        break
            
    print("".join(out))

cexze.
zomakurailaziaydamellimittain.
lusan.
ka.
da.
samiyaubrtthrigotai.
iolielliaugie.
teda.
kaleyla.
sade.
enkavirny.
fobspih.
ciden.
tahlasu.
dydr.
breegl.
peig.
iatta.
ra.
dinne.


# E02 - Dataset split up

In [149]:
from torch.utils.data import random_split
train_size = int(0.8 * len(words))
dev_size = int(0.1 * len(words))
test_size = int(len(words) - train_size - dev_size)

train_set, dev_set, test_set = random_split(words, [train_size, dev_size, test_size], generator=g)

In [192]:
# create the training set of trigrams (x,y)
xs, ys = [], []

for w in train_set:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
    
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = len(xs)

def random_weights_initialize():
  # randomly initialize 27 neurons' weights. each neuron receives 27 inputs
  g = torch.Generator().manual_seed(2147483647)
  W = torch.randn((27*2, 27), generator=g, requires_grad=True)
  return W

In [211]:
# gradient descent
def gradient_descent(W, reg_fac):
  for _ in range(100):
    
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float().view(-1, 27*2)
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(num), ys].log().mean() + reg_fac*(W**2).mean() # the + ... is regularization
    #print(loss.item())
    
    #backward pass
    W.grad = None
    loss.backward()
    
    #update
    W.data += -50 * W.grad
      
  return loss.item() 

In [194]:
dev_eval_x, dev_eval_y = [], []

for w in dev_set:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    dev_eval_x.append((ix1, ix2))
    dev_eval_y.append(ix3)
      
dev_eval_x = torch.tensor(dev_eval_x)
dev_eval_y = torch.tensor(dev_eval_y)
dev_num = len(dev_eval_x)
    
test_eval_x, test_eval_y = [], []

for w in test_set:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    test_eval_x.append((ix1, ix2))
    test_eval_y.append(ix3)
      
test_eval_x = torch.tensor(test_eval_x)
test_eval_y = torch.tensor(test_eval_y)
test_num = len(test_eval_x)

In [213]:
def get_loss(xs, ys, num, regularization_factor):
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float().view(-1, 27*2)
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims=True)
  loss = -probs[torch.arange(num), ys].log().mean()# + regularization_factor*(W**2).mean()
  return loss.item()

In [196]:
print(get_loss(dev_eval_x, dev_eval_y, dev_num, 0.01))
print(get_loss(test_eval_x, test_eval_y, test_num, 0.01))

2.2535791397094727
2.256826877593994


# E03 - Tuning regularization using DevSet

In [214]:
best_loss = (10000000, 0)
for reg_fac in list(torch.arange(0, 0.05, 0.005)) + list(torch.arange(0.05, 0.5, 0.05)):
  W = random_weights_initialize()
  train_loss = gradient_descent(W, reg_fac)
  dev_loss = get_loss(dev_eval_x, dev_eval_y, dev_num, reg_fac)
  print(f"For a Regularization factor of: {reg_fac:.3f}\nThe train loss is: {train_loss:.4f}\nThe Dev loss is: {dev_loss:.4f}\n-=-=-=-=-=-=-=-=-=-=-=-")
  if dev_loss < best_loss[0]:
    best_loss = (dev_loss, reg_fac)
      
print(f"The best Dev loss is: {best_loss[0]:.4f} for a regularization factor of: {best_loss[1]:.3f}")

For a Regularization factor of: 0.000
The train loss is: 2.2395
The Dev loss is: 2.2341
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.005
The train loss is: 2.2475
The Dev loss is: 2.2347
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.010
The train loss is: 2.2540
The Dev loss is: 2.2357
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.015
The train loss is: 2.2597
The Dev loss is: 2.2369
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.020
The train loss is: 2.2648
The Dev loss is: 2.2382
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.025
The train loss is: 2.2696
The Dev loss is: 2.2395
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.030
The train loss is: 2.2740
The Dev loss is: 2.2409
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.035
The train loss is: 2.2782
The Dev loss is: 2.2422
-=-=-=-=-=-=-=-=-=-=-=-
For a Regularization factor of: 0.040
The train loss is: 2.2822
The Dev loss is: 2.2436
-=-=-=-=-=-=-=-=