In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Read the corpus: one entry per line of names.txt.
# The context manager closes the file handle deterministically instead of
# leaving an open handle for the garbage collector to clean up.
with open('names.txt','r') as f:
    words = f.read().splitlines()

In [6]:
# Number of entries (lines) read from names.txt.
len(words)

32033

In [7]:
# Character-level vocabulary: every distinct character found in `words`,
# in sorted order, is assigned an index starting at 1.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
# Index 0 is reserved for the '.' marker.
stoi['.'] = 0
# Inverse mapping: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [115]:
# build the dataset
# context length: how many characters do we take to predict the next one?
block_size = 3

def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is terminated with '.', and its context window starts out as
  `block_size` zeros. For every character of the terminated word, the
  current window is recorded as an input row and the character's index
  (looked up in the global `stoi`) as its target; the window then slides
  one position to the right.

  Prints the shapes of the two resulting tensors.

  Returns:
    (X, Y): X holds one context window per row, Y the matching target index.
  """
  contexts, targets = [], []
  for word in words:
    window = [0] * block_size
    for target in (stoi[ch] for ch in word + '.'):
      contexts.append(window)
      targets.append(target)
      # Rebinding (not mutating) keeps the rows already stored intact.
      window = window[1:] + [target]

  X, Y = torch.tensor(contexts), torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y


import random

# Shuffle a *copy* of the corpus. random.shuffle() works in place, so
# shuffling `words` itself made this cell non-idempotent: every re-run
# reshuffled the already-shuffled list and produced a different
# train/dev/test split, letting examples move between splits across runs.
# On a first run the resulting order is identical to shuffling `words`.
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)

# Split boundaries at 80% and 90% of the word count.
n1 = int(0.8*len(shuffled_words))
n2 = int(0.9*len(shuffled_words))

Xtr, Ytr = build_dataset(shuffled_words[:n1])      # first 80%: training
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])  # next 10%: dev
Xte, Yte = build_dataset(shuffled_words[n2:])      # last 10%: test

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [116]:
# Display the shapes of the training inputs and targets.
Xtr.shape,Ytr.shape

(torch.Size([182625, 3]), torch.Size([182625]))

In [145]:
# Model parameters. They are drawn from a dedicated seeded generator so that
# a fresh "Restart & Run All" starts from the same initial weights each time
# (previously they came from the unseeded global RNG).
g_init = torch.Generator().manual_seed(2147483647)

C = torch.randn([27,10], generator=g_init)    # embedding lookup table, 27 rows of 10 values
W1 = torch.randn(30,200, generator=g_init)    # hidden layer weights: 30 inputs -> 200 units
b1 = torch.randn(200, generator=g_init)       # hidden layer bias
W2 = torch.randn([200,27], generator=g_init)  # output layer weights: 200 units -> 27 logits
b2 = torch.randn(27, generator=g_init)        # output layer bias
parameters = [C,W1,b1,W2,b2]

In [146]:
# Switch on gradient tracking for every parameter tensor (in place).
for p in parameters:
    p.requires_grad_(True)

In [155]:
# Minibatch SGD on the training split. Re-running this cell continues
# training from the current parameter values.
num_steps = 50000
batch_size = 32
learning_rate = 0.01

for _ in range(num_steps):
    # Forward Pass

    # Constructing Minibatch: batch_size random row indices into Xtr
    ix = torch.randint(0,Xtr.shape[0],(batch_size,))

    # Embedding lookup; we can see it as one-hot encoding going through
    # layer 1, like the bigram model
    emb = C[Xtr[ix]]

    # Hidden layer: flatten each example's embeddings into a single row
    # (no hard-coded width), then tanh(emb @ W1 + b1)
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)

    # Output layer
    logits = (h @ W2) + b2

    # Calculating loss
    loss = F.cross_entropy(logits,Ytr[ix])

    # Backward Pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # Updating: plain SGD step. Performed under no_grad() rather than via
    # p.data, so the in-place write stays visible to autograd's bookkeeping
    # without being recorded as part of the graph.
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

loss.item() # this is the loss for just that batch


2.1010870933532715

In [156]:
# Loss over the full training split.
# This is evaluation only, so run it under no_grad(): otherwise autograd
# records a graph (and keeps intermediates alive) for every row of Xtr.
with torch.no_grad():
    emb = C[Xtr]
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = (h @ W2) + b2
    loss = F.cross_entropy(logits,Ytr)
loss

tensor(2.1603, grad_fn=<NllLossBackward0>)

In [157]:
# Loss over the full dev split (Xdev / Ydev).
# This is evaluation only, so run it under no_grad(): otherwise autograd
# records a graph (and keeps intermediates alive) for every row of Xdev.
with torch.no_grad():
    emb = C[Xdev]
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = (h @ W2) + b2
    loss = F.cross_entropy(logits,Ydev)
loss

tensor(2.1898, grad_fn=<NllLossBackward0>)

In [158]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

# Pure inference: no_grad() stops autograd from recording a graph for each
# forward pass. The sampled values are unaffected.
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size # initialize with all ...
        while True:
            emb = C[torch.tensor([context])] # (1,block_size,d)
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            # Draw the next index from the predicted distribution.
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            # Slide the context window forward by one position.
            context = context[1:] + [ix]
            out.append(ix)
            # Index 0 maps to '.', which ends the sample.
            if ix == 0:
                break

        print(''.join(itos[i] for i in out))

carlah.
ami.
harif.
jari.
remil.
skaan.
kena.
hutn.
den.
arcia.
qui.
ner.
kiah.
maiiv.
kaleigh.
ham.
prin.
quint.
shon.
marianni.
