In [1]:
import torch
import torch.nn.functional as F 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the corpus: one name per line. The `with` block closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()
len(words)

32033

In [14]:
# Character vocabulary. Index 0 is reserved for the '.' boundary token;
# every character seen in `words` gets an index from 1 upwards, in sorted order.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}   # inverse lookup: index -> character
vocab_size = len(itos)

In [17]:
n_embd = 10         # dimensionality of the character embedding vectors
n_hidden = 200      # number of neurons in the hidden layer
block_size = 3      # number of context characters fed to the network

# One seeded generator for every draw below; the order of the randn calls
# determines the initial values, so keep it stable.
g = torch.Generator().manual_seed(2347483647)

C = torch.randn(vocab_size, n_embd, generator=g)               # embedding table
w1 = torch.randn(block_size * n_embd, n_hidden, generator=g)   # hidden layer weights
b1 = torch.randn(n_hidden, generator=g)                        # hidden layer bias
w2 = torch.randn(n_hidden, vocab_size, generator=g)            # output layer weights
b2 = torch.randn(vocab_size, generator=g)                      # output layer bias

params = [C, w1, b1, w2, b2]
for param in params:
  param.requires_grad = True

In [5]:
# Total number of trainable scalars across all parameter tensors.
sum(param.nelement() for param in params)

11897

In [6]:
import random

In [7]:
def build_dataset(words, block_size=3):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.' and scanned left to right. For each
  character, the indices of the `block_size` preceding characters (padded
  on the left with 0, the index of '.') form one row of X, and the index of
  the character itself is the matching entry of Y. Characters are mapped to
  indices through the notebook-level `stoi` dict.

  Args:
    words: iterable of strings; every character must be a key of `stoi`.
    block_size: number of context characters per example. Defaults to 3,
      the value that was previously hard-coded in the body. Pass the
      model's context length explicitly if it differs.

  Returns:
    (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is the
    total number of characters, counting one terminating '.' per word.

  Raises:
    ValueError: if `block_size` is smaller than 1.
    KeyError: if a character is missing from `stoi`.
  """
  if block_size < 1:
    raise ValueError(f"block_size must be >= 1, got {block_size}")
  X, Y = [], []
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix]   # slide the window; builds a new list, so stored rows are not aliased
  # Explicit dtype and shape: without them an empty `words` would produce a
  # float32 tensor of shape (0,) instead of an int64 (0, block_size) one.
  X = torch.tensor(X, dtype=torch.long).view(len(X), block_size)
  Y = torch.tensor(Y, dtype=torch.long)
  print(X.shape, Y.shape)
  return X, Y

# Shuffle the words once with a fixed seed, then split them 80% / 10% / 10%
# into train / dev / test. Note that the shuffle is in place, so `words`
# stays shuffled afterwards and re-running this cell permutes it again.
random.seed(47)
random.shuffle(words)
n1 = int(0.8 * len(words))   # end of the training slice
n2 = int(0.9 * len(words))   # end of the dev slice

# The generator is consumed in order, so the datasets are built (and their
# shapes printed) as train, dev, test.
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
  build_dataset(part) for part in (words[:n1], words[n1:n2], words[n2:])
)

torch.Size([182501, 3]) torch.Size([182501])
torch.Size([22802, 3]) torch.Size([22802])
torch.Size([22843, 3]) torch.Size([22843])


In [None]:
batch_size = 32
max_steps = 10000

for i in range(max_steps):
  # minibatch construct: draw batch_size random row indices of the training set
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix]

  # forward pass
  emb = C[Xb]                          # (batch_size, block_size, n_embd)
  embcat = emb.view(emb.shape[0], -1)  # (batch_size, block_size * n_embd)
  hpreact = embcat @ w1 + b1           # hidden layer pre-activation
  h = torch.tanh(hpreact)              # hidden layer
  logits = h @ w2 + b2                 # output layer
  loss = F.cross_entropy(logits, Yb)   # reuse the batch targets instead of re-indexing Ytr

  # backward pass
  for p in params:
    p.grad = None
  loss.backward()

  # update: step learning-rate schedule, plain SGD
  lr = 0.1 if i < 5000 else 0.01
  with torch.no_grad():                # in-place update without recording it in autograd
    for p in params:
      p += -lr * p.grad

# loss of the last minibatch only (a noisy estimate, not the full training loss)
print(loss.item())

2.267688751220703


Validation

In [19]:
# Loss over the whole dev split. This is evaluation only, so run it without
# autograd tracking; the printed tensor therefore carries no grad_fn.
with torch.no_grad():
  emb = C[Xdev]                                          # (N, block_size, n_embd)
  # Flatten per example from the tensor's own shape rather than a literal 30,
  # so the cell keeps working if n_embd or block_size change.
  h = torch.tanh(emb.view(emb.shape[0], -1) @ w1 + b1)
  logits = h @ w2 + b2
  loss = F.cross_entropy(logits, Ydev)
print(loss)

tensor(2.3813, grad_fn=<NllLossBackward0>)


In [None]:
@torch.no_grad()          # disables gradient tracking
def split_loss(split):
  x, y = {
    'train' : (Xtr, Ytr),
    'val' : (Xdev, Ydev),
    'test' : (Xte, Yte),
  }[split]

  emb = C[x]        # (N, block_size, n_embd)
  embcat = emb.view(emb.shape[0], -1)     # change to (N, block_size * n_embd)
  h = torch.tanh(embcat @ w1 + b1 )           
  logits = h @ w2 + b2           
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

# Report the loss on the training and validation splits, in that order.
for split_name in ('train', 'val'):
  split_loss(split_name)

train 2.383986473083496
val 2.3813438415527344


In [23]:
# sampling

# Fresh generator so the samples are reproducible. Note that this rebinds the
# notebook-level `g` that the training cell also draws from.
g = torch.Generator().manual_seed(2347483647 + 10)

with torch.no_grad():                   # inference only: no autograd graph per step
  for _ in range(10):
    out = []
    context = [0] * block_size          # start from an all-'.' context (index 0)
    while True:
      emb = C[torch.tensor([context])]    # (1, block_size, n_embd)
      h = torch.tanh(emb.view(1, -1) @ w1 + b1)
      logits = h @ w2 + b2
      probs = F.softmax(logits, dim=1)
      # sampling from the probs to get the index
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]      # slide the context window
      out.append(ix)
      if ix == 0:                       # '.' ends the word; it is kept in the output
        break
    print(''.join(itos[i] for i in out))

kalex.
tyry.
kri.
sheluaharael.
avix.
joshaylandoanilie.
anhainc.
hidy.
kumataisyy.
amarama.
