In [1]:
import torch
import torch.nn.functional as F

In [2]:
with open("../makemore/names.txt") as f:
    words = f.read().splitlines()

In [3]:
len(words)

32033

In [4]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {v:k for k,v in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


## Construct Dataset 

In [5]:

def build_dataset(words, block_size=3):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y


In [6]:
# Convert to tensors
import random 
random.seed(42)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr , Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [7]:
from dataclasses import dataclass

In [8]:
@dataclass
class Config:
    generator =  g = torch.Generator().manual_seed(2147483647)
    hidden_layer_size: int = 100
    lr : float = 0.1
    batch_size: int = 32
    lookback: int = 3
    embedding_size: float = 2
    vocab_size: float  =  27

### Embedded Input 

### Questions
- How do we decide on the size of the embedding?
- IS there something unique about language that makes embeddings so good here?

## Create the Neural Net

In [9]:
class Linear:
    def __init__(self,fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out), generator=Config.generator)
        self.bias = torch.zeros(fan_out) if bias else None
    
    def __call__(self, x):
        self.out =     x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])
    
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []

In [10]:
# We want a 2 vector embedding. Start with random embeddings for all 27 chars
C = torch.randn(Config.vocab_size ,Config.embedding_size, generator=Config.generator)

layers = [ 
    # the input of this should be 6 because we have 3 previous characters each with 2 dimensional embedding
    Linear(Config.embedding_size * Config.lookback, Config.hidden_layer_size), Tanh(),
    # Final Layer should output "probabilities" for each character
    Linear(Config.hidden_layer_size, Config.vocab_size)
            ]

params = [C] + [p for layer in layers for p in layer.parameters()]

for p in params:
    p.requires_grad = True

##  Train

In [23]:
from tqdm import tqdm

In [31]:
Xtr

tensor([[ 0,  0,  0],
        [ 0,  0, 25],
        [ 0, 25, 21],
        ...,
        [15, 12,  4],
        [12,  4,  1],
        [ 4,  1, 14]])

In [24]:
for i in tqdm(range(30000)):
    # Forward Pass
    # Minibatch Construct
    ix = torch.randint(0, Xtr.shape[0], (Config.batch_size,))
    # Go from index input to embedding input
    emb = C[Xtr[ix]]
    # flatten embedding to multiply with hidden layer
    x = emb.view(emb.shape[0], -1 )
    # Hidden Layer ouput
    for layer in layers:
        x =  layer(x)
    loss = F.cross_entropy(x, Ytr[ix])
    if i%1000 == 0:
        print(loss) 
    
    # Backprop
    for p in params:
        p.grad = None
    loss.backward()
    for p in params:
        p.data += -Config.lr * p.grad 
    break

  0%|                                                 | 0/30000 [00:00<?, ?it/s]

tensor(13.1755, grad_fn=<NllLossBackward0>)





### Neural Network Plots

In [30]:
layers[0].bias

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.], requires_grad=True)

In [21]:
import pandas as pd
import altair as alt

In [27]:
layers[0].parameters()

[tensor([[ 6.1690e-01,  1.5160e+00,  2.4720e-01, -3.7767e-01, -1.9081e+00,
          -3.7170e-01, -9.8378e-01, -1.5256e-01, -6.2787e-01,  7.7023e-02,
          -1.9911e+00, -1.3050e+00, -1.3792e+00, -3.0560e-01, -5.4209e-01,
          -5.9234e-01,  1.0358e+00, -8.6249e-01,  7.8068e-01,  3.8314e-01,
           1.4232e+00, -3.6390e-01,  9.4754e-02, -1.1645e+00, -7.2759e-01,
           1.1491e+00, -1.1769e+00,  2.6542e-01, -7.1122e-01,  1.0894e+00,
          -1.8007e-01,  1.3034e+00, -2.1057e+00, -2.6044e-01,  8.1229e-01,
           1.2772e+00,  1.8313e-01,  1.3504e+00,  1.3348e+00,  4.9849e-01,
          -5.6533e-01,  5.4281e-01,  1.2600e+00, -6.2020e-01,  1.4325e+00,
          -1.0632e+00, -6.0596e-01,  9.1776e-01,  9.0187e-01,  1.2028e+00,
          -3.7941e-01, -9.8748e-01, -1.1833e+00,  1.5222e+00,  4.3905e-01,
           1.6933e-01, -1.4777e+00, -5.1029e-01, -1.7485e+00,  6.7062e-01,
           3.3509e-01, -5.3509e-01, -1.1953e+00, -7.0163e-01,  3.8761e-01,
           4.7236e-01, -3

In [28]:
pd.DataFrame(layers[0].parameters(), columns=["weights"])

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [23]:
alt.Chart(data=pd.DataFrame(layers[0].parameters()), columns="weights").mark_bar().encode(x{})

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [17]:
charts = []
alt.layers[0].parameters()
# for layer in layers[:-1]:
#     if type(layer) == Tanh:
        
    
    

[tensor([[ 6.1690e-01,  1.5160e+00,  2.4720e-01, -3.7767e-01, -1.9081e+00,
          -3.7170e-01, -9.8378e-01, -1.5256e-01, -6.2787e-01,  7.7023e-02,
          -1.9911e+00, -1.3050e+00, -1.3792e+00, -3.0560e-01, -5.4209e-01,
          -5.9234e-01,  1.0358e+00, -8.6249e-01,  7.8068e-01,  3.8314e-01,
           1.4232e+00, -3.6390e-01,  9.4754e-02, -1.1645e+00, -7.2759e-01,
           1.1491e+00, -1.1769e+00,  2.6542e-01, -7.1122e-01,  1.0894e+00,
          -1.8007e-01,  1.3034e+00, -2.1057e+00, -2.6044e-01,  8.1229e-01,
           1.2772e+00,  1.8313e-01,  1.3504e+00,  1.3348e+00,  4.9849e-01,
          -5.6533e-01,  5.4281e-01,  1.2600e+00, -6.2020e-01,  1.4325e+00,
          -1.0632e+00, -6.0596e-01,  9.1776e-01,  9.0187e-01,  1.2028e+00,
          -3.7941e-01, -9.8748e-01, -1.1833e+00,  1.5222e+00,  4.3905e-01,
           1.6933e-01, -1.4777e+00, -5.1029e-01, -1.7485e+00,  6.7062e-01,
           3.3509e-01, -5.3509e-01, -1.1953e+00, -7.0163e-01,  3.8761e-01,
           4.7236e-01, -3

In [16]:
emb = C[Xdev]
# flatten embedding to multiply with hidden layer
x = emb.view(-1, 6)
# Hidden Layer ouput
for layer in layers:
    x = layer(x)
# Cross Entropy
loss = F.cross_entropy(x, Ydev)
print(loss)


tensor(2.3740, grad_fn=<NllLossBackward0>)


## Questions
- Is how we tokenize also a hyperaparameter?
- Is the subsitution in language a necessary condition for embeddings to work?
- Is the loss a good enough rep

In [17]:
# sampling from the model

In [18]:
num_words = 5
block_size = 3

In [19]:
'

SyntaxError: unterminated string literal (detected at line 1) (544193499.py, line 1)

In [None]:
context = [0] *3
emb = C[context]

In [None]:
for i in range(num_words):
    context = [0] * block_size
    word = ""
    ix = -1
    while ix != 0:
        emb = C[context]
        # flatten embedding to multiply with hidden layer
        flatemb = emb.view(-1, lookback * embedding_size)
        # Hidden Layer ouput
        h = torch.tanh(flatemb @ w1 + b1)
        # Logits
        h = h @ w2 + b2
        probs = torch.softmax(h, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        word += itos[ix]
        context = context[1:] + [ix]
    print (word)