In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
torch.manual_seed(42)

# Load the corpus: one name per line.
# The file is opened in a `with` block so the handle is closed deterministically,
# and decoded explicitly as UTF-8 because the names contain non-ASCII letters
# (e.g. 'ó', 'ł'); relying on the platform default encoding would make the
# vocabulary differ between machines.
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
print(len(words))                   # number of names
print(max(len(w) for w in words))   # length of the longest name
print(words[:8])                    # a small preview

32658
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [3]:
# Character-level vocabulary: every distinct character in the corpus gets an
# id starting at 1; id 0 is reserved for the '.' token.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup (id -> character), built from stoi so both tables always agree.
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 27: 'ó', 28: 'ę', 29: 'ł', 30: 'ń', 31: 'ś', 32: 'ż', 0: '.'}
33


In [29]:
import random
# Fixed seed so the shuffle (and therefore the train/dev/test split below) is reproducible.
random.seed(42)
# NOTE(review): this shuffles `words` in place. Re-running the cell without
# reloading `words` reshuffles an already-shuffled list, so the split then
# differs from the one obtained on a fresh kernel.
random.shuffle(words)

# Context length: number of previous characters used to predict the next one.
# Author's note: the reference paper used 16, but names are usually shorter than 16 characters.
block_size = 8

def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is terminated with '.', and every character (including the
  terminator) becomes one example whose input is the `block_size` preceding
  token ids, left-padded with 0.

  Reads the module-level `block_size` and `stoi`.

  Returns:
    (X, Y): X holds one context row per example, Y the id of the character
    that follows each context. Both shapes are printed as a side effect.
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size  # every word starts from an all-padding context
    for ch in word + '.':
      target = stoi[ch]
      contexts.append(window)
      targets.append(target)
      # slide the window: drop the oldest id, append the one just emitted
      window = window[1:] + [target]

  X, Y = torch.tensor(contexts), torch.tensor(targets)
  print("X shape:", X.shape)  # Expected: (num_samples, block_size)
  print("Y shape:", Y.shape)  # Expected: (num_samples,)
  return X, Y

# 80% / 10% / 10% split by word: n1 and n2 are the cut points into `words`.
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtrain, Ytrain = build_dataset(words[:n1])    # first 80% of the words
Xdev, Ydev = build_dataset(words[n1:n2])      # next 10%
Xtest, Ytest = build_dataset(words[n2:])      # final 10%

X shape: torch.Size([186474, 8])
Y shape: torch.Size([186474])
X shape: torch.Size([23373, 8])
Y shape: torch.Size([23373])
X shape: torch.Size([23205, 8])
Y shape: torch.Size([23205])


In [5]:
# Decode the first 20 training pairs back to characters: context --> target.
for context_ids, target_id in zip(Xtrain[:20], Ytrain[:20]):
  context_text = ''.join(itos[token.item()] for token in context_ids)
  print(context_text, '-->', itos[target_id.item()])

........ --> p
.......p --> r
......pr --> z
.....prz --> e
....prze --> m
...przem --> y
..przemy --> s
.przemys --> ł
przemysł --> a
rzemysła --> w
zemysław --> .
........ --> k
.......k --> e
......ke --> i
.....kei --> r
....keir --> r
...keirr --> a
..keirra --> .
........ --> a
.......a --> u


In [6]:


class Linear:
    """Affine layer computing ``x @ weight (+ bias)``.

    The weight has shape (fan_in, fan_out) and is drawn from a standard normal
    scaled by 1/sqrt(fan_in); the bias, when enabled, starts at zero.
    The most recent output is kept in ``self.out``.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        scale = fan_in**0.5
        self.weight = torch.randn((fan_in, fan_out)) / scale
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        projected = x @ self.weight
        self.out = projected if self.bias is None else projected + self.bias
        return self.out

    def parameters(self):
        """Return the trainable tensors: the weight, plus the bias if present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

class BatchNorm1d:
    """Batch normalization over the last (feature) dimension.

    In training mode the statistics are computed from the current input:
    over dim 0 for 2D input and over dims (0, 1) for 3D input. In eval mode
    (``self.training = False``) the running estimates are used instead.

    Args:
        dim: number of features (size of the last input dimension).
        eps: constant added to the variance before the square root.
        momentum: weight of the current batch in the running estimates.

    Raises:
        ValueError: in training mode, if the input is neither 2D nor 3D.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True

        # trainable scale and shift
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # running estimates, updated outside of autograd; always shape (dim,)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0, 1)
            else:
                # Previously fell through with `dim` unbound (UnboundLocalError).
                raise ValueError(
                    f"BatchNorm1d expects a 2D or 3D input, got {x.ndim}D"
                )

            xmean = x.mean(dim=dim, keepdim=True)
            xvar = x.var(dim=dim, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        if self.training:
            with torch.no_grad():
                # The batch statistics carry keepdim axes ((1, dim) or (1, 1, dim));
                # flatten them so the running buffers keep their (dim,) shape
                # instead of silently changing shape after the first step.
                self.running_mean = (
                    (1 - self.momentum) * self.running_mean
                    + self.momentum * xmean.reshape(-1)
                )
                self.running_var = (
                    (1 - self.momentum) * self.running_var
                    + self.momentum * xvar.reshape(-1)
                )

        return self.out

    def parameters(self):
        """Return the trainable tensors (gamma and beta)."""
        return [self.gamma, self.beta]
    
class LeakyRelu:
    """Parameter-free leaky ReLU activation (default negative slope of
    ``torch.nn.functional.leaky_relu``); the last output is kept in ``self.out``."""

    def __call__(self, x):
        activated = torch.nn.functional.leaky_relu(x)
        self.out = activated
        return activated

    def parameters(self):
        """This layer has nothing to train."""
        return []
    
class Relu:
    """Parameter-free ReLU activation; the last output is kept in ``self.out``."""

    def __call__(self, x):
        activated = torch.nn.functional.relu(x)
        self.out = activated
        return activated

    def parameters(self):
        """This layer has nothing to train."""
        return []
    

class Adam:
    """Minimal Adam optimizer with bias-corrected moment estimates.

    Args:
        params: iterable of tensors to optimize. It is materialized into a
            list once, so a generator can be passed safely.
        lr: step size.
        beta1: decay rate of the first-moment (mean) estimate.
        beta2: decay rate of the second-moment (uncentered variance) estimate.
        eps: constant added to the denominator for numerical stability.

    State is kept per parameter in ``m_d`` / ``v_d`` (keyed by ``id(param)``);
    ``t`` is a single step counter shared by all parameters.
    """

    def __init__(self, params, lr=0.001, beta1 = 0.9, beta2 = 0.999, eps = 1e-08):
        self.lr = lr
        # Materialize first: a generator would otherwise be exhausted by the
        # first dict comprehension below, leaving nothing for step() to update.
        self.params = list(params)
        self.beta1 = torch.tensor(beta1)
        self.beta2 = torch.tensor(beta2)
        self.eps = eps
        self.m_d = {id(p): torch.zeros_like(p) for p in self.params}
        self.v_d = {id(p): torch.zeros_like(p) for p in self.params}
        self.t = 1

    def step(self):
        """Apply one Adam update to every parameter that has a gradient."""
        with torch.no_grad():
            for param in self.params:
                grad = param.grad
                if grad is None:
                    # Parameter did not take part in the last backward pass.
                    continue
                key = id(param)

                next_m = self.beta1 * self.m_d[key] + (1.0 - self.beta1) * grad
                next_v = self.beta2 * self.v_d[key] + (1.0 - self.beta2) * torch.pow(grad, 2)

                # Bias correction: both moments start at zero, so early
                # estimates are scaled up by 1 / (1 - beta**t).
                m_hat = next_m / (1 - torch.pow(self.beta1, self.t))
                v_hat = next_v / (1 - torch.pow(self.beta2, self.t))

                param.data = param.data - self.lr * m_hat / (torch.sqrt(v_hat) + self.eps)

                self.m_d[key] = next_m
                self.v_d[key] = next_v
        self.t += 1


class Embedding:
    """Lookup table mapping integer ids to rows of a
    (num_embeddings, embeddings_dim) weight matrix drawn from a standard normal."""

    def __init__(self, num_embeddings, embeddings_dim):
        table_shape = (num_embeddings, embeddings_dim)
        self.weight = torch.randn(table_shape)

    def __call__(self, IX):
        # Index the table with the id tensor; the result gains a trailing
        # embeddings_dim axis.
        looked_up = self.weight[IX]
        self.out = looked_up
        return looked_up

    def parameters(self):
        """Return the embedding table as the only trainable tensor."""
        return [self.weight]
    

class Flatten:
    """Concatenate every ``n`` consecutive time steps along the feature axis.

    A (B, T, C) input becomes (B, T // n, C * n); when only one group remains
    the time axis is dropped, giving (B, C * n).
    """

    def __init__(self, n):
        self.n = n

    def __call__(self, x):
        batch, steps, channels = x.shape
        grouped = x.view(batch, steps // self.n, channels * self.n)
        # A single remaining group carries no sequence information: drop the axis.
        self.out = grouped.squeeze(1) if grouped.shape[1] == 1 else grouped
        return self.out

    def parameters(self):
        """This layer has nothing to train."""
        return []

class Sequential:
    """Container that feeds its input through ``layers`` in order."""

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        self.out = activation
        return self.out

    def parameters(self):
        """Return the parameters of every layer as one flat list, in layer order."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [7]:
n_embd = 24     # width of each character embedding
n_hidden = 128  # width of every hidden Linear layer

# Hierarchical model: each stage fuses pairs of neighbouring positions
# (Flatten(2)) before a Linear -> BatchNorm1d -> LeakyRelu block.
# The Linear layers skip the bias because the BatchNorm1d that follows adds
# its own shift (beta).
model = Sequential([
    Embedding(vocab_size, n_embd),
    # stage 1
    Flatten(2),
    Linear(n_embd * 2, n_hidden, bias=False),
    BatchNorm1d(n_hidden),
    LeakyRelu(),
    # stage 2
    Flatten(2),
    Linear(n_hidden * 2, n_hidden, bias=False),
    BatchNorm1d(n_hidden),
    LeakyRelu(),
    # stage 3
    Flatten(2),
    Linear(n_hidden * 2, n_hidden, bias=False),
    BatchNorm1d(n_hidden),
    LeakyRelu(),
    # output logits over the vocabulary
    Linear(n_hidden, vocab_size),
])

# Scale down the output layer's initial weights by 10x.
with torch.no_grad():
    model.layers[-1].weight.mul_(0.1)

parameters = model.parameters()
print(sum(p.numel() for p in parameters))  # total number of scalar parameters
for p in parameters:
    p.requires_grad_(True)

77497


In [31]:
# Draw a tiny random batch of 4 examples and run it through the model so each
# layer's `.out` can be inspected in the following cells.
ix = torch.randint(0, len(Xtrain), (4,))
Xb = Xtrain[ix]
Yb = Ytrain[ix]
logits = model(Xb)
print(Xb.shape)
Xb

torch.Size([4, 8])


tensor([[ 0,  0,  8,  1, 18, 12,  5, 14],
        [ 0, 20, 26,  9, 16, 16, 15, 18],
        [ 0,  0,  0, 19,  8,  1, 14,  5],
        [ 0,  0,  0,  0,  0, 13,  1, 18]])

In [9]:
# Output shapes of the first three layers for the last forward pass.
for layer in model.layers[:3]:
    print(layer.out.shape)

torch.Size([4, 8, 24])
torch.Size([4, 4, 48])
torch.Size([4, 4, 128])


In [10]:
# Layer-by-layer summary: class name and output shape of the last forward pass.
for layer in model.layers:
    layer_name = type(layer).__name__
    print(layer_name, ":", tuple(layer.out.shape))

Embedding : (4, 8, 24)
Flatten : (4, 4, 48)
Linear : (4, 4, 128)
BatchNorm1d : (4, 4, 128)
LeakyRelu : (4, 4, 128)
Flatten : (4, 2, 256)
Linear : (4, 2, 128)
BatchNorm1d : (4, 2, 128)
LeakyRelu : (4, 2, 128)
Flatten : (4, 256)
Linear : (4, 128)
BatchNorm1d : (4, 128)
LeakyRelu : (4, 128)
Linear : (4, 33)


In [21]:
# Shape of whichever batch was assigned to Xb most recently
# (the 4-example debug batch or a batch_size-example training batch).
print(Xb.shape)

torch.Size([32, 8])


In [25]:
# Embedding table shape: (vocab_size, n_embd).
print(model.layers[0].weight.shape)

torch.Size([33, 24])


In [28]:
# Run only the embedding layer on the current batch and inspect its output.
layer_emb = model.layers[0]
print(layer_emb( Xb ).shape)
# NOTE(review): this prints the whole embedded batch; a slice would keep the output short.
print(layer_emb.out)

torch.Size([32, 8, 24])
tensor([[[ 1.9282,  1.4803,  0.9018,  ...,  1.0649,  0.8136,  1.6325],
         [ 1.9282,  1.4803,  0.9018,  ...,  1.0649,  0.8136,  1.6325],
         [ 1.9282,  1.4803,  0.9018,  ...,  1.0649,  0.8136,  1.6325],
         ...,
         [ 0.4646, -1.1777,  0.0978,  ...,  0.3833,  0.7292,  1.1939],
         [-0.4620,  1.0813, -0.4895,  ..., -1.0283, -0.6695,  0.8262],
         [ 0.0053,  0.3012,  0.1648,  ...,  1.1496, -0.7225,  0.6453]],

        [[ 1.9282,  1.4803,  0.9018,  ...,  1.0649,  0.8136,  1.6325],
         [ 1.9282,  1.4803,  0.9018,  ...,  1.0649,  0.8136,  1.6325],
         [ 1.9282,  1.4803,  0.9018,  ...,  1.0649,  0.8136,  1.6325],
         ...,
         [ 0.1649, -2.1314, -0.1511,  ...,  0.2394, -0.5500,  0.0448],
         [ 0.1649, -2.1314, -0.1511,  ...,  0.2394, -0.5500,  0.0448],
         [ 1.2738,  1.3219,  0.6435,  ...,  2.1303, -1.2030, -0.4413]],

        [[ 1.9282,  1.4803,  0.9018,  ...,  1.0649,  0.8136,  1.6325],
         [ 1.9282,  1

In [14]:
# Training configuration.
max_steps = 20000
batch_size = 32
lossi = []  # log10 of the mini-batch loss, one entry per step
ud = []     # per step: log10(lr * grad.std() / data.std()) for every parameter
lr = 0.001
# Uses the hand-written Adam class defined in this notebook rather than torch.optim.Adam.
optimizer = Adam([p for p in parameters], lr=lr)

for i in range(max_steps):
    # sample a random mini-batch (with replacement) from the training split
    ix = torch.randint(0, Xtrain.shape[0], (batch_size, ))
    Xb, Yb = Xtrain[ix], Ytrain[ix]

    # forward pass:
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)
    
    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # adam optimizer:
    optimizer.step()

    # (plain SGD alternative: p.data -= lr * p.grad for every parameter)
    if i % 100 == 0:
        print(f"{i:7d}/{max_steps:7d}: {loss.item():.4f}")
    lossi.append(loss.log10().item())
    with torch.no_grad():
        # NOTE(review): this ratio is computed after optimizer.step(), so p.data is
        # already updated, and lr * grad is the SGD step size rather than the
        # Adam update actually applied. Treat it as a rough diagnostic only.
        ud.append([(lr * p.grad.std() / p.data.std()).log10().item() for p in parameters])
    


      0/  20000: 3.4947
    100/  20000: 2.5430
    200/  20000: 2.4220


KeyboardInterrupt: 

In [30]:

# dtypes of the two tensors passed to F.cross_entropy in the training loop.
logits.dtype, Yb.dtype

(torch.float32, torch.int64)

In [18]:
# Show the batch shapes, then every context (as token ids) with its target id.
# The shapes are printed explicitly: a bare expression in the middle of a cell
# is evaluated and discarded, so it was never displayed.
print(Xb.shape, Yb.shape)
for x, y in zip(Xb, Yb):
    print(x.tolist(), '-->', y.item())

[0, 0, 0, 0, 11, 25, 18, 5] --> 5
[0, 0, 0, 0, 5, 13, 13, 1] --> 12
[0, 0, 22, 9, 14, 9, 3, 9] --> 21
[0, 0, 0, 0, 0, 0, 22, 1] --> 14
[0, 0, 0, 19, 25, 11, 19, 20] --> 21
[0, 0, 0, 0, 0, 0, 11, 8] --> 5
[0, 0, 0, 0, 0, 19, 23, 1] --> 18
[0, 0, 0, 9, 26, 15, 12, 4] --> 1
[0, 0, 0, 0, 0, 26, 15, 18] --> 9
[0, 0, 0, 0, 0, 11, 5, 11] --> 15
[0, 0, 0, 0, 0, 0, 0, 4] --> 1
[0, 0, 0, 0, 0, 0, 0, 0] --> 19
[0, 0, 0, 0, 11, 1, 13, 9] --> 12
[0, 0, 0, 0, 26, 1, 13, 1] --> 14
[0, 0, 0, 0, 0, 0, 0, 5] --> 22
[0, 0, 0, 0, 0, 0, 0, 0] --> 10
[0, 8, 1, 16, 16, 9, 14, 5] --> 19
[0, 0, 0, 0, 0, 0, 0, 25] --> 1
[0, 0, 0, 0, 0, 13, 5, 12] --> 18
[0, 0, 0, 0, 0, 0, 0, 1] --> 5
[0, 0, 0, 0, 0, 10, 1, 13] --> 5
[0, 0, 0, 0, 0, 0, 1, 4] --> 22
[0, 0, 0, 0, 13, 15, 14, 20] --> 18
[0, 0, 0, 6, 1, 12, 3, 15] --> 14
[0, 0, 0, 0, 0, 19, 1, 9] --> 12
[0, 0, 0, 0, 0, 0, 0, 0] --> 20
[0, 0, 0, 19, 15, 16, 8, 9] --> 1
[0, 0, 0, 0, 12, 21, 4, 15] --> 19
[0, 0, 0, 0, 0, 25, 15, 19] --> 8
[0, 0, 0, 0, 0, 0, 0, 0] --> 1

In [11]:
# Plot the training curve averaged over windows of `smooth_window` steps.
# Only complete windows are averaged: `view(-1, smooth_window)` raises unless the
# number of recorded steps is an exact multiple, which is not the case after an
# interrupted run. With fewer steps than one window the raw curve is plotted.
smooth_window = 1000
loss_curve = torch.tensor(lossi)
n_complete = (len(loss_curve) // smooth_window) * smooth_window
if n_complete > 0:
    loss_curve = loss_curve[:n_complete].view(-1, smooth_window).mean(1)
plt.plot(loss_curve);

In [12]:
# Switch the model to evaluation mode: BatchNorm1d reads `training` and, when it
# is False, normalizes with its running statistics instead of batch statistics.
# The flag is set on every layer; layers that never read it are unaffected.
for layer in model.layers:
    layer.training = False

In [13]:
# evaluate the loss
@torch.no_grad() # this decorator disables gradient tracking inside pytorch
def split_loss(split):
  """Print the cross-entropy loss of `model` on one whole dataset split.

  `split` must be 'train', 'val' or 'test'; any other value raises KeyError.
  The full split is evaluated in a single forward pass.
  """
  datasets = {
    'train': (Xtrain, Ytrain),
    'val': (Xdev, Ydev),
    'test': (Xtest, Ytest),
  }
  inputs, targets = datasets[split]
  loss = F.cross_entropy(model(inputs), targets)
  print(split, loss.item())

# Report the loss on the training and validation splits
# (split_loss also accepts 'test', which is not evaluated here).
split_loss('train')
split_loss('val')

In [14]:
# sample from the model
# Sampling never needs gradients. Because the parameters have requires_grad=True,
# running the forward passes outside of no_grad would build an autograd graph on
# every step for nothing.
# NOTE: the layers must be in eval mode (training = False) before running this
# cell; in training mode BatchNorm1d would compute the variance of a single-row
# batch, which is NaN.
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size # initialize with all ...
        while True:
            # forward pass the neural net on a batch containing just this context
            logits = model(torch.tensor([context]))
            probs = F.softmax(logits, dim=1)
            # sample from the distribution
            ix = torch.multinomial(probs, num_samples=1).item()
            # shift the context window and track the samples
            context = context[1:] + [ix]
            out.append(ix)
            # if we sample the special '.' token, break
            if ix == 0:
                break

        print(''.join(itos[i] for i in out[:-1])) # decode and print the generated word

In [34]:
def one_hot_encode(sequence, vocab_size=256):
    """One-hot encode a 1-D tensor of integer ids.

    Returns a float tensor of shape (len(sequence), vocab_size) that is zero
    everywhere except for a 1 at column sequence[i] in row i.
    """
    column_index = sequence[:, None]  # one target column per row
    return torch.zeros(sequence.shape[0], vocab_size).scatter_(1, column_index, 1)

# Example sequence of ids, all below the default vocab_size of 256
sequence = torch.tensor([131, 128, 129, 131, 131, 130, 131, 132, 121, 121, 122, 125, 126, 125, 124, 126])

# One-hot encode the sequence: the result has one row per id and 256 columns.
one_hot_sequence = one_hot_encode(sequence)
# NOTE(review): the printed summary elides the middle columns, where the ones are, so only zeros are visible.
print(one_hot_sequence) 

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
