In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
%matplotlib inline
# Check if GPU is available
output_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


In [2]:
names = open('names.txt','r').read().splitlines()
names[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
chars = sorted(list(set(''.join(names))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(stoi)

In [4]:
# build dataset
block_size = 8 # context length: # of characters taken for prediction

def build_dataset(names):
    X, Y = [], []

    for name in names:
        context = [0] * block_size
        for ch in name + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

import random
random.seed(42)
random.shuffle(names)
n1 = int(0.8 * len(names))
n2 = int(0.9 * len(names))

Xtr, Ytr = build_dataset(names[:n1]) # training set 80%
Xdev, Ydev = build_dataset(names[n1:n2]) # dev set 10%
Xte, Yte = build_dataset(names[n2:]) # test set 10%


In [5]:
for x,y in zip(Xtr[:15], Ytr[:15]):
    print(''.join([itos[i.item()] for i in x]), '-->', itos[y.item()])

........ --> y
.......y --> u
......yu --> h
.....yuh --> e
....yuhe --> n
...yuhen --> g
..yuheng --> .
........ --> d
.......d --> i
......di --> o
.....dio --> n
....dion --> d
...diond --> r
..diondr --> e
.diondre --> .


In [6]:
# 'torch'ify the network syntax
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.name = 'Linear'
        self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5
        self.bias = torch.zeros(fan_out) if bias else None
    
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
  
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

    
class BatchNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.name = 'BatchNorm1d'
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)
    
    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0,1)
            xmean = x.mean(dim, keepdim=True) # batch mean
            xvar =   x.var(dim, keepdim=True) # batch variance
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]

# +---------------------Activation Functions-----------------------------------+

class Tanh:
    def __init__(self):
        self.name = 'Tanh'
        self.out = None
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []

class Sigmoid:
    def __init__(self):
        self.name = 'Sigmoid'
        self.out = None
    def __call__(self, x):
        self.out = torch.sigmoid(x)
        return self.out
    def parameters(self):
        return []

class ReLU:
    def __init__(self):
        self.name = 'ReLU'
        self.out = None
    def __call__(self, x):
        self.out = torch.relu(x)
        return self.out
    def parameters(self):
        return []
    
#+---------------------------Utility Functions---------------------------------+
    
class Embedding:
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))

    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out
    
    def parameters(self):
        return [self.weight]

class FlattenConsecutive:
    def __init__(self, n):
        self.n = n

    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out
    
    def parameters(self):
        return []

class Sequential:
    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

In [7]:
n_emb = 24 # dimension of character embedding vectors
n_hidden = 128 # # of neurons in hidden layer

model = Sequential([
    Embedding(vocab_size, n_emb),
    FlattenConsecutive(2), Linear(n_emb    * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
])

# parameters initialization
# with torch.no_grad():
#   model[-1].weight *= 0.1

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
"""
    you can check quility of code improvement by running different archatecutre 
  iterations with the same number of parametes. for example, creating the 
  flatten consecutive layer increase the number of parameters by almost 8 fold
  so we decreaded the # of hidden neruons in each layer so it was the same total
  number of parameters as the previous model in order to see if the model improved 
  in its use of parameters
"""
for p in parameters:
  p.requires_grad = True

76579


In [8]:
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(f'Xb shape: {Xb.shape}')
Xb

Xb shape: torch.Size([4, 8])


tensor([[ 0,  0,  0,  0,  0,  0,  0, 10],
        [ 0,  0,  0,  0,  0, 13,  1, 18],
        [ 0,  0,  0,  0,  0,  0,  0, 12],
        [ 0,  0, 13, 15, 25, 19,  5, 19]])

In [9]:
print(f'Embedding output shape:   {model.layers[0].out.shape}') # shape of embedding output
print(f'Flatten output shape:     {model.layers[1].out.shape}') # shape of flatten output
print(f'Linear output shape:      {model.layers[2].out.shape}') # shape of linear output

Embedding output shape:   torch.Size([4, 8, 24])
Flatten output shape:     torch.Size([4, 4, 48])
Linear output shape:      torch.Size([4, 4, 128])


In [10]:
for layer in model.layers:
    print(f'{layer.__class__.__name__:18} --> {tuple(layer.out.shape)}')

Embedding          --> (4, 8, 24)
FlattenConsecutive --> (4, 4, 48)
Linear             --> (4, 4, 128)
BatchNorm1d        --> (4, 4, 128)
Tanh               --> (4, 4, 128)
FlattenConsecutive --> (4, 2, 256)
Linear             --> (4, 2, 128)
BatchNorm1d        --> (4, 2, 128)
Tanh               --> (4, 2, 128)
FlattenConsecutive --> (4, 256)
Linear             --> (4, 128)
BatchNorm1d        --> (4, 128)
Tanh               --> (4, 128)
Linear             --> (4, 27)


In [11]:
# optimization
max_steps = 200000
batch_size = 32
bn_eps = 1e-5 # epsilon for numerical stability in batch normalization (/0 error)
lossi = []
ud = []

for i in range(max_steps):
    # minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix]

    #forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    #update
    lr = 0.1 if i < 150000 else 0.01 # learning rate decay
    for p in parameters:
        p.data += -lr * p.grad 
    
    # track stats
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
    with torch.no_grad():
        ud.append([(lr*p.grad.std() / p.data.std()).log10().item() for p in parameters])
    if i >= 50000:
        break

      0/ 200000: 3.4415
  10000/ 200000: 1.7891
  20000/ 200000: 2.1357
  30000/ 200000: 1.7475
  40000/ 200000: 1.6545
  50000/ 200000: 2.1150


In [12]:
plt.plot(torch.tensor(lossi).view(-1,1000).mean(1))

RuntimeError: shape '[-1, 1000]' is invalid for input of size 50001

In [None]:
# forward pass activation distribution
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i, layer in enumerate(layers[:-1]): # note: exclude the output layer
  if isinstance(layer, Tanh):
    t = layer.out
    print('layer %d (%10s): mean %+.2f, std %.2f, saturated: %.2f%%' % (i, layer.__class__.__name__, t.mean(), t.std(), (t.abs() > 0.97).float().mean()*100))
    hy, hx = torch.histogram(t, density=True)
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'layer {i} ({layer.__class__.__name__}')
plt.legend(legends);
plt.title('activation distribution')

In [None]:
# backward pass gradients distribution
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i, layer in enumerate(layers[:-1]): # note: exclude the output layer
  if isinstance(layer, Tanh):
    t = layer.out.grad
    print('layer %d (%10s): mean %+f, std %e' % (i, layer.__class__.__name__, t.mean(), t.std()))
    hy, hx = torch.histogram(t, density=True)
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'layer {i} ({layer.__class__.__name__}')
plt.legend(legends);
plt.title('gradient distribution');

In [None]:
# weights updated w/ SGD distribution
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i,p in enumerate(parameters):
  t = p.grad
  if p.ndim == 2:
    print('weight %10s | mean %+f | std %e | grad:data ratio %e' % (tuple(p.shape), t.mean(), t.std(), t.std() / p.std()))
    hy, hx = torch.histogram(t, density=True)
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'{i} {tuple(p.shape)}')
plt.legend(legends)
plt.title('weights gradient distribution');

In [None]:
# data update to data ratio
plt.figure(figsize=(20, 4))
legends = []
for i,p in enumerate(parameters):
  if p.ndim == 2:
    plt.plot([ud[j][i] for j in range(len(ud))])
    legends.append('param %d' % i)
plt.plot([0, len(ud)], [-3, -3], 'k') # these ratios should be ~1e-3, indicate on plot
plt.legend(legends);

In [None]:
# eval mode of layers
for layer in model.layers:
    layer.training = False

In [None]:
# loss evaluation
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }[split]
  logits = model(x)
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

split_loss('train')
split_loss('val')

train 1.963224172592163
val 2.059967517852783


In [None]:
# model sampling
@torch.no_grad()
def sample_model(n):
    for _ in range(n):
        out = []
        context = [0] * block_size
        while True:
            # forward pass
            logits = model(torch.tensor([context]))
            probs = F.softmax(logits, dim=-1)
            # sampling
            ix = torch.multinomial(probs, num_samples=1).item()
            # shift context
            context = context[1:] + [ix]
            if ix == 0:
                break
            out.append(itos[ix])
        print(''.join(out))
sample_model(20)

trevin
kyriah
daysa
bryson
hillh
tropery
velonia
sileen
aishaa
kenun
hevericy
adriana
elysah
alle
jacklynn
kathani
tocelin
bamer
laeyah
elonni
