In [1]:
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set_theme(style="dark")
%matplotlib inline
''' %matplotlib inline sets the backend of matplotlib to
the 'inline' backend. When using the 'inline' backend,
your matplotlib graphs will be included in your notebook,
next to the code.'''

# # for creating a responsive plot
# %matplotlib ipympl
# %matplotlib widget

import torch
import torch.nn.functional as F
'''
To get same results when sampling during different runs.
If you are using cuDNN, you should set the deterministic behavior.
This might make your code quite slow, but might be a good method to check your code and deactivate it later.
'''
torch.backends.cudnn.deterministic = True

In [2]:

words = open('names.txt', 'r').read().splitlines()

In [3]:
# sets takes an input, create a set of all items,
# & doesn't allow duplicates :)
# then we want a sorted list of course, the order matters! 
chars = sorted(list(set(''.join(words))))
# a map or dict:
# start from 1
s_to_i = { s:i for i, s in enumerate(chars, start=1)}
s_to_i['.'] = 0
i_to_s = { i:s for s,i in s_to_i.items()}

In [4]:
import itertools
def plot_heatmap(tensor, text=True, nrow=None, ncol=None, fig_size=(10,10), cmap='Blues', textc='gray'):
    if (nrow is None) or (ncol is None):
        nrow = tensor.shape[0]
        ncol = tensor.shape[1]
    plt.figure(figsize=fig_size)
    plt.imshow(tensor.detach().numpy(), cmap= cmap)
    # manually write text on each cell (seaborn annot doesn't look good)
    if text:
        for i, j in itertools.product(range(nrow), range(ncol)):
            # x:col, y:rows, the origin is top left corner, makes bottom <->top
            plt.text(x=j, y=i, s=f'{tensor[i,j].item():.2f}', ha='center', va='center', color=textc)
    plt.axis('off')

In [5]:
def plot_2d_emb(emb_lkt, nclass, figsize=(8,8)):
    plt.figure(figsize=figsize)
    plt.scatter(x= emb_lkt[:,0].data, y=emb_lkt[:,1].data, s=200)
    for i in range(nclass):
        plt.text(x=emb_lkt[i,0].item(), y=emb_lkt[i,1].item(), s=i_to_s[i], ha='center', va='center', color='white')
    plt.grid('minor')

# def plot_3d_emb(emb_lkt, nclass, figsize=(8,8)):
#     tensor = emb_lkt.data.detach().numpy()
#     fig = plt.figure(figsize=figsize)
#     ax = Axes3D(fig)
#     ax.scatter(xs= tensor[:,0], ys=tensor[:,1], zs=tensor[:,2], s=200)
#     for i in range(nclass):
#         ax.text(x=tensor[i,0], y=tensor[i,1],z=tensor[i,2], s=i_to_s[i], ha='center', va='center', color='white')
#     # displaying the plot
#     plt.grid('minor')
#     plt.show()

In [6]:
WINDOW_SIZE = 5
NCLASS = 27

def build_dataset(words:list, type:str):
    xs, ys = [], []

    # context input window: how many characters do we take to predict the next one?
    

    for w in words:
        context_window = [0]*WINDOW_SIZE
        for ch in f'{w}.':
            idx_y = s_to_i[ch]
            ys.append(idx_y)

            xs.append(context_window) # input : ch1
            # print(''.join(i_to_s[i] for i in context_window) + f' ---> {ch}' )

            # shift_to_left context window and append the idx_y
            context_window = context_window[1:] + [idx_y]
    xs = torch.tensor(xs)
    # ys = torch.Tensor(ys)
    ys = torch.tensor(ys)
    print(f'{type}: {xs.shape = } , {ys.shape = }')
    return xs, ys


import random
random.seed(42)
random.shuffle(words)
n80 = int( 0.8*len(words) )
n90 = int( 0.9*len(words) )
Xtr, Ytr = build_dataset(words[:n80], 'Tr')
Xdev, Ydev = build_dataset(words[n80:n90], 'Dev')
Xts, Yts = build_dataset(words[n90:], 'Ts')
TR_SIZE = Xtr.shape[0]
DEV_SIZE = Xdev.shape[0]
TS_SIZE = Xts.shape[0]
print(f'{TR_SIZE = }')
print(f'{WINDOW_SIZE = }')
print(f'{NCLASS = }')

Tr: xs.shape = torch.Size([182625, 5]) , ys.shape = torch.Size([182625])
Dev: xs.shape = torch.Size([22655, 5]) , ys.shape = torch.Size([22655])
Ts: xs.shape = torch.Size([22866, 5]) , ys.shape = torch.Size([22866])
TR_SIZE = 182625
WINDOW_SIZE = 5
NCLASS = 27


# Implementing Backpropagation manually for intuitive understanding

## a simple model with 1 hidden layer

In [15]:
EMB_DIM = 6
HLAYER_SIZE = 200

g = torch.Generator().manual_seed(2147483647)
emb_lkt = torch.randn( NCLASS, EMB_DIM, generator=g)

# Layer 1 : linear
W1 = torch.randn( WINDOW_SIZE*EMB_DIM, HLAYER_SIZE , generator=g) * 5 / (3*(WINDOW_SIZE*EMB_DIM)**0.5)
b1 = torch.randn( 1, HLAYER_SIZE, generator=g) * 0.1 # just for fun

# Layer 2 : Batch Norm
bn_gain = torch.ones(1, HLAYER_SIZE) * 0.1 + 1.0
bn_bias = torch.zeros(1, HLAYER_SIZE) * 0.1
# these are not model params, we update them recursively
bn_mean_ema = torch.zeros(1, HLAYER_SIZE)
bn_std_ema = torch.ones(1, HLAYER_SIZE)


# Layer 3 : Linear
W2 = torch.randn( HLAYER_SIZE, NCLASS, generator=g) * 0.1
b2 = torch.randn( 1, NCLASS, generator=g) * 0.1

parameters = [emb_lkt, W1, b1, W2, b2, bn_gain , bn_bias]
for p in parameters:
    p.requires_grad = True
    p.grad = None
sum(p.nelement() for p in parameters)

18189

In [65]:
lr = 0.1
BATCH_SIZE = 32
NSTEPS = 100000

# we don't need log loss anymore bc we don't have
# that hockey stick shape anymore
lossi = []
stepi = []

# just to prevent division by zero, in case the bn_std = 0
DIVISION0 = 1e-5
# momentum for moving average
# the lower the BATCH_SIZE the lower the MOMENTUM!!
MOMENTUM = 0.001
with torch.no_grad():
    for iter in range(NSTEPS):
            
        # Forward path

        # cinstruct a mini batch 
        mini_batch_idx = torch.randint(low=0, high=TR_SIZE, size=(BATCH_SIZE,))
        Xb = Xtr[mini_batch_idx] # [BATCH_SIZE, WINDOW_SIZE]
        Yb = Ytr[mini_batch_idx] # [BATCH_SIZE]
        # transform it to embeddings
        batch_emb = emb_lkt[Xb] # BATCH_SIZE, WINDOW_SIZE, emb_dim
        batch_emb_cat = batch_emb.view(BATCH_SIZE, WINDOW_SIZE*EMB_DIM)

        # layer 1 : linear
        hprebn = batch_emb_cat@W1 + b1

        # layer 2: batch norm
        bn_meani = hprebn.mean(dim=0, keepdim=True)
        # note: Bessel's correction for computing Variance
        # dividing by (BATCH_SIZE-1) instead of BATCH_SIZE
        bn_vari = hprebn.var(dim=0, keepdim=True, unbiased=True)
        bn_std_inv = (bn_vari + DIVISION0)**-0.5

        bn_raw = bn_std_inv * (hprebn - bn_meani)

        hpreact = bn_bias + bn_gain * bn_raw

        # pass the training set through
        
        bn_mean_ema = (1-MOMENTUM) * bn_mean_ema + MOMENTUM * bn_meani
        bn_std_ema = (1-MOMENTUM) * bn_std_ema + MOMENTUM * bn_std_inv**-1

        h = torch.tanh(hpreact) # BATCH_SIZE, HLAYER_SIZE

        logits = h @ W2 + b2 # * log counts [BATCH_SIZE, NCLASS]

        loss_mini = F.cross_entropy(logits, Yb)

        lossi.append(loss_mini.log10().item())
        stepi.append(i)

        # PyTorch backward pass
        for p in parameters:
            p.grad = None

        # loss_mini.backward()

        # Manual Backprop
        d_logits = F.softmax(logits, dim=1)
        d_logits[torch.arange(BATCH_SIZE), Yb] -= 1.0
        ''' unshortcutted gradients:
        gradient of batch mean : replication of 1/BATCH_SIZE
        '''
        # mean in the forward pass -> replication in the backward
        d_logits /= BATCH_SIZE

        d_h = d_logits @ W2.T # BATCH_SIZE , HLAYER_SIZE
        d_W2 = h.T @ d_logits # HLAYER_SIZE , NCLASS
        d_b2 = d_logits.sum(dim=0, keepdim=True)

        d_hpreact = (1.0 - h**2) * d_h

        d_bn_bias = d_hpreact.sum(dim=0, keepdim=True)
        d_bn_gain = (d_hpreact * bn_raw).sum(dim=0, keepdim=True)
        d_bn_raw = d_hpreact * bn_gain
        d_hprebn = (1/BATCH_SIZE) * bn_std_inv * ( BATCH_SIZE * d_bn_raw - d_bn_raw.sum(dim=0, keepdim=True) - bn_raw * (BATCH_SIZE/(BATCH_SIZE-1)) * (d_bn_raw*bn_raw).sum(dim=0, keepdim=True) )
        
        d_batch_emb_cat = d_hprebn @ W1.T # BATCH_SIZE , Window*emb
        d_W1 = batch_emb_cat.T @ d_hprebn # window*emb , HLAYER_SIZE
        # broadcast in forward -> vector sum in backward
        d_b1 = d_hprebn.sum(dim=0, keepdim=True)

        d_batch_emb = d_batch_emb_cat.view(-1, WINDOW_SIZE, EMB_DIM )
        d_emb_lkt = torch.zeros_like(emb_lkt)
        # the gradient for those characters who were present in the batch
        # accumulate multiple occurrence
        for i,j in itertools.product(range(BATCH_SIZE), range(WINDOW_SIZE)):
            idx = Xb[i,j] # idx: 0-26
            d_emb_lkt[idx] += d_batch_emb[i,j]
        

        grads = [d_emb_lkt, d_W1, d_b1, d_W2, d_b2, d_bn_gain , d_bn_bias]
        
        # update Manually
        for p, grad in zip(parameters, grads):
            p.data -= lr * grad

        if iter % 10000 == 0:
            print(f'iteration: {iter:7d} / {NSTEPS:7d} | mini loss: {loss_mini.item():.4f}')
        
        # break # intentionally added, AFTER DEBUG, would take out obviously to run full optimization


iteration:       0 /  100000 | mini loss: 4.1192
iteration:   10000 /  100000 | mini loss: 2.5748
iteration:   20000 /  100000 | mini loss: 2.3034
iteration:   30000 /  100000 | mini loss: 2.3385
iteration:   40000 /  100000 | mini loss: 2.2961
iteration:   50000 /  100000 | mini loss: 1.7824
iteration:   60000 /  100000 | mini loss: 2.0732
iteration:   70000 /  100000 | mini loss: 2.0931
iteration:   80000 /  100000 | mini loss: 2.5569
iteration:   90000 /  100000 | mini loss: 2.5152


In [66]:
@torch.no_grad()
def evaluate_loss(ds_type): # ds_type: dataset type
    X,Y = {
        'train': (Xtr, Ytr),
        'dev': (Xdev, Ydev),
        'test': (Xts, Yts),
    }[ds_type]

    emb = emb_lkt[X] # BATCH_SIZE, WINDOW_SIZE, emb_dim
    batch_emb_cat = emb.view(-1, WINDOW_SIZE*EMB_DIM)
    
    # layer 1 : linear
    hprebn = batch_emb_cat@W1 + b1

    # layer 2: batch norm
    hpreact = bn_bias + bn_gain * (hprebn - bn_mean_ema)/ bn_std_ema

    h = torch.tanh(hpreact) # BATCH_SIZE, HLAYER_SIZE

    logits = h @ W2 + b2 # * log counts [BATCH_SIZE, NCLASS]
    
    loss = F.cross_entropy(logits, Y)
    print(f'{ds_type} loss: {loss.item()}')

In [67]:
evaluate_loss('train')
evaluate_loss('dev')

train loss: 2.077568769454956
dev loss: 2.1396939754486084


## Let's Generate some samples like Bigram

In [68]:

g = torch.Generator().manual_seed(2147483647)
for _ in range(10):
    idx_y = 0
    name = ''
    context_window = [0]*WINDOW_SIZE
    
    while True:
        # Forward path
        emb = emb_lkt[torch.tensor([context_window])] # BATCH_SIZE=1, WINDOW_SIZE, emb_dim
        
        hpreact = emb.view(-1, WINDOW_SIZE*EMB_DIM)@W1 #+ b1
        
        hpreact = bn_bias + bn_gain * (hpreact - bn_mean_ema) / bn_std_ema

        h = torch.tanh(hpreact)

        logits = h @ W2 + b2 # * log counts [BATCH_SIZE, NCLASS]
        probs = F.softmax(logits, dim=1)
        
        idx_y = torch.multinomial(probs.detach(), num_samples=1, replacement=True, generator=g).item()
        # shift_to_left context window and append the idx_y
        context_window = context_window[1:] + [idx_y]
        if idx_y == 0:
            break
        ch = i_to_s[idx_y]
        # print(ch)
        name += ch
    print(name)

mina
tyanne
vistyann
jonila
raha
maryelyani
ardan
kamariana
gilline
jaquantreo
