In [95]:
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set_theme(style="dark")
%matplotlib inline
''' %matplotlib inline sets the backend of matplotlib to
the 'inline' backend. When using the 'inline' backend,
your matplotlib graphs will be included in your notebook,
next to the code.'''

# # for creating a responsive plot
# %matplotlib ipympl
# %matplotlib widget

import torch
import torch.nn.functional as F
torch.manual_seed(1337)
'''
To get same results when sampling during different runs.
If you are using cuDNN, you should set the deterministic behavior.
This might make your code quite slow, but might be a good method to check your code and deactivate it later.
'''
torch.backends.cudnn.deterministic = True
DEVICE = torch.device('cpu')
DEVICE

device(type='cpu')

## Dataset & plot helpers

In [96]:

words = open('names.txt', 'r').read().splitlines()

In [97]:
# sets takes an input, create a set of all items,
# & doesn't allow duplicates :)
# then we want a sorted list of course, the order matters! 
chars = sorted(list(set(''.join(words))))
# a map or dict:
# start from 1
s_to_i = { s:i for i, s in enumerate(chars, start=1)}
s_to_i['.'] = 0
i_to_s = { i:s for s,i in s_to_i.items()}

In [98]:
import itertools
def plot_heatmap(tensor, text=True, nrow=None, ncol=None, fig_size=(10,10), cmap='Blues', textc='gray'):
    if (nrow is None) or (ncol is None):
        nrow = tensor.shape[0]
        ncol = tensor.shape[1]
    plt.figure(figsize=fig_size)
    plt.imshow(tensor.detach().numpy(), cmap= cmap)
    # manually write text on each cell (seaborn annot doesn't look good)
    if text:
        for i, j in itertools.product(range(nrow), range(ncol)):
            # x:col, y:rows, the origin is top left corner, makes bottom <->top
            plt.text(x=j, y=i, s=f'{tensor[i,j].item():.2f}', ha='center', va='center', color=textc)
    plt.axis('off')

In [99]:
def plot_2d_emb(emb_lkt, nclass, figsize=(8,8)):
    plt.figure(figsize=figsize)
    plt.scatter(x= emb_lkt[:,0].data, y=emb_lkt[:,1].data, s=200)
    for i in range(nclass):
        plt.text(x=emb_lkt[i,0].item(), y=emb_lkt[i,1].item(), s=i_to_s[i], ha='center', va='center', color='white')
    plt.grid('minor')

# def plot_3d_emb(emb_lkt, nclass, figsize=(8,8)):
#     tensor = emb_lkt.data.detach().numpy()
#     fig = plt.figure(figsize=figsize)
#     ax = Axes3D(fig)
#     ax.scatter(xs= tensor[:,0], ys=tensor[:,1], zs=tensor[:,2], s=200)
#     for i in range(nclass):
#         ax.text(x=tensor[i,0], y=tensor[i,1],z=tensor[i,2], s=i_to_s[i], ha='center', va='center', color='white')
#     # displaying the plot
#     plt.grid('minor')
#     plt.show()

In [100]:
WINDOW_SIZE = 5
NCLASS = 27

def build_dataset(words:list, type:str):
    xs, ys = [], []

    # context input window: how many characters do we take to predict the next one?
    

    for w in words:
        context_window = [0]*WINDOW_SIZE
        for ch in f'{w}.':
            idx_y = s_to_i[ch]
            ys.append(idx_y)

            xs.append(context_window) # input : ch1
            # print(''.join(i_to_s[i] for i in context_window) + f' ---> {ch}' )

            # shift_to_left context window and append the idx_y
            context_window = context_window[1:] + [idx_y]
    xs = torch.tensor(xs, device= DEVICE) # device= 'cuda' 
    # ys = torch.Tensor(ys)
    ys = torch.tensor(ys, device= DEVICE)
    print(f'{type}: {xs.shape = }')
    return xs, ys


import random
random.seed(42)
random.shuffle(words)
n80 = int( 0.8*len(words) )
n90 = int( 0.9*len(words) )
Xtr, Ytr = build_dataset(words[:n80], 'Tr')
Xdev, Ydev = build_dataset(words[n80:n90], 'Dev')
Xts, Yts = build_dataset(words[n90:], 'Ts')
TR_SIZE = Xtr.shape[0]
DEV_SIZE = Xdev.shape[0]
TS_SIZE = Xts.shape[0]
print(f'{TR_SIZE = }')
print(f'{WINDOW_SIZE = }')
print(f'{NCLASS = }')

Tr: xs.shape = torch.Size([182625, 5])
Dev: xs.shape = torch.Size([22655, 5])
Ts: xs.shape = torch.Size([22866, 5])
TR_SIZE = 182625
WINDOW_SIZE = 5
NCLASS = 27


# Implementing Backpropagation manually for intuitive understanding

## 1. Linear Layer

In [101]:
class Linear():
    def __init__(self, fan_in, fan_out, bias=True) -> None:
        # initialization
        # self.w_gain = w_gain
        self.W = torch.randn(fan_in, fan_out, device= DEVICE)*0.01 # / fan_in**0.5
        self.b = torch.randn(1, fan_out, device= DEVICE)*0.01 if bias else None
    
    # Forward
    def __call__(self, x):
        self.x = x
        self.out = self.x @ self.W
        if self.b is not None:
            self.out += self.b
        return self.out
    
    @torch.no_grad()
    def backward(self, d_out):
        d_in = (d_out @ self.W.T) # BATCH_SIZE , fan_in
        d_W = (self.x.T @ d_out) # fan_in , fan_out
        d_b = d_out.sum(dim=0, keepdim=True) # 1, fan_out
        return d_in, d_W, d_b
    
    def parameters(self):
        return [self.W] + ( [] if self.b is None else [self.b] )

## 2. BatchNorm Layer

In [102]:
class BatchNorm1d():
    def __init__(self, fan_out, eps=1e-5, momentum=0.1, training=True) -> None:
        self.eps = eps
        self.momentum = momentum

        # flexible Gaussian preact
        self.bn_gain = torch.ones(fan_out, device= DEVICE)
        self.bn_bias = torch.zeros(fan_out, device= DEVICE)

        self.training = training

        # Buffers (in PyTorch nomenclature)
        # ema of mean and std
        self.bn_mean_ema = torch.zeros(fan_out, device= DEVICE)
        self.bn_var_ema = torch.ones(fan_out, device= DEVICE)
    
    # Forward
    def __call__(self, x, eps=1e-5):
        self.batch_size = x.shape[0]
        if self.training:
            self.bn_mean = x.mean(dim=0, keepdim=True) # 1, HLAYER_SIZE
            self.bn_var = x.var(dim=0, keepdim=True, unbiased=True)
            self.bn_std_inv = (self.bn_var + eps)**-0.5
            self.bn_raw = (x - self.bn_mean) * self.bn_std_inv
            self.out = self.bn_bias + self.bn_gain * self.bn_raw

            # update moving stats
            with torch.no_grad():
                self.bn_mean_ema = (1 - self.momentum) * self.bn_mean_ema + self.momentum * self.bn_mean
                self.bn_var_ema = (1 - self.momentum) * self.bn_var_ema + self.momentum * self.bn_var
        else:
            with torch.no_grad():
                self.out = self.bn_bias + self.bn_gain * (x - self.bn_mean_ema) * (self.bn_var_ema + eps)**-0.5
        return self.out
    
    @torch.no_grad()
    def backward(self, d_out):
        d_bn_bias = d_out.sum(dim=0, keepdim=True)
        d_bn_gain = (d_out * self.bn_raw).sum(dim=0, keepdim=True)
        d_bn_raw = d_out * self.bn_gain
        d_hprebn = (1/self.batch_size) * self.bn_std_inv * ( self.batch_size * d_bn_raw - d_bn_raw.sum(dim=0, keepdim=True) - self.bn_raw * (self.batch_size/(self.batch_size-1)) * (d_bn_raw*self.bn_raw).sum(dim=0, keepdim=True) )
        return d_hprebn, d_bn_raw, d_bn_gain, d_bn_bias
    
    def parameters(self):
        return [self.bn_gain, self.bn_bias]

## 3. Loss

In [103]:
class MyCrossEntropy():
    def __init__(self) -> None:
        self.qs = 0.0
    
    def __call__(self, logits, Yb):
        self.Yb = Yb
        self.logits = logits
        logits_max = logits.max(dim=1, keepdim=True).values # [BATCH_SIZE, 1]
        batch_size = logits_max.shape[0]
        # subtract max for numerical stability,
        # has no effect on qs bc of normalization in counts,
        # expect logits_max to have zero gradients as well
        norm_logits = logits - logits_max
        #[BATCH_SIZE, NCLASS]

        counts = norm_logits.exp() # [BATCH_SIZE, NCLASS]
        counts_sum = counts.sum(dim=1, keepdim=True) # [BATCH_SIZE, 1]

        # for division use **-1 instead of "/", PyTorch backward pass seems to give real numbers for the later
        counts_sum_inv = counts_sum**-1 # [BATCH_SIZE, 1]

        # q: softmax or model prediction distribution
        # p: true empirical distribution -> p(correct label) = 1 o.w. p=0
        self.qs = counts * counts_sum_inv # [BATCH_SIZE, NCLASS]
        Nlog_qs = -self.qs.log() # [BATCH_SIZE, NCLASS]
        # [BATCH_SIZE, 1]
        # Correct class
        cc_Nlog_qs = Nlog_qs[torch.arange(batch_size), self.Yb]
        # Expected cc_Nlog_qs
        return cc_Nlog_qs.mean()
    
    @torch.no_grad()
    def backward(self):
        # Manual Backprop
        d_logits = self.qs
        batch_size = self.logits.shape[0]
        d_logits[torch.arange(batch_size), self.Yb] -= 1.0
        ''' non-shortcutted gradients:
        gradient of batch mean : replication of 1/BATCH_SIZE
        '''
        # mean in the forward pass -> replication in the backward
        d_logits /= batch_size
        return d_logits


## a simple model with 1 hidden layer

In [104]:
EMB_DIM = 6
HLAYER_SIZE = 200

emb_lkt = torch.randn( NCLASS, EMB_DIM, device= DEVICE)

lin1 = Linear(fan_in=WINDOW_SIZE*EMB_DIM, fan_out=NCLASS, bias=True)
# layers = [lin1]
cross_entropy_loss = MyCrossEntropy()

parameters = [emb_lkt] + lin1.parameters()
# print(lin1.parameters())
for p in parameters:
    p.requires_grad = True
    p.grad = None
sum(p.nelement() for p in parameters)

999

In [105]:

BATCH_SIZE = 32
NSTEPS = 100000

# we don't need log loss anymore bc we don't have
# that hockey stick shape anymore
lossi = []
stepi = []

# just to prevent division by zero, in case the bn_std = 0
DIVISION0 = 1e-5
# momentum for moving average
# the lower the BATCH_SIZE the lower the MOMENTUM!!
MOMENTUM = 0.001
# with torch.no_grad():
for iter in range(NSTEPS):
    
    batch_upd_param_ratio = []
    
    # Forward path

    # cinstruct a mini batch 
    mini_batch_idx = torch.randint(low=0, high=TR_SIZE, size=(BATCH_SIZE,), device= DEVICE)
    Xb = Xtr[mini_batch_idx]
    Yb = Ytr[mini_batch_idx]
    # transform it to embeddings
    batch_emb = emb_lkt[Xb] # BATCH_SIZE, WINDOW_SIZE, emb_dim
    x = batch_emb.view(BATCH_SIZE, WINDOW_SIZE*EMB_DIM)
    
    # Forward pass
    # for layer in layers:
    # x = 

    logits = lin1(x) # * log counts [BATCH_SIZE, NCLASS]

    # keep track of grads for DEBUGGING
    lin1.out.retain_grad() # AFTER DEDBUG: would take out retain_grad

    # print(logits)
    # batch_loss = cross_entropy_loss(logits=logits, Yb=Yb)
    batch_loss = F.cross_entropy(logits, Yb)
    # print(batch_loss)

    lossi.append(batch_loss.log10().item())
    stepi.append(iter)

    # PyTorch backward pass
    for p in parameters:
        p.grad = None

    batch_loss.backward()
    
    with torch.no_grad():
        d_logits = cross_entropy_loss.backward()
        
        d_batch_emb_cat, d_W, d_b = lin1.backward(d_logits)

        d_batch_emb = d_batch_emb_cat.view(-1, WINDOW_SIZE, EMB_DIM )
        d_emb_lkt = torch.zeros_like(emb_lkt, device= DEVICE)
        # the gradient for those characters who were present in the batch
        # accumulate multiple occurrence
        for i,j in itertools.product(range(BATCH_SIZE), range(WINDOW_SIZE)):
            idx = Xb[i,j] # idx: 0-26
            d_emb_lkt[idx] += d_batch_emb[i,j]
        

        grads = [d_emb_lkt, d_W, d_b]#, d_W2, d_b2, d_bn_gain , d_bn_bias]
        
        # update Manually
        lr = 0.1 if iter < 100000 else 0.01
        for p, grad in zip(parameters, grads):
            p.data -= lr * grad

    if (iter+1) % 10000 == 0:
        print(f'iteration: {iter:7d} / {NSTEPS:7d} | mini loss: {batch_loss.item():.4f}')
    
    break # intentionally added, AFTER DEBUG, would take out obviously to run full optimization


AttributeError: 'MyCrossEntropy' object has no attribute 'logits'

In [None]:
@torch.no_grad()
def cmp(var_name, d_t, t):
    # check if exactly equal
    ex = torch.all(d_t == t.grad).item()
    # bc of floating point arithmetic we might get a little bit different result
    app = torch.allclose(input=d_t, other=t.grad, atol=1e-5, rtol=1e-8)
    maxdiff = (d_t - t.grad).abs().max().item()
    print(f'{var_name:15s} | exact: {str(ex):5s} | appoximate: {str(app):5s} | maxx diff : {maxdiff}')

In [None]:
with torch.no_grad():
	print(f'{logits.shape=}')
	print(f'{d_logits.shape=}')
	cmp('logits', d_t=d_logits, t=logits)

In [None]:
with torch.no_grad():
	print(f'{emb_lkt.shape=}')
	print(f'{d_emb_lkt.shape=}')
	cmp('emb_lkt', d_t=d_emb_lkt, t=emb_lkt)

In [53]:
@torch.no_grad()
def evaluate_loss(ds_type, with_batchN = False): # ds_type: dataset type
    X,Y = {
        'train': (Xtr, Ytr),
        'dev': (Xdev, Ydev),
        'test': (Xts, Yts),
    }[ds_type]

    emb = emb_lkt[X] # BATCH_SIZE, WINDOW_SIZE, emb_dim
    x = emb.view(-1, WINDOW_SIZE*EMB_DIM)
    
    for layer in layers:
        if isinstance(layer, BatchNorm1d):
            layer.training = False
    
    # Forward pass
    for layer in layers:
        x = layer(x)
    
    loss = cross_entropy_loss(x, Y)
    print(f'{ds_type} loss: {loss.item()}')

In [54]:
evaluate_loss('train')
evaluate_loss('dev')

train loss: 2.2888076305389404
dev loss: 2.2876508235931396


## Let's Generate some samples like Bigram

In [68]:

g = torch.Generator().manual_seed(2147483647, device= DEVICE)
for _ in range(10):
    idx_y = 0
    name = ''
    context_window = [0]*WINDOW_SIZE
    
    while True:
        # Forward path
        emb = emb_lkt[torch.tensor([context_window], device= DEVICE)] # BATCH_SIZE=1, WINDOW_SIZE, emb_dim
        
        hpreact = emb.view(-1, WINDOW_SIZE*EMB_DIM)@W1 #+ b1
        
        hpreact = bn_bias + bn_gain * (hpreact - bn_mean_ema) / bn_std_ema

        h = torch.tanh(hpreact)

        logits = h @ W2 + b2 # * log counts [BATCH_SIZE, NCLASS]
        probs = F.softmax(logits, dim=1)
        
        idx_y = torch.multinomial(probs.detach(), num_samples=1, replacement=True, generator=g, device= DEVICE).item()
        # shift_to_left context window and append the idx_y
        context_window = context_window[1:] + [idx_y]
        if idx_y == 0:
            break
        ch = i_to_s[idx_y]
        # print(ch)
        name += ch
    print(name)

mina
tyanne
vistyann
jonila
raha
maryelyani
ardan
kamariana
gilline
jaquantreo
