In [1]:
# # We always start with a dataset to train on. Let's download the tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

## Imports

In [2]:
import math
import itertools
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set_theme(style="dark")
%matplotlib inline
''' %matplotlib inline sets the backend of matplotlib to
the 'inline' backend. When using the 'inline' backend,
your matplotlib graphs will be included in your notebook,
next to the code.'''

# # for creating a responsive plot
# %matplotlib ipympl
# %matplotlib widget

import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs top-to-bottom on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' # 'cuda' , 'cpu'
if DEVICE == 'cuda':
    # Make newly created tensors default to float32 CUDA tensors.
    # NOTE(review): recent PyTorch releases deprecate this call in favour of
    # torch.set_default_device / torch.set_default_dtype - confirm before upgrading.
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

## Loading Data & Tokenizer

In [3]:
# Read the whole corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [4]:
# Corpus size in characters, followed by its first 100 characters.
print(len(text), text[:100], sep='\n')

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
NCLASS = len(chars)  # vocabulary size
print(''.join(chars))
NCLASS


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


65

In [6]:
# Character-level tokenizer: every character maps to its position in `chars`.
ch_to_i = dict((ch, i) for i, ch in enumerate(chars))
i_to_ch = dict((i, ch) for ch, i in ch_to_i.items())


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [ch_to_i[ch] for ch in s]


def decoder(si):
    """Return the string spelled by the iterable of integer ids `si`."""
    return ''.join(i_to_ch[i] for i in si)


# Round trip: decoding an encoded string should give the original back.
print(encode('hi there!'))
print(decoder(encode('hi there!')))

[46, 47, 1, 58, 46, 43, 56, 43, 2]
hi there!


1. Google uses SentencePiece for tokenization.

SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing.

https://github.com/google/sentencepiece

2. tiktoken is a fast BPE tokeniser for use with OpenAI's models.

https://github.com/openai/tiktoken

In [7]:
# Encode the entire corpus once and keep it as a single 1-D tensor of
# int64 token ids placed on DEVICE.
enc_data = torch.tensor(encode(text), dtype=torch.long, device=DEVICE)
print(enc_data.shape, enc_data.dtype)
# Peek at the first 100 token ids.
print(enc_data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59], device='cuda:0')


## Heatmap and Embedding Visualization

In [8]:
import itertools
def plot_heatmap(tensor, text=True, nrow=None, ncol=None, fig_size=(10,10)):
    """Draw a 2-D tensor as a blue heatmap, optionally writing each cell's value.

    Args:
        tensor: 2-D torch tensor; it may live on any device and may require grad.
        text: when True, write every value (two decimals) at the centre of its cell.
        nrow: number of leading rows to annotate; defaults to all rows.
        ncol: number of leading columns to annotate; defaults to all columns.
        fig_size: matplotlib figure size in inches, (width, height).
    """
    # Each bound falls back to the tensor's own size independently, so a caller
    # may pass only one of them.
    if nrow is None:
        nrow = tensor.shape[0]
    if ncol is None:
        ncol = tensor.shape[1]
    # matplotlib needs host memory: detach from autograd and copy to the CPU
    # once (Tensor.numpy() raises for CUDA tensors).
    values = tensor.detach().cpu().numpy()
    plt.figure(figsize=fig_size)
    plt.imshow(values, cmap='Blues')
    # manually write text on each cell (seaborn annot doesn't look good)
    if text:
        for i, j in itertools.product(range(nrow), range(ncol)):
            # x is the column and y is the row; imshow puts the origin at the top-left corner
            plt.text(x=j, y=i, s=f'{values[i, j].item():.2f}', ha='center', va='center', color='grey')
    plt.axis('off')

### 2D & 3D Embedding Visualization

In [9]:
def plot_2d_emb(emb_lkt, nclass, figsize=(8,8)):
    """Scatter the first two embedding dimensions and label each point with its character.

    Args:
        emb_lkt: 2-D tensor with at least `nclass` rows and at least 2 columns;
            it may live on any device and may require grad.
        nclass: number of leading rows to label; row i is labelled with i_to_ch[i].
        figsize: matplotlib figure size in inches, (width, height).
    """
    # matplotlib needs host data: detach from autograd and copy to the CPU once.
    points = emb_lkt.detach().cpu().numpy()
    plt.figure(figsize=figsize)
    plt.scatter(x=points[:, 0], y=points[:, 1], s=200)
    for i in range(nclass):
        plt.text(x=points[i, 0].item(), y=points[i, 1].item(), s=i_to_ch[i], ha='center', va='center', color='white')
    # The first positional parameter of plt.grid is the visibility flag (not
    # `which`), so pass an explicit boolean to switch the grid on.
    plt.grid(True)

# def plot_3d_emb(emb_lkt, nclass, figsize=(8,8)):
#     tensor = emb_lkt.data.detach().numpy()
#     fig = plt.figure(figsize=figsize)
#     ax = Axes3D(fig)
#     ax.scatter(xs= tensor[:,0], ys=tensor[:,1], zs=tensor[:,2], s=200)
#     for i in range(nclass):
#         ax.text(x=tensor[i,0], y=tensor[i,1],z=tensor[i,2], s=i_to_s[i], ha='center', va='center', color='white')
#     # displaying the plot
#     plt.grid('minor')
#     plt.show()

## Splitting dataset, prepare Context window

1. Split ratio: 90% training, 10% validation.

2. The dev (validation) set is used for hyperparameter tuning.

In [10]:
# Train on the first 90% of the token stream; hold out the final 10% for validation.
n90 = int(len(enc_data) * 0.9)
train_data, val_data = enc_data[:n90], enc_data[n90:]

We never feed the whole text into the Transformer at once; that would be prohibitively expensive computationally.

Instead, we work with chunks of text sampled from the dataset. Each chunk is called a context, and its size is the context length.



In [11]:
# Context length: how many tokens one training window contains.
CONTEXT_L = 8
# Take one extra token so that the last position of the window also has a next-token target.
train_data[:CONTEXT_L+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58], device='cuda:0')

In [12]:
# Every prefix of a window is a training example: given the first
# 1..CONTEXT_L characters, the model should predict the character that follows.

x = train_data[:CONTEXT_L]
y = train_data[1:CONTEXT_L+1]  # the same window shifted one token to the right
for t, target in enumerate(y):
    context = x[:t+1]  # the slice end is exclusive, so +1 keeps position t in the context
    print(f'{context.tolist()} -> {target}')

[18] -> 47
[18, 47] -> 56
[18, 47, 56] -> 57
[18, 47, 56, 57] -> 58
[18, 47, 56, 57, 58] -> 1
[18, 47, 56, 57, 58, 1] -> 15
[18, 47, 56, 57, 58, 1, 15] -> 47
[18, 47, 56, 57, 58, 1, 15, 47] -> 58


## creating both Batch dimension and time (or context) dimensions

Each batch has two dimensions: the batch dimension (independent sequences) and the time dimension (the position inside the context window).

In [13]:
from helper_reproduciblility import set_all_seeds, set_deterministic

set_all_seeds(seed=1337)
set_deterministic()

In [14]:
BATCH_SIZE = 4 # how many independent sequence we process in parallel
CONTEXT_L = 8 # the max context length for input and output

def get_batch(stage:str):
    """Sample BATCH_SIZE random context windows and their next-token targets.

    Args:
        stage: 'train' samples from train_data; any other value samples from val_data.

    Returns:
        (x, y): two stacked tensors of shape (BATCH_SIZE, CONTEXT_L), where y is
        the window of x shifted one position to the right in the data.
    """
    if stage == 'train':
        data = train_data
    else:
        data = val_data
    # Draw BATCH_SIZE random window starts from across the whole split; this is
    # useful because the tone and style of the text may change from start to end.
    # The exclusive upper bound leaves room for the one extra target token.
    cw_starts = torch.randint(low=0, high=len(data)-CONTEXT_L, size=(BATCH_SIZE,))
    inputs, targets = [], []
    for cw_start in cw_starts:
        inputs.append(data[cw_start : cw_start+CONTEXT_L])
        # the target window is the input window shifted by one token
        targets.append(data[cw_start+1 : cw_start+CONTEXT_L+1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one batch of independent context windows and show inputs and targets.
xb, yb = get_batch('train')
print(f'{xb=}')
print(f'{yb=}')

# Every row of the batch expands into CONTEXT_L training examples:
# one growing prefix per position, each paired with the token that follows it.
for b in range(BATCH_SIZE):
    for t in range(CONTEXT_L):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'if context={context.tolist()} -> target={target}')


'''
we should see 4 rows (independent batch dimension)
and 8 cols (Context window length)

then we take each row and create multiple sequence
with max size of CONTEXT_L
'''


xb=tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')
yb=tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
if context=[24] -> target=43
if context=[24, 43] -> target=58
if context=[24, 43, 58] -> target=5
if context=[24, 43, 58, 5] -> target=57
if context=[24, 43, 58, 5, 57] -> target=1
if context=[24, 43, 58, 5, 57, 1] -> target=46
if context=[24, 43, 58, 5, 57, 1, 46] -> target=43
if context=[24, 43, 58, 5, 57, 1, 46, 43] -> target=39
if context=[44] -> target=53
if context=[44, 53] -> target=56
if context=[44, 53, 56] -> target=1
if context=[44, 53, 56, 1] -> target=58
if context=[44, 53, 56, 1, 58] -> target=46
if context=[44, 53, 56, 1, 58, 46] -> target=39
if context=[44, 53, 56, 1, 58, 46, 39] -> target=58
if con

'\nwe should see 4 rows (independent batch dimension)\nand 8 cols (Context window length)\n\nthen we take each row and create multiple sequence\nwith max size of CONTEXT_L\n'

## create Bigram as baseline! :)


In [15]:
class Bigram(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is a single (NCLASS, NCLASS) embedding table created on
    DEVICE; row i holds the unnormalised scores of every token that may follow
    token i.
    """

    def __init__(self) -> None:
        super().__init__()
        # a look up table :
        # row : each token - col : logits of the next token (each token directly reads off the logit of the next token)
        self.token_embedding_lkt = nn.Embedding(num_embeddings=NCLASS, embedding_dim=NCLASS, device=DEVICE)

    def forward(self, idx, target=None):
        """Look up next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: long tensor of token ids, (batch_size, context_length).
            target: long tensor of next-token ids with the same shape as `idx`,
                or None to skip the loss (e.g. at inference time).

        Returns:
            (logits, loss): logits is (batch_size, context_length, n_class);
            loss is a scalar tensor, or None when `target` is None.
        """
        logits = self.token_embedding_lkt(idx) # (batch_size, context_length, n_class)
        if target is None:
            return logits, None
        # cross_entropy expects (N, n_class) scores against (N,) class ids, so
        # flatten batch and time; the class count is read from the logits themselves.
        n_class = logits.size(-1)
        loss = F.cross_entropy(input=logits.reshape(-1, n_class), target=target.reshape(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` token ids, each conditioned only on the previous one.

        Args:
            idx: long tensor holding the single starting token id (scalar or shape (1,)).
            max_new_tokens: number of tokens to sample.

        Returns:
            list[int]: the sampled ids, in order; the starting token is not included.
        """
        int_seq = []
        for _ in range(max_new_tokens):
            logits = self.token_embedding_lkt(idx) # table row of the current token
            probs = F.softmax(input=logits, dim=-1).view(-1)  # (n_class)
            # the sampled id becomes the current token of the next step
            idx = torch.multinomial(input=probs, num_samples=1, replacement=True)
            int_seq.append(idx.item())
        return int_seq

In [16]:
# Sanity check on the untrained model: run one forward pass on the sample batch
# and print the logits shape together with the initial cross-entropy loss.
model = Bigram()
logit , loss = model(xb, yb)
print(logit.shape, loss)

torch.Size([4, 8, 65]) tensor(4.8573, device='cuda:0', grad_fn=<NllLossBackward0>)


In [17]:
# Sample 10 tokens from the (still untrained) model, starting from token id 0,
# and decode them back to characters.
int_seq = model.generate(idx=torch.zeros(1, dtype=torch.long, device=DEVICE), max_new_tokens=10)
decoder(int_seq)

'lcRGUBgKwR'

## W&B

In [18]:
import wandb

# Training hyperparameters. get_batch reads BATCH_SIZE and CONTEXT_L when it is
# called, so rebinding them here changes the batches it produces from now on.
BATCH_SIZE = 32 # how many independent sequence we process in parallel
CONTEXT_L = 8 # the max context length for input and output

LR = 0.01
# NOTE: the training loop draws one random batch per iteration, so NEPOCH counts
# optimisation steps rather than full passes over the dataset.
NEPOCH = 1000

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="TinyGPT",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": LR,
    "architecture": "BigramEmbedding",
    "dataset": "Shakespeare",
    "epochs": NEPOCH,
    }
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mopen_ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Optimizer Object

In [19]:
# AdamW over all model parameters (here: the single embedding table), with learning rate LR.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

In [22]:
# Each iteration is one optimisation step on a freshly sampled random batch.
for _ in range(NEPOCH):
    xb, yb = get_batch(stage='train')

    logits, loss = model(xb, yb)
    # Log a plain Python float: .item() copies the scalar to the host, so the
    # logger never receives a device tensor that still references the autograd graph.
    wandb.log({"loss": loss.item()})
    # set_to_none=True resets gradients to None instead of filling them with zeros
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Loss of the final batch only (a noisy single-batch value, not an average).
print(loss)

tensor(2.4533, device='cuda:0', grad_fn=<NllLossBackward0>)


In [52]:
# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…