# Introduction
----
- this is a practise exercise to learn the workings of modern transformers used in LLMs
- I followed the Karpathy's lectures on YouTube and made some changes as needed (for better efficiency)

In [10]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(42)

<torch._C.Generator at 0x79216d384450>

In [2]:
from dataclasses import dataclass

@dataclass
class config:
    block_size = 8 ##context window
    batch_size = 4 

# Reading the input text

In [3]:
file_path = r'/kaggle/input/tiny-shakespeare-karpathys-repo/tiny_shakespeare.txt'
with open(file_path,'r') as f:
    text = f.read()
print(text[:100])
print('='*10)
print(f'Length of the dataset (all the characters):{len(text)}')

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
Length of the dataset (all the characters):1115394


## Creating a vocabulary from the text
- here karpathy used the set() constructor, which is the most efficient approach 

In [4]:
characters = sorted(set(text)) ##takes out all the unique characters and sorts it, returning a list
vocabulary_size = len(characters)

print(f'Size of vocabulary: {vocabulary_size}')
vocab = ''.join(characters)
print('Vocabulary:',vocab)

Size of vocabulary: 65
Vocabulary: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


## Tokenizing the text
- we are building a character level model, for which we'll first start with a lookup table for charcters -> index and inverse lookkup table for index -> characters
- 

In [36]:
ch2idx = {ch:idx for idx,ch in enumerate(characters)}
idx2ch = {idx:ch for idx,ch in enumerate(characters)}

def encode(text:str):
    return [ch2idx[ch] for ch in text]

def decode(ids:list):
    return ''.join([idx2ch[idx] for idx in ids])

example = 'My name is Harikesh'
print(
    f'Input: {example}',
    '\n',
    f'Len of Input: {len(example)}'
    '\n'
    f'Output(encoded): {encode(example)}'
    '\n'
    f'Length of Encoded output: {len(encode(example))}'
)

Input: My name is Harikesh 
 Len of Input: 19
Output(encoded): [25, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 20, 39, 56, 47, 49, 43, 57, 46]
Length of Encoded output: 19


In [6]:
##creating the encoded representation of all the data in a tensor
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape)

torch.Size([1115394])


## Splitting the data in train and val
- here we can't use a random split, because the text follows a semantic order which has to be preserved if we want our model to learn to generate text like Shakespeare
- we'll use the first 90% data for training and rest 10% for validation

In [7]:
n = int(0.9*len(data))

train_data = data[:n]
val_data = data[n:]

### Training Philosophy
- any chunk of data with size 'd' that is sampled from the train data has 'd' number of training examples for the model
- starting from the first character, we train the model to predict second then with the first two, the third and so on till the 'd-1'th element which is used to predict the d'th character
- here the choice to take context size as 'd' is freely available to the user, with higher values of d requiring more compute

In [8]:
##example
X = train_data[:config.block_size+1]
y = train_data[1:config.block_size+1]

for t in range(config.block_size):
    context = X[:1+t]
    label = y[t]
    print(f'for context: {context} -> {label}')

for context: tensor([18]) -> 47
for context: tensor([18, 47]) -> 56
for context: tensor([18, 47, 56]) -> 57
for context: tensor([18, 47, 56, 57]) -> 58
for context: tensor([18, 47, 56, 57, 58]) -> 1
for context: tensor([18, 47, 56, 57, 58,  1]) -> 15
for context: tensor([18, 47, 56, 57, 58,  1, 15]) -> 47
for context: tensor([18, 47, 56, 57, 58,  1, 15, 47]) -> 58


### A note on generating the batch
- karpathy's method randomly samples (batch_size,block_size) sample vectors from the input text
- this stocasticity means that we may or may not use the whole dataset in training, while it maintains the IID assumptions and will be very beneficials for preventing overfitting there are other better ways to do it (in production)

In [13]:
def get_batch(data,context_window=config.block_size,batch_size=config.batch_size):
    idx = torch.randint(len(data)-context_window,(batch_size,)) 
    X = torch.stack([data[i:i+context_window] for i in idx])
    y = torch.stack([data[i+1:i+context_window+1] for i in idx])
    return X,y

xb,yb = get_batch(train_data)

# BiGram Language Model
- this model effictively uses the statistical approach of calculating all the bigrams from the input text and arranging them in a table where rows and columns are the first and second character of the bigrams and the intersecting cell is the count of how many times the column character follows the row charater in the text
- in PyTorch we can implement this using the nn.Embedding() layer which createst a table of (vocab_size,vocab_size) assigning random numbers in the cells and upon training we hope that the values in these cells approach the count (normalized over the rows) in the statistical table approach

In [58]:
# %%writefile bigram.py
class Bigram(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.lookup_table = nn.Embedding(vocab_size,vocab_size)
    def forward(self,idx,labels=None):
        logits = self.lookup_table(idx) #(B,T,C) -> (4,8,65)

        if labels is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            labels = labels.view(B*T)
            loss = F.cross_entropy(logits,labels)
        return logits,loss
        
    def generate(self,idx,max_length):
        ##idx is (B,T) for batch and timesteps of the input context
        for _ in range(max_length-1):
            logits,loss = self(idx[:,-1])
            # logits = logits[:,:]
            probs = F.softmax(logits,dim=-1)
            idx_next = torch.multinomial(probs,num_samples=1)
            idx = torch.cat([idx,idx_next],dim=1)
        return idx

## Testing Untrained Bigram Model

In [59]:
##testing untrained bigram model
bigram_model = Bigram(vocab_size=vocabulary_size)
logits,loss = bigram_model(xb,yb)

input_context = torch.zeros((1,1),dtype=torch.long)
outputs = bigram_model.generate(input_context,100).squeeze(0).tolist()
print(len(outputs))
print(decode(outputs))

100

KrzUqdMcrAbXtaNe ?FVnPjSN!'EFXrzooiHkWWhSxlHNsqP
M33;rD?m;DYde,zj,OO.uILDPu$ctOrf,FXHCH I-dcxoMQD:s


## Training the BiGram Model

In [61]:
optimizer = torch.optim.AdamW(bigram_model.parameters(),lr=1e-3)

batch_size=8

for steps in range(10000):
    optimizer.step()
    xb,yb = get_batch(train_data)

    logits,loss = bigram_model(xb,yb)

    loss.backward()
    optimizer.step()

    if steps%1000 == 0:
        print(loss.item())

input_context = torch.zeros((1,1),dtype=torch.long)
outputs = bigram_model.generate(input_context,100).squeeze(0).tolist()
# print(len(outputs))
print(decode(outputs))

2.708622932434082
2.6894116401672363
2.176974058151245
2.289198160171509
4.675192832946777
2.432370662689209
2.631892442703247
2.386826753616333
2.491130828857422
2.5769729614257812

G BENTCLOMNCI linn, onofeviapearb prstrde cooned insue'me wak hrs dotor LAno merareathabed hepe ar 


# Understanding Attention Mechanism -  the **matrix multiplication trick**
- before writing any self-attention block we need to understand what attention does
- here we used a toy example to understand what attention does in practice

In [62]:
## a toy vector which represents the actual tensors in a language model 
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

- currently the T tokens in each batch are not communicating with each other, in the BiGram model only the current token was taken to look forward in the lookup table based on the probability
- but we need them to interact in a certain way, to establish this interaction we begin with the simplest form of interaction
- **Averaging all the tokens in channels till 't' where t <= T for each batch**

## 1. Using the naive loop

In [65]:
##code to do this operation also called BOW(bag of words)
xbow = torch.zeros((B,T,C))
for batch_idx in range(B):
    for time_step in range(T):
        xprev = x[batch_idx,:time_step+1]
        xbow[batch_idx,time_step] = torch.mean(xprev,0)

x[0],xbow[0]

(tensor([[ 0.6386,  1.3422],
         [ 2.2915,  0.4167],
         [ 0.7666,  0.4100],
         [ 0.0103, -0.8816],
         [ 1.3301, -1.0094],
         [-0.3699, -1.1534],
         [-0.3768,  0.2928],
         [ 0.1047, -0.6998]]),
 tensor([[ 0.6386,  1.3422],
         [ 1.4651,  0.8794],
         [ 1.2322,  0.7229],
         [ 0.9267,  0.3218],
         [ 1.0074,  0.0556],
         [ 0.7779, -0.1459],
         [ 0.6129, -0.0832],
         [ 0.5494, -0.1603]]))

## 2. Using the matrix multiplication trick with **tril** or **triu**

In [73]:
mask = torch.tril(torch.ones(3,3))
mask = mask/torch.sum(mask,1,keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = mask@b

print(f'mask=\n{mask}')
print('='*5)
print(f'a:\n{b}')
print('='*5)
print(f'mask @ a:\n{c}')

mask=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
=====
a:
tensor([[8., 2.],
        [0., 0.],
        [6., 3.]])
=====
mask @ a:
tensor([[8.0000, 2.0000],
        [4.0000, 1.0000],
        [4.6667, 1.6667]])


### Using this trick to create a similar matrix 
- here the mask (T,T) when matirx multiplied to x (B,T,C) by the rules of brodcasting mask is brodcasted to (B,T,T)
- and like the previous cell all the matrices in channels are averaged 

In [76]:
mask = torch.tril(torch.ones(T,T))
mask = mask/torch.sum(mask,1,keepdim=True)

c = mask@x
xbow[0],c[0]

(tensor([[ 0.6386,  1.3422],
         [ 1.4651,  0.8794],
         [ 1.2322,  0.7229],
         [ 0.9267,  0.3218],
         [ 1.0074,  0.0556],
         [ 0.7779, -0.1459],
         [ 0.6129, -0.0832],
         [ 0.5494, -0.1603]]),
 tensor([[ 0.6386,  1.3422],
         [ 1.4651,  0.8794],
         [ 1.2322,  0.7229],
         [ 0.9267,  0.3218],
         [ 1.0074,  0.0556],
         [ 0.7779, -0.1459],
         [ 0.6129, -0.0832],
         [ 0.5494, -0.1603]]))

## 3. Using the softmax and mask filling approach to make learnable parameters
- here we will make the 'wei' matrix change based on the data
- implementing that change in next step

In [83]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0,float('-inf'))

wei = F.softmax(wei,dim=1)

xbow3 = wei@x

torch.allclose(xbow,xbow3)

True

## 4. Defining how to create the wei matrix from data dependent parameters

In [85]:
head_size = 16
key = nn.Linear(C,head_size,bias=False)
query = nn.Linear(C,head_size,bias=False)
value = nn.Linear()

##defining wei matrix 

k = key(x) ## (B,T,C)*(C,h) -> (B,T,C)*(B,C,h) => (B,T,h)
q = query(x) ## (B,T,h)

wei = q@k.transpose(-2,-1) ## (B,T,h)*(B,T,h) -> (B,T,h)*(B,h,T) => (B,T,T)

wei = wei.masked_fill(tril==0,float('-inf'))
wei = F.softmax(wei,dim=1)

xbow_self_attention = wei@x
xbow_self_attention.shape

torch.Size([4, 8, 2])