In [1]:
import pip
import subprocess
import sys

# Install `lightning` on first use if it is not already importable.
try:
    __import__("lightning")
except ImportError:
    # pip.main() was removed from pip's public API in pip 10. The supported way
    # to install from a running program is to invoke pip as a subprocess of the
    # interpreter that backs this kernel, so the package lands in the same
    # environment the notebook imports from.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightning"])

import torch 
import torch.nn as nn ## torch.nn gives us nn.Module(), nn.Embedding() and nn.Linear()
import torch.nn.functional as F # This gives us the softmax() and argmax()
from torch.optim import Adam ## We will use the Adam optimizer, which is, essentially, a slightly less stochastic version of stochastic gradient descent.
from torch.utils.data import TensorDataset, DataLoader ## We'll store our data in DataLoaders

import lightning as L ## Lightning makes it easier to write, optimize and scale our code

<h2>Creating the Dataset and Its Labels</h2>

'''  
- Q: "Who is the King of Cricket?" → A: "ViratKohli"
- Q: "Who is the BigShow of Cricket?" → A: "MaxWell"
- Q: "Who is the best test Captain in India?" → A: "ViratKohli"
- Q: "Who is the Universe Boss?" → A: "Chris Gayle"
- Q: "Who is the Alien in the Cricket Field?" → A: "ABdeVilliers"
- Q: "Who is the 360 degree player in the Cricket ?" → A: "ABdeVilliers"

'''

In [2]:
# Vocabulary: every word that appears in the questions and answers, plus the
# end-of-sequence marker. nn.Embedding() takes token ids as its input, so each
# token is mapped to an integer id, namely its position in the list below.

token_to_id = {
    token: token_id
    for token_id, token in enumerate([
        'who',
        'is',
        'the',
        'king',
        'of',
        'cricket',
        'bigshow',
        'best',
        'test',
        'captain',
        'in',
        'india',
        'universe',
        'boss',
        'alien',
        'field',
        '360',
        'degree',
        'player',
        'viratkohli',
        'maxwell',
        'chris',
        'gayle',
        'abdevilliers',
        '<EOS>',  # End of sequence token
    ])
}


In [3]:
# Reverse lookup table: the token id is the key and the token is the value.

id_to_token = {token_id: token for token, token_id in token_to_id.items()}


In [4]:
# Display the id -> token mapping so it can be checked against token_to_id.
id_to_token

{0: 'who',
 1: 'is',
 2: 'the',
 3: 'king',
 4: 'of',
 5: 'cricket',
 6: 'bigshow',
 7: 'best',
 8: 'test',
 9: 'captain',
 10: 'in',
 11: 'india',
 12: 'universe',
 13: 'boss',
 14: 'alien',
 15: 'field',
 16: '360',
 17: 'degree',
 18: 'player',
 19: 'viratkohli',
 20: 'maxwell',
 21: 'chris',
 22: 'gayle',
 23: 'abdevilliers',
 24: '<EOS>'}

In [5]:

# Build the training tensors `inputs` and `labels` from the token ids.
#
# Every example is the token sequence  question + <EOS> + answer.  Its label
# row is that same sequence shifted one position to the left and closed with a
# final <EOS>, so the label at position i is the token that follows input
# position i. Deriving the labels from the inputs (instead of typing both out
# by hand) keeps the two rows aligned.

# Padding value for `labels`. -100 is the default `ignore_index` of
# nn.CrossEntropyLoss, so padded label positions are skipped by the loss.
# (A label of -1 is neither a valid class index nor the ignored index.)
PAD_TOKEN = -100

# Padding value for `inputs`. nn.Embedding can only look up ids that exist in
# the vocabulary, so a negative sentinel cannot be used on the input side.
# <EOS> is reused instead: padding only ever sits after the real tokens and the
# label at every padded position is PAD_TOKEN, so it never enters the loss.
INPUT_PAD_TOKEN = token_to_id["<EOS>"]

# Every row is padded to this length (the longest example, Q6, has 11 tokens).
MAX_SEQ_LEN = 11

# (question tokens, answer tokens) for each training example.
qa_pairs = [
    # Q1: Who is the King of Cricket? -> ViratKohli
    (["who", "is", "the", "king", "of", "cricket"], ["viratkohli"]),
    # Q2: Who is the BigShow of Cricket? -> MaxWell
    (["who", "is", "the", "bigshow", "of", "cricket"], ["maxwell"]),
    # Q3: Who is the best test Captain in India? -> ViratKohli
    (["who", "is", "the", "best", "test", "captain", "in", "india"], ["viratkohli"]),
    # Q4: Who is the Universe Boss? -> Chris Gayle
    (["who", "is", "the", "universe", "boss"], ["chris", "gayle"]),
    # Q5: Who is the Alien in the Cricket Field? -> ABdeVilliers
    (["who", "is", "the", "alien", "in", "the", "cricket", "field"], ["abdevilliers"]),
    # Q6: Who is the 360 degree player in the Cricket? -> ABdeVilliers
    (["who", "is", "the", "360", "degree", "player", "in", "the", "cricket"], ["abdevilliers"]),
]


def encode_example(question, answer, max_len=MAX_SEQ_LEN):
    """Turn one question/answer pair into padded input ids and label ids.

    Args:
        question: list of vocabulary tokens forming the question.
        answer: list of vocabulary tokens forming the answer.
        max_len: length both returned rows are padded to.

    Returns:
        (input_ids, label_ids): two lists of ``max_len`` ints.

    Raises:
        KeyError: if a token is not in ``token_to_id``.
        ValueError: if the encoded example is longer than ``max_len``.
    """
    eos_id = token_to_id["<EOS>"]
    input_ids = [token_to_id[token] for token in question] + [eos_id]
    input_ids += [token_to_id[token] for token in answer]

    if len(input_ids) > max_len:
        raise ValueError(
            f"example has {len(input_ids)} tokens but max_len is {max_len}"
        )

    # Next-token targets: drop the first input token, append the closing <EOS>.
    label_ids = input_ids[1:] + [eos_id]

    pad_count = max_len - len(input_ids)
    return (
        input_ids + [INPUT_PAD_TOKEN] * pad_count,
        label_ids + [PAD_TOKEN] * pad_count,
    )


encoded_examples = [encode_example(question, answer) for question, answer in qa_pairs]

inputs = torch.tensor([input_ids for input_ids, _ in encoded_examples])
labels = torch.tensor([label_ids for _, label_ids in encoded_examples])


In [6]:
# Pair each input row with its label row, then wrap the pairs in a DataLoader.
# The DataLoader defaults are written out: one example per batch, in order.
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

<h2>Positional Encoding</h2>

In [7]:
''' Positional Encoding to keep track of the order of the tokens/words

We use sine and cosine functions for positional encoding, where the number of dimensions matches our embedding dimension.
For each position, we generate a unique pattern using these trigonometric functions.

For a d_model dimensional embedding vector at position pos:
- Even indices (2i): PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
- Odd indices (2i+1): PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

This creates a unique encoding for each position that:
1. Is deterministic and requires no training
2. Can handle variable sequence lengths
3. Has consistent relative distances between positions
4. Allows the model to easily attend to relative positions'''

'''nn.Module in PyTorch is a powerful base class that provides the fundamental building blocks for creating neural networks.It also provides Built-in Parameter management'''

'nn.Module in PyTorch is a powerful base class that provides the fundamental building blocks for creating neural networks.It also provides Built-in Parameter management'

In [8]:
# Positions 0..10 as floats, laid out as a single row: shape (1, 11).
position = torch.arange(11, dtype=torch.float32).reshape(1, -1)
position

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]])

In [9]:
class Positional_Encoding(nn.Module):
    """Adds fixed sine/cosine position information to word embeddings.

    For a position ``pos`` and a column pair ``(2i, 2i+1)`` of the embedding:

        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    The table is computed once in ``__init__`` and is not trained.
    """

    def __init__(self, d_model = 2 , max_len=11):
        '''
        Args:
            d_model: number of values used to represent each token; it must
                match the embedding dimension of the word embeddings that are
                passed to forward(). With the default of 2, each row of the
                table is [sin(pos), cos(pos)].
            max_len: number of positions to precompute, i.e. the longest
                sequence that forward() can encode. The longest sequence in
                this notebook's dataset has 11 tokens.
        '''
        super().__init__() # Sets up nn.Module's bookkeeping (parameters, buffers, submodules)

        # One row per position, one column per embedding dimension.
        pe = torch.zeros(max_len, d_model)

        # Column vector of positions, shape (max_len, 1), so that multiplying
        # by the row vector div_term broadcasts to (max_len, number of pairs).
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)

        # The even column indices 0, 2, 4, ... (the "2i" in the formula).
        # For d_model = 2 this is just [0].
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()

        div_term = 1/torch.tensor(10000.0)**(embedding_index / d_model)

        # Even columns hold the sine terms, odd columns the cosine terms.
        pe[: , 0::2] = torch.sin(position * div_term)
        # When d_model is odd there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies get a cosine term.
        pe[: , 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Store the table on the module so forward() can read it as self.pe.
        # A buffer (rather than a plain attribute or a parameter) follows the
        # module across devices and is not updated by the optimizer.
        self.register_buffer('pe', pe)

    def forward(self,word_embeddings):
        '''
        Add the positional encoding to the word embeddings.

        Args:
            word_embeddings: tensor whose first dimension is the sequence
                length and whose last dimension is d_model.

        Returns:
            word_embeddings plus the first ``word_embeddings.size(0)`` rows of
            the positional-encoding table.
        '''
        return word_embeddings + self.pe[:word_embeddings.size(0), : ]


<h2> Attention </h2>

In [10]:

''' We are going to create the self attention and the masked self attention score'''
class Attention(nn.Module):
    """Single-head scaled dot-product attention with an optional mask.

    Computes ``softmax(Q K^T / sqrt(d_model)) V`` where Q, K and V are linear
    projections (without bias) of the encodings passed to forward().
    """

    def __init__(self,d_model = 2):
        '''
        Args:
            d_model: size of the last dimension of the incoming encodings and
                of the query, key and value vectors produced from them.
        '''
        super().__init__()

        # Weight matrices that turn encodings into queries, keys and values.
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        # Rows (tokens) and columns (features) are addressed from the end, so
        # the same code handles a single (seq_len, d_model) sequence and a
        # batched (..., seq_len, d_model) input. For a 2-D input these are the
        # same axes as 0 and 1.
        self.row_dim = -2
        self.col_dim = -1

    def forward(self,encoding_for_q,encoding_for_k,encoding_for_v,mask=None):
        '''
        Args:
            encoding_for_q: encodings used to build the queries,
                shape (..., q_len, d_model).
            encoding_for_k: encodings used to build the keys,
                shape (..., k_len, d_model).
            encoding_for_v: encodings used to build the values,
                shape (..., k_len, d_model).
            mask: optional boolean tensor broadcastable to (..., q_len, k_len);
                True marks the positions a query must NOT attend to.

        Returns:
            Tensor of shape (..., q_len, d_model): for every query, the
            attention-weighted sum of the values.
        '''
        q = self.W_q(encoding_for_q)
        k = self.W_k(encoding_for_k)
        v = self.W_v(encoding_for_v)

        # Similarity of every query with every key.
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))
        # Scale by the square root of the key dimension.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        if mask is not None:
            # A large negative score becomes a weight of ~0 after the softmax.
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        # Each row of weights sums to 1 across the keys.
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)
        # Weighted sum of the values for each query.
        attention_scores = torch.matmul(attention_percents, v)

        return attention_scores

<h2>Decoder Only Transformer</h2>

In [None]:
class DecoderOnlyTransformer(L.LightningModule):
    """Small decoder-only transformer: embedding, positional encoding,
    multi-head masked self-attention with a residual connection, and a linear
    layer that produces one score per vocabulary token for every position.
    """

    def __init__(self, num_tokens, d_model=2, max_len=11, num_heads=3):
        """
        Args:
            num_tokens: vocabulary size (rows of the embedding table and
                number of output scores per position).
            d_model: number of values used to represent each token.
            max_len: longest sequence the positional encoding supports.
            num_heads: number of attention heads; each head works on the full
                d_model-dimensional encoding.
        """
        super().__init__()

        # Seeds the random number generators so the weights created below are
        # the same on every run. Note that this is a global side effect.
        L.seed_everything(seed=42)

        self.num_heads = num_heads
        self.d_model = d_model

        self.we = nn.Embedding(num_embeddings=num_tokens,
                             embedding_dim=d_model)

        self.pe = Positional_Encoding(d_model=d_model,
                                 max_len=max_len)

        # Create multiple attention heads
        self.attention_heads = nn.ModuleList([
            Attention(d_model=d_model) for _ in range(num_heads)
        ])

        # Add layer to reduce concatenated attention outputs back to d_model dimension
        self.reduce_attention_dim = nn.Linear(in_features=(num_heads*d_model),
                                            out_features=d_model)

        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
        # Label positions equal to self.loss.ignore_index (-100 by default) are
        # left out of the loss; training_step() relies on this for padding.
        self.loss = nn.CrossEntropyLoss()

    def forward(self, token_ids):
        """
        Args:
            token_ids: 1-D tensor of token ids for a single sequence; every id
                must be a valid index into the embedding table.

        Returns:
            Tensor of shape (sequence length, num_tokens) holding the
            unnormalised score of every vocabulary token at every position.
        """
        word_embeddings = self.we(token_ids)
        position_encoded = self.pe(word_embeddings)

        # Causal mask: True above the diagonal, i.e. at the positions that come
        # after the current token and therefore must not be attended to.
        seq_len = token_ids.size(dim=0)
        mask = torch.tril(torch.ones((seq_len, seq_len),
                                   device=self.device))
        mask = mask == 0

        # Calculate attention for each head
        attention_outputs = []
        for attention_head in self.attention_heads:
            attention_output = attention_head(position_encoded,
                                           position_encoded,
                                           position_encoded,
                                           mask=mask)
            attention_outputs.append(attention_output)

        # Concatenate all attention outputs along the feature dimension
        all_attention_values = torch.cat(attention_outputs, dim=-1)

        # Reduce dimension back to d_model
        final_attention_values = self.reduce_attention_dim(all_attention_values)

        # Add residual connection
        residual_connection_values = position_encoded + final_attention_values

        fc_layer_output = self.fc_layer(residual_connection_values)

        return fc_layer_output

    def configure_optimizers(self):
        """Return the optimizer Lightning uses to update the weights."""
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch.

        Args:
            batch: ``(input_tokens, labels)``; both are 2-D tensors with one
                row per sequence. Negative ids mark padding.
            batch_idx: index of the batch (unused).

        Returns:
            The cross-entropy loss averaged over the sequences in the batch.
        """
        input_tokens, labels = batch
        ignore_index = self.loss.ignore_index

        # forward() handles one sequence at a time, so every sequence in the
        # batch is scored separately instead of using only the first one.
        sequence_losses = []
        for token_ids, targets in zip(input_tokens, labels):
            # nn.Embedding cannot look up a negative padding id. Padding sits
            # after the real tokens and the causal mask hides later positions
            # from earlier ones, so substituting id 0 there does not change
            # the scores at the real positions.
            token_ids = token_ids.clamp(min=0)
            # Any negative label is padding; map it to the value the loss
            # ignores so padded positions do not contribute.
            targets = targets.masked_fill(targets < 0, ignore_index)

            output = self.forward(token_ids)
            sequence_losses.append(self.loss(output, targets))

        # With one sequence per batch this is exactly that sequence's loss.
        loss = torch.stack(sequence_losses).mean()
        return loss
