<a href="https://colab.research.google.com/github/Moiz269/Encoder-Transformer-Model/blob/main/CompleteTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ATTENTION

from math import sqrt
import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding is split into ``heads`` chunks of ``embed_dim // heads``
    features. A single (head x head) projection per role (query, key, value)
    is applied to every chunk, and the concatenated head outputs go through a
    final linear layer.

    Args:
        embed_dim: size of the token embedding (512 by default).
        heads: number of attention heads (8 by default); must divide
            ``embed_dim`` exactly.

    Raises:
        ValueError: if ``heads`` is not positive or does not divide
            ``embed_dim``.
    """

    def __init__(self, embed_dim=512, heads=8):

        super(MultiHeadAttention, self).__init__()

        # Fail early: a non-divisible embedding would otherwise be silently
        # truncated here and only blow up later inside ``reshape``.
        if heads <= 0 or embed_dim % heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by heads ({heads})"
            )

        self.embed_dim = embed_dim  # 512 by default
        self.heads = heads  # 8 heads by default
        self.head = embed_dim // heads  # 512 / 8 = 64 by default

        # query, value, key projections: (64x64) with the defaults
        self.query = nn.Linear(self.head, self.head, bias=False)  # the Query matrix
        self.value = nn.Linear(self.head, self.head, bias=False)  # the Value matrix
        self.key = nn.Linear(self.head, self.head, bias=False)  # the Key matrix

        # fully connected output layer: (8*64)x512, i.e. 512x512
        self.fc_out = nn.Linear(self.head * self.heads, embed_dim)

    def forward(self, key, query, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Note the argument order: ``key`` comes first, then ``query``.

        Args:
            key: tensor of shape (batch, k_len, embed_dim).
            query: tensor of shape (batch, q_len, embed_dim).
            value: tensor of shape (batch, v_len, embed_dim); ``v_len`` must
                equal ``k_len`` for the weighted sum below.
            mask: optional tensor broadcastable to
                (batch, heads, q_len, k_len); positions where ``mask == 0``
                are blocked.

        Returns:
            Tensor of shape (batch, q_len, embed_dim).
        """
        batch_size = key.size(0)
        k_len, q_len, v_len = key.size(1), query.size(1), value.size(1)

        # (batch, seq_len, embed_dim) -> (batch, seq_len, heads, head)
        # example: (32x10x512) -> (32x10x8x64)
        key = key.reshape(batch_size, k_len, self.heads, self.head)
        query = query.reshape(batch_size, q_len, self.heads, self.head)
        value = value.reshape(batch_size, v_len, self.heads, self.head)

        key = self.key(key)  # (32x10x8x64)
        query = self.query(query)  # (32x10x8x64)
        value = self.value(value)  # (32x10x8x64)

        ############### query x key ###############

        # query: (batch, q_len, heads, head), key: (batch, k_len, heads, head)
        # product: (batch, heads, q_len, k_len), e.g. (32x8x10x10)
        product = torch.einsum("bqhd,bkhd->bhqk", query, key)

        # blocked positions get a very negative score so softmax maps them to ~0
        if mask is not None:
            product = product.masked_fill(mask == 0, float("-1e20"))

        product = product / sqrt(self.head)

        scores = F.softmax(product, dim=-1)

        ############### scores x value ###############

        # scores: (batch, heads, q_len, v_len), e.g. (32x8x10x10)
        # value:  (batch, v_len, heads, head),  e.g. (32x10x8x64)
        # weighted sum: (batch, q_len, heads, head), flattened to
        # (batch, q_len, embed_dim), e.g. (32x10x512)
        output = torch.einsum("nhql,nlhd->nqhd", scores, value).reshape(
            batch_size, q_len, self.heads * self.head
        )

        output = self.fc_out(output)  # (32x10x512) -> (32x10x512)

        return output


In [None]:
# EMBEDDINGS

from math import sin, cos, sqrt, log
import torch
import torch.nn as nn


class Embedding(nn.Module):
    """Token embedding lookup whose output is scaled by sqrt(embed_dim).

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.embed = nn.Embedding(vocab_size, embed_dim)

    def forward(self, x):
        """Look up the embeddings for the ids in ``x`` and scale them."""
        looked_up = self.embed(x)
        return looked_up * sqrt(self.embed_dim)


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding followed by dropout.

    A table of shape (1, max_seq_len, embed_dim) is precomputed and stored as
    the buffer ``pe``: even feature columns hold sines, odd columns cosines.

    Args:
        embed_dim: size of the embedding vectors (even or odd).
        max_seq_len: number of positions precomputed in the table.
        dropout: dropout probability applied after adding the encoding.
    """

    def __init__(self, embed_dim, max_seq_len=5000, dropout=0.1):

        super(PositionalEncoding, self).__init__()
        self.embed_dim = embed_dim
        self.dropout = nn.Dropout(dropout)

        positional_encoding = torch.zeros(max_seq_len, self.embed_dim)
        position = torch.arange(0, max_seq_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2) * -(log(10000.0) / embed_dim)
        )
        angles = position * div_term  # (max_seq_len, ceil(embed_dim / 2))
        positional_encoding[:, 0::2] = torch.sin(angles)
        # For an odd embed_dim there is one fewer odd column than even column,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = positional_encoding.unsqueeze(0)

        # register_buffer stores "pe" in the state_dict without making it a
        # trainable parameter
        self.register_buffer('pe', pe)

    def pe_sin(self, position, i):
        """Return sin(position / 10000 ** (2 * i / embed_dim)).

        This is the value stored in column ``2 * i`` of the table.
        """
        return sin(position / (10000 ** (2 * i / self.embed_dim)))

    def pe_cos(self, position, i):
        """Return cos(position / 10000 ** (2 * i / embed_dim)).

        This is the value stored in column ``2 * i + 1`` of the table.
        """
        return cos(position / (10000 ** (2 * i / self.embed_dim)))

    def forward(self, x):
        """Add the encoding of the first ``x.size(1)`` positions, then dropout.

        ``x`` is indexed as (batch, seq_len, embed_dim): the table is sliced
        along its second axis by ``x.size(1)``.
        """
        # buffers never require grad, so the slice can be added directly
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)

In [None]:
# UTILS

import copy
import torch.nn as nn

def replicate(block, N=6) -> nn.ModuleList:
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``block``.

    Every copy starts with the same parameter values as ``block`` but shares
    no storage with it.
    """
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(block))
    return clones

In [None]:
# ENCODER

import torch.nn as nn

def replicate(block, N=6) -> nn.ModuleList:
    """Return an ``nn.ModuleList`` with ``N`` deep copies of ``block``.

    Each copy has its own parameters, initialised to the same values as
    ``block``.

    Args:
        block: the module to clone.
        N: how many copies to create (6 by default).
    """
    # Imported locally so this cell does not depend on another cell having
    # imported ``copy`` first.
    import copy

    block_stack = nn.ModuleList([copy.deepcopy(block) for _ in range(N)])
    return block_stack

class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a position-wise feed-forward sub-layer.

    Each sub-layer is wrapped as ``dropout(norm(sublayer(x) + x))``. The same
    ``LayerNorm`` and ``Dropout`` instances are used for both sub-layers.

    Args:
        embed_dim: size of the embedding vectors.
        heads: number of attention heads.
        expansion_factor: the hidden width of the feed-forward network is
            ``expansion_factor * embed_dim``.
        dropout: dropout probability.
    """

    def __init__(self,
                 embed_dim=512,
                 heads=8,
                 expansion_factor=4,
                 dropout=0.2
                 ):

        super(TransformerBlock, self).__init__()

        self.attention = MultiHeadAttention(embed_dim, heads)  # the multi-head attention
        self.norm = nn.LayerNorm(embed_dim)  # the normalization layer

        # the feed-forward network
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, expansion_factor * embed_dim),  # e.g. 512 -> 2048
            nn.ReLU(),  # ReLU activation function
            nn.Linear(embed_dim * expansion_factor, embed_dim),  # e.g. 2048 -> 512
        )

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, key, query, value, mask=None):
        """Run attention and the feed-forward network with residual connections.

        Args:
            key, query, value: inputs forwarded to the attention layer in
                that order.
            mask: optional attention mask forwarded unchanged.

        Returns:
            Tensor with the same shape as ``query``.
        """
        #################### Multi-Head Attention ####################
        attention_out = self.attention(key, query, value, mask)  # e.g.: 32x10x512

        # Residual connection. The attention output has one row per *query*
        # position, so the skip connection must use ``query``; adding
        # ``value`` only works when both happen to have the same length.
        # For self-attention (key == query == value) nothing changes.
        attention_out = attention_out + query  # e.g.: 32x10x512

        # normalize, then dropout
        attention_norm = self.dropout(self.norm(attention_out))  # e.g.: 32x10x512

        #################### Feed-Forward Network ####################
        fc_out = self.feed_forward(attention_norm)  # e.g.: 32x10x512 -> 32x10x2048 -> 32x10x512

        # Residual connection
        fc_out = fc_out + attention_norm  # e.g.: 32x10x512

        # normalization + dropout
        fc_norm = self.dropout(self.norm(fc_out))  # e.g.: 32x10x512

        return fc_norm


class Encoder(nn.Module):
    """Stack of ``num_blocks`` transformer blocks on top of scaled embeddings.

    Args:
        seq_len: number of positions the positional encoding table covers.
        vocab_size: size of the source vocabulary.
        embed_dim: size of the embedding vectors.
        num_blocks: number of stacked transformer blocks.
        expansion_factor: feed-forward expansion factor of every block.
        heads: number of attention heads of every block.
        dropout: dropout probability inside the transformer blocks.
    """

    def __init__(self,
                 seq_len,
                 vocab_size,
                 embed_dim=512,
                 num_blocks=6,
                 expansion_factor=4,
                 heads=8,
                 dropout=0.2
                 ):
        super().__init__()

        # token ids -> scaled embedding vectors
        self.embedding = Embedding(vocab_size, embed_dim)

        # adds position information to the embeddings
        self.positional_encoder = PositionalEncoding(embed_dim, seq_len)

        # one prototype block, cloned ``num_blocks`` times
        prototype = TransformerBlock(embed_dim, heads, expansion_factor, dropout)
        self.blocks = replicate(prototype, num_blocks)

    def forward(self, x):
        """Encode token ids ``x``; e.g. (32x10) ids -> (32x10x512) features."""
        hidden = self.embedding(x)
        hidden = self.positional_encoder(hidden)

        # self-attention: the same tensor acts as key, query and value
        for layer in self.blocks:
            hidden = layer(hidden, hidden, hidden)

        return hidden

In [None]:
# DECODER

import torch.nn as nn
import torch.nn.functional as F



class DecoderBlock(nn.Module):
    """Masked self-attention over the decoder stream, then a TransformerBlock.

    The argument roles follow the order in which ``Decoder.forward`` calls a
    block in this file: ``block(encoder_output, x, encoder_output, mask)``.
    ``key`` and ``x`` therefore carry the encoder output, and ``query``
    carries the decoder stream.

    Args:
        embed_dim: size of the embedding vectors.
        heads: number of attention heads.
        expansion_factor: feed-forward expansion factor of the inner block.
        dropout: dropout probability.
    """

    def __init__(self,
                 embed_dim=512,
                 heads=8,
                 expansion_factor=4,
                 dropout=0.2
                 ):

        super(DecoderBlock, self).__init__()

        # masked multi-head self-attention of the decoder
        self.attention = MultiHeadAttention(embed_dim, heads)
        # normalization
        self.norm = nn.LayerNorm(embed_dim)
        # dropout to reduce overfitting
        self.dropout = nn.Dropout(dropout)
        # attention + feed-forward block
        self.transformerBlock = TransformerBlock(embed_dim, heads, expansion_factor, dropout)

    def forward(self, key, query, x, mask):
        """Run one decoder layer.

        Args:
            key: keys for the second attention (the encoder output).
            query: the decoder stream; it is self-attended under ``mask`` and
                then used as the query of the second attention.
            x: values for the second attention (the encoder output).
            mask: mask for the self-attention over ``query``; positions where
                ``mask == 0`` are blocked.

        Returns:
            The output of the inner ``TransformerBlock``.
        """
        # The mask belongs to the decoder stream, so self-attention must run
        # over ``query``. Running it over ``x`` would mask the encoder output
        # instead. When all three inputs are the same tensor the result is
        # identical either way.
        self_attention = self.attention(query, query, query, mask)
        # residual connection + normalization + dropout
        decoder_query = self.dropout(self.norm(self_attention + query))
        # second attention: decoder queries against encoder keys/values, then
        # the feed-forward network
        decoder_attention_output = self.transformerBlock(key, decoder_query, x)

        return decoder_attention_output


class Decoder(nn.Module):
    """Stack of ``num_blocks`` decoder blocks on top of target embeddings.

    Args:
        target_vocab_size: size of the target vocabulary.
        seq_len: number of positions the positional encoding table covers.
        embed_dim: size of the embedding vectors.
        num_blocks: number of stacked decoder blocks.
        expansion_factor: feed-forward expansion factor of every block.
        heads: number of attention heads of every block.
        dropout: dropout probability.
    """

    def __init__(self,
                 target_vocab_size,
                 seq_len,
                 embed_dim=512,
                 num_blocks=6,
                 expansion_factor=4,
                 heads=8,
                 dropout=0.2
                 ):
        super().__init__()

        # target token ids -> embedding vectors (plain nn.Embedding, unscaled)
        self.embedding = nn.Embedding(target_vocab_size, embed_dim)
        # adds position information
        self.positional_encoder = PositionalEncoding(embed_dim, seq_len)

        # one prototype decoder block, cloned ``num_blocks`` times
        prototype = DecoderBlock(embed_dim, heads, expansion_factor, dropout)
        self.blocks = replicate(prototype, num_blocks)
        # dropout applied to the position-encoded embeddings
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, mask):
        """Decode target ids ``x`` given ``encoder_output`` and a target mask."""
        embedded = self.embedding(x)
        x = self.dropout(self.positional_encoder(embedded))  # 32x10x512

        # every block receives the encoder output as ``key`` and as third
        # argument, and the running decoder state as ``query``
        for layer in self.blocks:
            x = layer(encoder_output, x, encoder_output, mask)

        return x

In [None]:
# TRANSFORMER

import torch
import torch.nn as nn
import torch.nn.functional as F

class Transformer(nn.Module):
    """Encoder-decoder transformer that returns target-vocabulary probabilities.

    Args:
        embed_dim: size of the embedding vectors.
        src_vocab_size: size of the source vocabulary.
        target_vocab_size: size of the target vocabulary.
        seq_len: number of positions the positional encodings cover.
        num_blocks: number of encoder blocks and of decoder blocks.
        expansion_factor: feed-forward expansion factor.
        heads: number of attention heads.
        dropout: dropout probability.
    """

    def __init__(self,
                 embed_dim,
                 src_vocab_size,
                 target_vocab_size,
                 seq_len,
                 num_blocks=6,
                 expansion_factor=4,
                 heads=8,
                 dropout=0.2):
        super(Transformer, self).__init__()
        self.target_vocab_size = target_vocab_size

        self.encoder = Encoder(seq_len=seq_len,
                               vocab_size=src_vocab_size,
                               embed_dim=embed_dim,
                               num_blocks=num_blocks,
                               expansion_factor=expansion_factor,
                               heads=heads,
                               dropout=dropout)

        self.decoder = Decoder(target_vocab_size=target_vocab_size,
                               seq_len=seq_len,
                               embed_dim=embed_dim,
                               num_blocks=num_blocks,
                               expansion_factor=expansion_factor,
                               heads=heads,
                               dropout=dropout)

        # projects decoder features onto the target vocabulary
        self.fc_out = nn.Linear(embed_dim, target_vocab_size)

    def make_trg_mask(self, trg):
        """Build a causal mask of shape (batch, 1, trg_len, trg_len).

        Entry ``[b, 0, i, j]`` is 1 when ``j <= i`` and 0 otherwise. The mask
        is created on the same device as ``trg`` so it can be combined with
        the attention scores without a device mismatch.
        """
        batch_size, trg_len = trg.shape
        # lower-triangular matrix of ones
        trg_mask = torch.tril(
            torch.ones((trg_len, trg_len), device=trg.device)
        ).expand(batch_size, 1, trg_len, trg_len)
        return trg_mask

    def forward(self, source, target):
        """Return softmax probabilities over the target vocabulary.

        The result is already normalised over the last axis; it is not a
        tensor of logits.
        """
        trg_mask = self.make_trg_mask(target)
        enc_out = self.encoder(source)
        outputs = self.decoder(target, enc_out, trg_mask)
        output = F.softmax(self.fc_out(outputs), dim=-1)
        return output

In [None]:
# MAIN

import torch
import torch.nn as nn
import torch.optim as optim
#from utils import replicate
#from attention import MultiHeadAttention
from torch.utils.data import Dataset, DataLoader
import math
from tqdm import tqdm
#from encoder import TransformerBlock
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors
!pip install datasets
from datasets import load_dataset
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.normalizers import Sequence, Strip
import pandas as pd
import sys
import csv
import os
import tensorflow as tf
from tensorflow.keras.models import load_model
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import math
from tqdm import tqdm
from sklearn.model_selection import train_test_split



In [None]:
class CustomTextDataset(Dataset):
    """Next-token-prediction dataset built from the ``merged`` column of a CSV.

    A WordPiece tokenizer is trained on the snippets when the dataset is
    created. Every item is a ``(src, tgt)`` pair of id tensors where ``tgt``
    is ``src`` shifted one position to the left.

    Args:
        csv_file: path of a CSV file with a ``merged`` text column.
        max_seq_length: length every encoded snippet is padded / truncated
            to. When omitted, the module-level ``MAX_SEQ_LENGTH`` is used.
    """

    def __init__(self, csv_file, max_seq_length=None):
        self.code_snippets_df = pd.read_csv(csv_file)

        # The merged column as a list of strings. Missing cells are read as
        # NaN floats, which the tokenizer cannot consume, so they are dropped.
        self.code_snippets = (
            self.code_snippets_df['merged'].dropna().astype(str).tolist()
        )

        # fall back to the notebook-level constant when no length is given
        self.max_seq_length = (
            MAX_SEQ_LENGTH if max_seq_length is None else max_seq_length
        )

        self.tokenizer = self.prepare_tokenizer()

    def prepare_tokenizer(self):
        """Train and configure the WordPiece tokenizer for the snippets."""
        # NOTE(review): this runs after pd.read_csv has already loaded the
        # file; confirm the raised csv field limit is still needed.
        csv.field_size_limit(12456763)

        # Initialize a tokenizer for Python code snippets
        tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

        # Normalizer: strip surrounding whitespace (no lower-casing since
        # Python is case-sensitive)
        tokenizer.normalizer = Sequence([Strip()])

        # Pre-tokenizer: the Whitespace pre-tokenizer only
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace()])

        # Trainer: learn the vocabulary from the code snippets
        trainer = trainers.WordPieceTrainer(
            vocab_size=50000,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )
        tokenizer.train_from_iterator(self.code_snippets, trainer=trainer)

        # Post-processing: wrap every sequence in [CLS] ... [SEP]
        tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", tokenizer.token_to_id("[CLS]")),
                ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ],
        )

        # Pad with the real id of "[PAD]" so that ``Encoding.ids`` can be
        # used directly; the library default pad id is 0, which is "[UNK]"
        # in this vocabulary.
        tokenizer.enable_padding(
            pad_id=tokenizer.token_to_id("[PAD]"),
            pad_token="[PAD]",
            length=self.max_seq_length,
        )
        tokenizer.enable_truncation(self.max_seq_length)
        return tokenizer

    def __len__(self):
        return len(self.code_snippets)

    def __getitem__(self, idx):
        """Return ``(src, tgt)`` id tensors for snippet ``idx``."""
        encoding = self.tokenizer.encode(self.code_snippets[idx])
        # Use the ids computed by the tokenizer instead of rebuilding the
        # whole vocabulary dictionary once per token.
        token_ids = torch.tensor(encoding.ids, dtype=torch.long)
        # the input (src) and the same sequence shifted by one as target (tgt)
        return token_ids[:-1], token_ids[1:]  # src, tgt



In [None]:
# Path of the CSV file with the training snippets
csv_file = r'/content/modified_python_code_dataset.csv'

# Path where the model is saved
model_file = 'model.json'

# TensorBoard log directory
log_dir = "runs/transformer_experiment"

# TensorBoard SummaryWriter
writer = SummaryWriter(log_dir)

# Hyper-parameters. They are defined before the branch below because the
# dataset and the training cell read MAX_SEQ_LENGTH / VOCAB_SIZE whether or
# not a saved model exists.
EMBEDDING_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 2048
NUM_DECODER_LAYERS = 6
MAX_SEQ_LENGTH = 100
VOCAB_SIZE = 50000

# Check if the model file exists
if os.path.exists(model_file):
    print("Loading the pre-trained model...")
    # NOTE(review): torch.load unpickles the file; only load trusted files.
    # The model classes are defined in the else-branch below, so they are not
    # yet available when this branch runs in a fresh session - confirm.
    model = torch.load(model_file)
else:
    print("Training the model from scratch...")

    class DecoderBlock(nn.Module):
        """Masked self-attention layer with residual, LayerNorm and dropout.

        ``forward`` uses only ``x`` and ``mask``; ``key`` and ``query`` are
        accepted but ignored.
        """

        def __init__(self, embed_dim=512, heads=8, expansion_factor=4, dropout=0.2):
            super().__init__()

            # masked multi-head self-attention
            self.attention = MultiHeadAttention(embed_dim, heads)
            # layer normalization
            self.norm = nn.LayerNorm(embed_dim)
            # dropout to reduce overfitting
            self.dropout = nn.Dropout(dropout)
            # NOTE(review): this block is constructed but never called in
            # forward(), so no feed-forward network is applied - confirm
            # this is intended.
            self.transformerBlock = TransformerBlock(embed_dim, heads, expansion_factor, dropout)

        def forward(self, key, query, x, mask):
            """Return ``dropout(norm(self_attention(x) + x))``."""
            attended = self.attention(x, x, x, mask)
            # residual connection, then normalization and dropout
            residual = attended + x
            return self.dropout(self.norm(residual))

    class Decoder(nn.Module):
        """Stack of ``num_blocks`` independently initialised decoder blocks.

        ``forward`` expects already-embedded inputs: it applies dropout and
        then the blocks. The ``embedding`` and ``positional_encoder``
        sub-modules are created but not used by ``forward``.
        """

        def __init__(self, target_vocab_size, seq_len, embed_dim=512, num_blocks=6, expansion_factor=4, heads=8, dropout=0.2):
            super().__init__()

            # NOTE(review): the next two modules are never called in
            # forward() - confirm they are still needed.
            self.embedding = nn.Embedding(target_vocab_size, embed_dim)
            self.positional_encoder = PositionalEncoding(embed_dim, seq_len)

            # every block is constructed separately, so each one gets its own
            # initial weights
            layers = []
            for _ in range(num_blocks):
                layers.append(DecoderBlock(embed_dim, heads, expansion_factor, dropout))
            self.blocks = nn.ModuleList(layers)

            # dropout applied to the incoming features
            self.dropout = nn.Dropout(dropout)

        def forward(self, x, mask):
            """Apply dropout and every decoder block to ``x`` under ``mask``."""
            hidden = self.dropout(x)  # 32x10x512

            for layer in self.blocks:
                hidden = layer(hidden, hidden, hidden, mask)

            return hidden

    class PositionalEncoding(nn.Module):
        """Sinusoidal positional encoding for batch-first inputs.

        The table is stored as the buffer ``pe`` with shape
        (max_seq_length, 1, embedding_dim).

        Args:
            embedding_dim: size of the embedding vectors (even or odd).
            max_seq_length: number of positions in the table.
        """

        def __init__(self, embedding_dim, max_seq_length):
            super(PositionalEncoding, self).__init__()
            pe = torch.zeros(max_seq_length, embedding_dim)
            position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
            angles = position * div_term
            pe[:, 0::2] = torch.sin(angles)
            # an odd embedding_dim has one fewer odd column than even column
            pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
            pe = pe.unsqueeze(0).transpose(0, 1)  # (max_seq_length, 1, embedding_dim)
            self.register_buffer('pe', pe)

        def forward(self, x):
            """Add the encoding of each sequence position to ``x``.

            ``x`` is (batch, seq_len, embedding_dim), which is how
            ``TransformerModel.forward`` passes the embedded batch. The table
            must therefore be sliced by the sequence length ``x.size(1)``,
            not by the batch size, and moved to a (1, seq_len, dim) layout so
            it broadcasts over the batch.
            """
            x = x + self.pe[:x.size(1)].transpose(0, 1)
            return x

    class TransformerModel(nn.Module):
        """Decoder-only language model returning next-token probabilities.

        Args:
            VOCAB_SIZE: number of token ids; used for both the embedding
                table and the output projection.
        """

        def __init__(self, VOCAB_SIZE):
            super(TransformerModel, self).__init__()
            self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_SIZE)
            self.pos_encoder = PositionalEncoding(EMBEDDING_SIZE, MAX_SEQ_LENGTH)
            self.transformer_decoder = Decoder(VOCAB_SIZE, MAX_SEQ_LENGTH, EMBEDDING_SIZE, heads=NHEAD)
            self.fc_out = nn.Linear(EMBEDDING_SIZE, VOCAB_SIZE)
            self.softmax = nn.Softmax(dim=2)

        def generate_square_subsequent_mask(self, sz):
            """Return a (sz, sz) causal mask: 1 = may attend, 0 = blocked.

            Row ``i`` has ones in columns ``0..i``. The attention layer used
            by this model blocks the positions where ``mask == 0``. An
            additive mask (0 for visible, -inf for future positions) would
            therefore hide the past and expose the future.
            """
            mask = torch.tril(torch.ones(sz, sz))
            return mask

        def forward(self, src):
            """Map token ids (batch, seq_len) to probabilities (batch, seq_len, vocab).

            The output has already been passed through softmax; it is not a
            tensor of logits.
            """
            src_mask = self.generate_square_subsequent_mask(src.size(1)).to(src.device)
            src = self.embedding(src) * math.sqrt(EMBEDDING_SIZE)
            src = self.pos_encoder(src)
            output = self.transformer_decoder(src, src_mask)
            output = self.fc_out(output)
            output = self.softmax(output)
            return output

# Use the currently selected CUDA device when one is available, else the CPU.
device = torch.device(
    f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
)




Training the model from scratch...


In [None]:
 # Model definition
# NOTE(review): this always builds a fresh model and replaces any model that
# was loaded from disk earlier - confirm this is intended.
model = TransformerModel(VOCAB_SIZE).to(device)

# DATA SPLITTING: 70% train, 10% validation, 20% test

dataset = CustomTextDataset(csv_file)
train_ratio = 0.7
val_ratio = 0.1
test_ratio = 0.2

# First split: train versus the remainder (temp = validation + test)
train_size = int(train_ratio * len(dataset))
temp_size = len(dataset) - train_size
train_dataset, temp_dataset = random_split(dataset, [train_size, temp_size])

# Second split: divide the remainder into validation and test sets
val_size = int(val_ratio / (val_ratio + test_ratio) * temp_size)
test_size = temp_size - val_size
val_dataset, test_dataset = random_split(temp_dataset, [val_size, test_size])

BATCH_SIZE = 16

# Only the training loader shuffles its samples
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# TRAINING LOOP

NUM_EPOCHS = 50
LEARNING_RATE = 0.001

# The dataset built for the data split is reused here. Building it again
# would retrain the tokenizer without affecting the existing loaders.

# Padding positions carry no information, so they are left out of the loss.
pad_token_id = dataset.tokenizer.token_to_id("[PAD]")

# The model returns softmax probabilities. nn.CrossEntropyLoss expects raw
# logits and would apply a second softmax, so the loss is computed as the
# negative log-likelihood of the log-probabilities instead.
criterion = nn.NLLLoss(
    ignore_index=pad_token_id if pad_token_id is not None else -100
)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch+1}/{NUM_EPOCHS}', leave=True)

    for src, tgt in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        # Forward pass: probabilities of shape (batch, seq_len, vocab)
        output = model(src)

        # Clamp before the log so a zero probability cannot produce -inf
        log_probs = torch.log(output.clamp_min(1e-12))
        loss = criterion(log_probs.reshape(-1, log_probs.size(-1)), tgt.reshape(-1))

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Accumulate the loss
        total_loss += loss.item()

    # Average loss and perplexity of the epoch
    avg_train_loss = total_loss / max(len(train_loader), 1)
    train_perplexity = math.exp(avg_train_loss)

    # Log to TensorBoard
    writer.add_scalar("Loss/Train", avg_train_loss, epoch+1)
    writer.add_scalar("Perplexity/Train", train_perplexity, epoch+1)

    # VALIDATION LOOP

    model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src)
            log_probs = torch.log(output.clamp_min(1e-12))
            val_loss = criterion(log_probs.reshape(-1, log_probs.size(-1)), tgt.reshape(-1))
            total_val_loss += val_loss.item()

    # max(..., 1) avoids a division by zero when the validation split is empty
    avg_val_loss = total_val_loss / max(len(val_loader), 1)
    val_perplexity = math.exp(avg_val_loss)

    # Log validation loss and perplexity to TensorBoard
    writer.add_scalar("Loss/Validation", avg_val_loss, epoch+1)
    writer.add_scalar("Perplexity/Validation", val_perplexity, epoch+1)

    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {avg_train_loss:.4f}, Train Perplexity: {train_perplexity:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Perplexity: {val_perplexity:.4f}')

# Save the trained model (the whole module object is pickled) to a file
torch.save(model, model_file)

# Close TensorBoard writer
writer.close()

# Load the model from the saved file
# model = model.from_file("model.json")

Epoch 1/50:   1%|          | 238/24479 [2:00:39<197:36:48, 29.35s/it]

In [None]:
# import torch
# print(torch.cuda.is_available())
# print(torch.cuda.current_device())
# device=torch.device('cuda')
# device