<a href="https://www.kaggle.com/code/fahmikazimd/ml-project-bn-to-en-transformer-model?scriptVersionId=172010617" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Imports**

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import timm
import torch.optim as optim

from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer
from transformers import AutoTokenizer, AutoModel

import math
import numpy as np

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# **Config**

In [None]:
class Config:
    """Hyperparameters read by the dataset, the model and the training loop."""

    # --- tokenizer / sequence settings ---
    tokeniserModel = "bert-base-multilingual-cased"  # Hugging Face checkpoint name
    vocabSize = 119_547  # presumably the vocabulary size of that tokenizer; verify
    maxLength = 1_024    # every sentence is padded to this many tokens

    # --- model architecture ---
    dModel = 768
    numHeads = 3
    numLayers = 6
    dropout = 0.1

    # --- optimisation ---
    learningRate = 0.001
    batchSize = 8
    epochs = 100
    

# **Loading Dataset**

In [None]:
# Directory holding the parallel en-bn training files on the Kaggle input mount.
basDir = "/kaggle/input/samanantar/final_data/en-bn/"
EN_PATH = f"{basDir}train.en"
BN_PATH = f"{basDir}train.bn"

In [None]:
# print("EN_PATH:", EN_PATH)
# print("BN_PATH", BN_PATH)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Parallel-sentence dataset that tokenizes one sentence pair per item.

    The Bengali file is treated as the source language and the English file
    as the target language. Both files are read fully into memory, one
    sentence per line.

    Args:
        EN_PATH: Path to the English (target) text file.
        BN_PATH: Path to the Bengali (source) text file.
        maxLength: Number of tokens every sentence is padded/truncated to.
            Defaults to ``Config.maxLength``.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """

    def __init__(self, EN_PATH, BN_PATH, maxLength=None):
        self.targetPath = EN_PATH
        self.sourcePath = BN_PATH
        self.maxLength = Config.maxLength if maxLength is None else maxLength
        self.tokenizer = AutoTokenizer.from_pretrained(Config.tokeniserModel)
        with open(self.sourcePath, mode='rt', encoding='utf-8') as file:
            self.source = file.read().strip().split('\n')

        with open(self.targetPath, mode='rt', encoding='utf-8') as file:
            self.target = file.read().strip().split('\n')

        # The corpus is line-aligned; a length mismatch would silently pair
        # unrelated sentences (or raise IndexError much later).
        if len(self.source) != len(self.target):
            raise ValueError(
                f"Source has {len(self.source)} lines but target has "
                f"{len(self.target)}; the files must be line-aligned."
            )
        print("Data read successfully")

    def _encode(self, text):
        """Return the token ids of *text* as a 1-D tensor of length ``maxLength``."""
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,  # keep every item the same length so batches can be stacked
            max_length=self.maxLength,
            return_tensors="pt",
        )
        # The tokenizer returns shape (1, maxLength) for a single sentence;
        # drop the leading dim so the DataLoader yields (batch, maxLength).
        return encoded["input_ids"].squeeze(0)

    def __getitem__(self, index):
        """Return ``(target_ids, source_ids)`` for the pair at *index*.

        Tensors are left on the CPU; the training loop moves each batch to
        the compute device.
        """
        src = self._encode(self.source[index])
        tgt = self._encode(self.target[index])
        return tgt, src

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source)

In [None]:
# Load both sides of the parallel corpus into memory.
dataset = Dataset(EN_PATH=EN_PATH, BN_PATH=BN_PATH)

# Train / validation / test split

In [None]:
# Hold out 15% of the pairs for validation and another 15% for testing;
# whatever remains (including the rounding remainder) is used for training.
TEST_SIZE = (len(dataset) * 15) // 100
VAL_SIZE = TEST_SIZE
TRAIN_SIZE = len(dataset) - (VAL_SIZE + TEST_SIZE)
print("TRAIN_SIZE:", TRAIN_SIZE, ",VAL_SIZE:", VAL_SIZE, ",TEST_SIZE:", TEST_SIZE)

In [None]:
# Randomly partition the dataset into the three subsets sized above.
trainSet, valSet, testSet = torch.utils.data.random_split(
    dataset,
    lengths=[TRAIN_SIZE, VAL_SIZE, TEST_SIZE],
)

In [None]:
# Reshuffle the training pairs every epoch. `shuffle` is a boolean flag;
# the previous value `2` only worked because it is truthy.
trainLoader = torch.utils.data.DataLoader(trainSet, batch_size=Config.batchSize, shuffle=True)

In [None]:
# Validation data is only scored, never trained on, so keep a fixed order
# (the previous `shuffle=2` was truthy and reshuffled it every epoch).
validationLoader = torch.utils.data.DataLoader(valSet, batch_size=Config.batchSize, shuffle=False)

In [None]:
# Test data is only scored, so keep a deterministic order
# (the previous `shuffle=2` was truthy and reshuffled it on every pass).
testLoader = torch.utils.data.DataLoader(testSet, batch_size=Config.batchSize, shuffle=False)

# **Implementation**

# Positional Encoder

In [None]:
class PositionalEncoder(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature indices hold ``sin(pos * w_i)`` and odd indices hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``. The table is
    computed once and stored as the (non-trainable) buffer ``pe`` of shape
    ``(1, max_seq_len, d_model)``.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_seq_len: Longest sequence length the table supports.
    """

    def __init__(self, d_model, max_seq_len):
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len

        # Compute the positional encodings once in the constructor
        position = torch.arange(0, max_seq_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_len, ceil(d_model / 2))

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor indexed as (batch, sequence, d_model).

        Raises:
            ValueError: If the sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )
        # Add the positional encodings to the input tensor
        return x + self.pe[:, :seq_len]


# Masked Multihead Attention

In [None]:
class MaskedMultiheadAttention(nn.Module):
    """Causal multi-head attention over batch-first tensors.

    Wraps ``nn.MultiheadAttention`` with an additive mask that prevents each
    position from attending to later positions, followed by dropout on the
    attention output.

    Args:
        d_model: Feature dimension of the inputs.
        nhead: Number of attention heads.
        dropout: Dropout probability applied to the attention output.
    """

    def __init__(self, d_model, nhead, dropout=0.0):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, seq_len, device=None, dtype=None):
        """Return a ``(seq_len, seq_len)`` additive causal mask.

        Entries strictly above the diagonal (future positions) are ``-inf``;
        the diagonal and everything below it are ``0``.
        """
        mask = torch.full((seq_len, seq_len), float('-inf'), device=device, dtype=dtype)
        return torch.triu(mask, diagonal=1)

    def forward(self, query, key, value):
        """Apply causal attention.

        Args:
            query, key, value: Tensors indexed as (batch, sequence, d_model).
                The mask is square and sized from ``query``, so ``key`` must
                have the same sequence length (i.e. self-attention).

        Returns:
            Tensor with the same layout as ``query``.
        """
        # Build the mask on the inputs' own device/dtype rather than a global.
        seq_len = query.shape[1]
        mask = self.generate_mask(seq_len, device=query.device, dtype=query.dtype)

        # nn.MultiheadAttention (batch_first=False) expects (sequence, batch, d_model)
        query = query.permute(1, 0, 2)
        key = key.permute(1, 0, 2)
        value = value.permute(1, 0, 2)

        output, _ = self.multihead_attn(query, key, value, attn_mask=mask)

        # Back to (batch, sequence, d_model)
        output = output.permute(1, 0, 2)
        return self.dropout(output)

# Encoder

In [None]:
class Encoder(nn.Module):
    """One encoder layer: self-attention and a feed-forward block.

    Each sub-layer output goes through dropout, is added to its input, and
    the sum is layer-normalised. Inputs are batch-first, matching what the
    embedding layers produce.

    NOTE(review): a single LayerNorm instance is shared by both sub-layers;
    this is kept so existing parameter names do not change.

    Args:
        dModel: Feature dimension.
        numHeads: Number of attention heads.
        dropout: Dropout probability for both sub-layers.
        dimFeedForward: Hidden width of the feed-forward block.
    """

    def __init__(self, dModel, numHeads, dropout, dimFeedForward=2048):
        super().__init__()
        # batch_first=True: inputs are (batch, sequence, dModel). Without it
        # the module treats dim 0 as the sequence and attends across the batch.
        self.multiheadAttention = nn.MultiheadAttention(dModel, numHeads, batch_first=True)
        self.norm = nn.LayerNorm(dModel)
        self.feedForward = nn.Sequential(
            nn.Linear(dModel, dimFeedForward),
            nn.ReLU(),
            nn.Linear(dimFeedForward, dModel)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, source):
        """Encode ``source`` of shape (batch, sequence, dModel); shape is preserved."""
        sourceAttentionOutput, _ = self.multiheadAttention(source, source, source)
        source = self.norm(source + self.dropout(sourceAttentionOutput))
        feedForwardOutput = self.feedForward(source)
        source = self.norm(source + self.dropout(feedForwardOutput))
        return source

# Decoder

In [None]:
class Decoder(nn.Module):
    """One decoder layer: causal self-attention, cross-attention, feed-forward.

    Each sub-layer output is added to ``target`` and the sum is
    layer-normalised. Inputs are batch-first.

    NOTE(review): a single LayerNorm instance is shared by all three
    sub-layers; this is kept so existing parameter names do not change.

    Args:
        dModel: Feature dimension.
        numHeads: Number of attention heads.
        dropout: Dropout probability for the sub-layers.
        dimFeedForward: Hidden width of the feed-forward block.
    """

    def __init__(self, dModel, numHeads, dropout, dimFeedForward=2048):
        super().__init__()
        # batch_first=True so cross-attention reads (batch, sequence, dModel)
        # the same way MaskedMultiheadAttention does.
        self.multiheadAttention = nn.MultiheadAttention(dModel, numHeads, batch_first=True)
        self.maskedMultiheadAttention = MaskedMultiheadAttention(dModel, numHeads, dropout)
        self.norm = nn.LayerNorm(dModel)
        self.feedForward = nn.Sequential(
            nn.Linear(dModel, dimFeedForward),
            nn.ReLU(),
            nn.Linear(dimFeedForward, dModel)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, source, target):
        """Update ``target`` using the encoded ``source``.

        Args:
            source: Encoder output, (batch, source_len, dModel).
            target: Decoder state, (batch, target_len, dModel).

        Returns:
            ``(source, target)``: ``source`` is passed through unchanged so
            layers can be chained; ``target`` is the updated decoder state.
        """
        # 1) Causal self-attention. MaskedMultiheadAttention already applies
        #    dropout to its output, so none is added here.
        targetAttentionOutput = self.maskedMultiheadAttention(target, target, target)
        target = self.norm(target + targetAttentionOutput)

        # 2) Cross-attention: queries come from the decoder, keys and values
        #    from the encoder. The sequence lengths may differ, so no padding
        #    of either tensor is needed.
        multiheadAttentionOutput, _ = self.multiheadAttention(target, source, source)
        target = self.norm(target + self.dropout(multiheadAttentionOutput))

        # 3) Position-wise feed-forward on the decoder state.
        feedForwardOutput = self.feedForward(target)
        target = self.norm(target + self.dropout(feedForwardOutput))
        return source, target

# Transformer Layer

In [None]:
class TransformerLayer(nn.Module):
    """Pair of one ``Encoder`` and one ``Decoder`` applied back to back.

    Args:
        dModel: Feature dimension passed to both sub-modules.
        numHeads: Number of attention heads passed to both sub-modules.
        dropout: Dropout probability passed to both sub-modules.
    """

    def __init__(self, dModel, numHeads, dropout):
        super().__init__()
        self.encoder = Encoder(dModel, numHeads, dropout)
        self.decoder = Decoder(dModel, numHeads, dropout)

    def forward(self, source, target):
        """Encode ``source``, then decode ``target`` against that encoding.

        Returns the ``(source, target)`` pair produced by the decoder.
        """
        encoded = self.encoder(source)
        decoderSource, decoderTarget = self.decoder(encoded, target)
        return decoderSource, decoderTarget

# Transformer

In [None]:
# Define the Transformer model
class Transformer(nn.Module):
    """Sequence-to-sequence model built from stacked ``TransformerLayer``s.

    Token ids are embedded (separate tables for source and target), given
    sinusoidal position encodings, passed through every layer in order, and
    the final decoder state is projected to vocabulary logits.

    Args:
        vocabSize: Number of token ids; sizes both embeddings and the output.
        dModel: Feature dimension.
        numHeads: Attention heads per layer.
        numLayers: Number of ``TransformerLayer``s.
        dropout: Dropout probability inside each layer.
        maxLength: Longest supported sequence length for the positional
            table. Defaults to ``Config.maxLength``.
    """

    def __init__(self, vocabSize, dModel, numHeads, numLayers, dropout, maxLength=None):
        super().__init__()
        if maxLength is None:
            maxLength = Config.maxLength
        self.inputEmbeddings = nn.Embedding(vocabSize, dModel)
        self.outputEmbeddings = nn.Embedding(vocabSize, dModel)
        # The table is indexed by position, not token id, so it is sized by
        # the maximum sequence length. Sizing it by the vocabulary allocated a
        # vocabSize x dModel buffer that was also written into every checkpoint.
        self.positionalEncoder = PositionalEncoder(dModel, maxLength)
        self.transformerLayers = nn.ModuleList([TransformerLayer(dModel, numHeads, dropout) for _ in range(numLayers)])
        self.fullyConnected = nn.Linear(dModel, vocabSize)

    def forward(self, source, target):
        """Return logits for ``target`` given ``source``.

        Args:
            source: Source token ids; assumed (batch, source_len) -- TODO confirm with caller.
            target: Target token ids; assumed (batch, target_len) -- TODO confirm with caller.

        Returns:
            The output of the final linear layer, whose last dimension has
            size ``vocabSize``.
        """
        source = self.positionalEncoder(self.inputEmbeddings(source))
        target = self.positionalEncoder(self.outputEmbeddings(target))
        for layer in self.transformerLayers:
            source, target = layer(source, target)
        return self.fullyConnected(target)


# **Train**

In [None]:
import gc

In [None]:
# Define training and validation functions
def train(model, iterator, optimizer, criterion, device):
    """Run one training epoch with teacher forcing and return the mean batch loss.

    Each batch from ``iterator`` is an ``(en_data, bn_data)`` pair of token-id
    tensors. Bengali is the encoder input; English is the sequence being
    generated: the decoder is fed every English token but the last and is
    scored against every English token but the first.

    Args:
        model: Callable as ``model(source_ids, target_ids)`` returning logits
            whose last dimension is the vocabulary.
        iterator: Iterable of batches that also supports ``len()``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits_2d, labels_1d)``.
        device: Device the batches are moved to.

    Returns:
        The sum of per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0
    for en_data, bn_data in iterator:
        # Flatten to (batch, sequence); this also absorbs a stray singleton
        # dimension if the dataset yields (batch, 1, sequence).
        en_data = en_data.to(device).reshape(en_data.size(0), -1)
        bn_data = bn_data.to(device).reshape(bn_data.size(0), -1)

        # Shift along the *sequence* axis (dim 1). Slicing dim 0 would drop a
        # whole sentence from the batch instead of a token from each sentence.
        decoder_input = en_data[:, :-1]
        labels = en_data[:, 1:].reshape(-1)

        optimizer.zero_grad()
        output = model(bn_data, decoder_input)
        output = output.reshape(-1, output.shape[-1])
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion, device):
    """Return the mean batch loss over ``iterator`` without updating the model.

    Each batch is an ``(en_data, bn_data)`` pair of token-id tensors. Bengali
    is the encoder input; the decoder is fed every English token but the last
    and is scored against every English token but the first.

    Args:
        model: Callable as ``model(source_ids, target_ids)`` returning logits
            whose last dimension is the vocabulary.
        iterator: Iterable of batches that also supports ``len()``.
        criterion: Loss taking ``(logits_2d, labels_1d)``.
        device: Device the batches are moved to.

    Returns:
        The sum of per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for en_data, bn_data in iterator:
            # Flatten to (batch, sequence); this also absorbs a stray
            # singleton dimension if the dataset yields (batch, 1, sequence).
            en_data = en_data.to(device).reshape(en_data.size(0), -1)
            bn_data = bn_data.to(device).reshape(bn_data.size(0), -1)

            # Shift along the sequence axis (dim 1), not the batch axis.
            decoder_input = en_data[:, :-1]
            labels = en_data[:, 1:].reshape(-1)

            output = model(bn_data, decoder_input)
            output = output.reshape(-1, output.shape[-1])
            loss = criterion(output, labels)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [None]:
# model = Transformer(Config.vocabSize, Config.dModel, Config.numHeads, Config.numLayers, Config.dropout).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0, lr=Config.learningRate)
# criterion = nn.CrossEntropyLoss()
# for epoch in range(1, 100):
#     print("Epoch", epoch)
#     train_loss = train_step(model, trainLoader, optimizer)
#     print(f"Train Loss for {epoch} : {train_loss}")

In [None]:
# Train the model, keeping the weights with the lowest validation loss.
best_val_loss = float('inf')
model = Transformer(Config.vocabSize, Config.dModel, Config.numHeads, Config.numLayers, Config.dropout).to(device)
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0, lr=Config.learningRate)
# Every sentence is padded to a fixed length, so most label positions are
# padding. Exclude them from the loss; otherwise predicting the pad token
# dominates the objective.
criterion = nn.CrossEntropyLoss(ignore_index=dataset.tokenizer.pad_token_id)
for epoch in range(Config.epochs):
    train_loss = train(model, trainLoader, optimizer, criterion, device)
    val_loss = evaluate(model, validationLoader, criterion, device)
    print(f'Epoch {epoch+1} Train Loss: {train_loss:.3f} Val Loss: {val_loss:.3f}')
    if val_loss < best_val_loss:
        # New best: overwrite the checkpoint on disk.
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')

In [None]:
# def train_step(model, trainLoader, optimizer):
#     train_loss = 0
    
#     for batch, (target.to(device), source.to(device)) in enumerate(trainLoader):
#         model.train()
#         output = model(source, target)
#         loss = loss_function(output, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item()
        
#         if batch % 50 == 0:
#             print(f"Train Loss for batch#{batch} : {loss.item()}")
        
#         del source, target, output
#         gc.collect()
        
        
        
    
#     train_loss /= len(train_loss)
#     return train_loss