# Language Translation Encoder Decoder

In [1]:
import torch
import torch.nn as nn

import pandas as pd
import os

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from torch.utils.data import Dataset , DataLoader

from Encoder import Encoder
from PositionalEmbeddings import RoPE
from Decoder import Decoder

import matplotlib.pyplot as plt
from IPython.display import clear_output
import pandas as pd

In [2]:
# --- Hyperparameters and runtime configuration ---
MAX_LEN = 20      # maximum sequence length in tokens, [SOS]/[EOS] included
BATCH_SIZE = 32   # sentence pairs per DataLoader batch
EMB_DIM = 512     # embedding width (also passed as the layers' hidden_dim)
NUM_HEADS = 8     # attention heads per encoder/decoder layer

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

print(DEVICE)

cpu


# Downloading the dataset from Kaggle

In [3]:
"""Loading Dataset"""
# Fetch the English-to-Hindi parallel dataset through kagglehub.
# `path` holds the value returned by `dataset_download`; later cells append
# "/newdata.csv" to it to locate the CSV file.
import kagglehub
path = kagglehub.dataset_download("kuldeepsingharya/english-to-hindi-parallel-dataset")
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/lamao/.cache/kagglehub/datasets/kuldeepsingharya/english-to-hindi-parallel-dataset/versions/1


# Tokenizer

Note: I don't fully understand this code. I asked GPT to help me with the tokenizer because I was too lazy to write the whole thing by myself.

In [4]:
"""Tokenizers"""

# Special tokens shared by both vocabularies. Their order fixes their ids
# ([PAD] is registered first), so it must not be changed once a checkpoint exists.
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[SOS]", "[EOS]"]
VOCAB_SIZE = 10_000


def _new_word_level_trainer():
    """Return a WordLevelTrainer configured with the shared vocab size and special tokens."""
    return WordLevelTrainer(vocab_size=VOCAB_SIZE, special_tokens=list(SPECIAL_TOKENS))


def _train_word_level_tokenizer(corpus, trainer):
    """Train a whitespace-pre-tokenized word-level tokenizer on `corpus`.

    Args:
        corpus: A single string holding the whole training text.
        trainer: The WordLevelTrainer used to build the vocabulary.

    Returns:
        The trained `Tokenizer`; unknown words map to "[UNK]".
    """
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator([corpus], trainer=trainer)
    return tokenizer


df = pd.read_csv(os.path.join(path, "newdata.csv"))
print(df["english_sentence"][0])
print(df["hindi_sentence"][0])

# Cast both columns to str before joining: a non-string cell (e.g. NaN from an
# empty field) would otherwise make " ".join raise a TypeError. The English
# side is lower-cased because the dataset lower-cases English at lookup time.
eng_corpus = " ".join(df['english_sentence'].astype(str).tolist()).lower()
hin_corpus = " ".join(df['hindi_sentence'].astype(str).tolist())

eng_trainer = _new_word_level_trainer()
english_tokenizer = _train_word_level_tokenizer(eng_corpus, eng_trainer)

hi_trainer = _new_word_level_trainer()
hi_tokenizer = _train_word_level_tokenizer(hin_corpus, hi_trainer)



politicians do not have permission to do what needs to be done.
राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .


In [5]:
class TranslationDataset(Dataset):
    """Parallel English/Hindi sentence pairs served as fixed-length id tensors.

    The CSV at `dataset_path` must provide `english_sentence` and
    `hindi_sentence` columns. Every pair is tokenized once at construction
    time; pairs whose English or Hindi sequence (with [SOS] and [EOS] added)
    exceeds `max_length` are dropped. English text is lower-cased before
    tokenization, both for the length filter and for the returned ids.

    Args:
        dataset_path: Path of the CSV file to read.
        english_tokenizer: Tokenizer exposing `encode(text).ids` and
            `token_to_id(token)` for the source language.
        hi_tokenizer: Same interface, for the target language.
        max_length: Length every returned sequence is padded/truncated to.
    """

    def __init__(self, dataset_path, english_tokenizer, hi_tokenizer, max_length):
        super().__init__()
        self.df = pd.read_csv(dataset_path)

        self.english_tokenizer = english_tokenizer
        self.hi_tokenizer = hi_tokenizer
        self.max_length = max_length

        self.pad_id_en = self.english_tokenizer.token_to_id("[PAD]")
        self.sos_id_en = self.english_tokenizer.token_to_id("[SOS]")
        self.eos_id_en = self.english_tokenizer.token_to_id("[EOS]")

        self.pad_id_hi = self.hi_tokenizer.token_to_id("[PAD]")
        self.sos_id_hi = self.hi_tokenizer.token_to_id("[SOS]")
        self.eos_id_hi = self.hi_tokenizer.token_to_id("[EOS]")

        # Sentences that survive the length filter, plus their cached token ids
        # (without special tokens) so __getitem__ never has to re-tokenize.
        self.english_sentences = []
        self.hindi_sentences = []
        self._english_ids = []
        self._hindi_ids = []

        sentence_pairs = zip(
            self.df['english_sentence'].astype(str),
            self.df['hindi_sentence'].astype(str),
        )
        for en_sent, hi_sent in sentence_pairs:
            # Lower-case here as well, so the filter measures exactly the
            # sequence that __getitem__ returns.
            en_ids = list(self.english_tokenizer.encode(en_sent.lower()).ids)
            hi_ids = list(self.hi_tokenizer.encode(hi_sent).ids)

            # +2 accounts for the [SOS] and [EOS] tokens added around each sequence.
            if len(en_ids) + 2 <= self.max_length and len(hi_ids) + 2 <= self.max_length:
                self.english_sentences.append(en_sent)
                self.hindi_sentences.append(hi_sent)
                self._english_ids.append(en_ids)
                self._hindi_ids.append(hi_ids)

    def pad_sequence(self, seq, pad_id):
        """Return `seq` truncated or right-padded with `pad_id` to `max_length` items."""
        if len(seq) > self.max_length:
            return seq[:self.max_length]
        return seq + [pad_id] * (self.max_length - len(seq))

    def __getitem__(self, index):
        """Return `(encoder_input, decoder_input, decoder_output)` tensors for one pair.

        * encoder_input:  [SOS] + english ids + [EOS], padded.
        * decoder_input:  [SOS] + hindi ids, padded.
        * decoder_output: hindi ids + [EOS], padded (decoder input shifted by one).
        """
        en_ids = self._english_ids[index]
        hi_ids = self._hindi_ids[index]

        encoder_input = self.pad_sequence(
            [self.sos_id_en] + en_ids + [self.eos_id_en], self.pad_id_en
        )
        decoder_input = self.pad_sequence([self.sos_id_hi] + hi_ids, self.pad_id_hi)
        decoder_output = self.pad_sequence(hi_ids + [self.eos_id_hi], self.pad_id_hi)

        return torch.tensor(encoder_input), torch.tensor(decoder_input), torch.tensor(decoder_output)

    def __len__(self):
        """Number of sentence pairs that passed the length filter."""
        return len(self.english_sentences)


In [6]:
# Build the length-filtered dataset and the training loader.
dataset = TranslationDataset(path+"/newdata.csv",english_tokenizer,hi_tokenizer,max_length=MAX_LEN)
# shuffle=True: this loader is only used for training, and without shuffling
# every epoch would replay the batches in the CSV's original row order.
dataloader = DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True)

# Model Architecture (using Rotary Positional Embeddings (RoPE), not the sinusoidal sin/cos encoding)

In [7]:
class TranslationModel(nn.Module):
    """Encoder-decoder translation network built from stacked Encoder/Decoder layers.

    Source and target tokens get separate embedding tables. RoPE angles are
    precomputed once (on the global DEVICE) and the same object is handed to
    every encoder and decoder layer. A two-layer MLP maps the final decoder
    output to `out_vocab_size` scores.
    """

    def __init__(self, emb_dim,hidden_dim,n_heads,max_len,num_encoders,num_decoders,input_vocab_size,out_vocab_size):
        super().__init__()
        # One shared set of rotary angles for all layers.
        angles = RoPE(emb_dim=emb_dim, num_heads=n_heads, seq_len=max_len, device=DEVICE).precompute_angles()
        layer_kwargs = dict(
            emb_dim=emb_dim,
            hidden_dim=hidden_dim,
            n_heads=n_heads,
            RoPE=True,
            RoPE_Precomputed_Angles=angles,
        )

        self.en_embedding = nn.Embedding(input_vocab_size, embedding_dim=emb_dim)
        self.hi_embedding = nn.Embedding(out_vocab_size, embedding_dim=emb_dim)

        encoder_stack = [Encoder(**layer_kwargs) for _ in range(num_encoders)]
        self.encoders = nn.ModuleList(encoder_stack)

        decoder_stack = [Decoder(**layer_kwargs) for _ in range(num_decoders)]
        self.decoders = nn.ModuleList(decoder_stack)

        self.generation_head = nn.Sequential(
            nn.Linear(emb_dim, emb_dim),
            nn.ReLU(),
            nn.Linear(emb_dim, out_vocab_size),
        )

    def forward(self,encoder_input , decoder_input):
        """Return the generation head's scores for the given source/target id tensors.

        `encoder_input` and `decoder_input` are integer token ids looked up in
        the English and Hindi embedding tables respectively.
        """
        # Run the source through every encoder layer in order.
        memory = self.en_embedding(encoder_input)
        for layer in self.encoders:
            memory = layer(memory)

        # Each decoder layer receives the running target state and the encoder output.
        hidden = self.hi_embedding(decoder_input)
        for layer in self.decoders:
            hidden = layer(hidden, memory)

        return self.generation_head(hidden)

In [8]:
# Instantiate the translation model with 6 encoder and 6 decoder layers and
# move it to the selected device.
model = TranslationModel(
        emb_dim=EMB_DIM,
        hidden_dim=EMB_DIM,
        n_heads=NUM_HEADS,
        max_len=MAX_LEN,
        num_encoders=6,
        num_decoders=6,
        input_vocab_size=english_tokenizer.get_vocab_size(),
        out_vocab_size=hi_tokenizer.get_vocab_size()
    )
model = model.to(DEVICE)

# Targets are right-padded with [PAD] up to MAX_LEN. Without ignore_index the
# padded positions would be scored like real tokens and dominate the loss for
# short sentences, so they are excluded from the average.
criterion = nn.CrossEntropyLoss(ignore_index=hi_tokenizer.token_to_id("[PAD]"))


In [9]:
# Adam over all model parameters.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-5
)
# Cut the learning rate by 10x when the monitored loss fails to improve for
# more than `patience` epochs, never going below min_lr.
# The deprecated `verbose=True` argument is dropped: the training loop already
# prints `scheduler.get_last_lr()` on every batch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,factor=0.1,patience=1,min_lr=1e-6
)



## 25 epochs at lr 1e-4 and 25 at 1e-6

In [13]:
"""
Training loop
"""

EPOCHS=100
CHECKPOINT_PATH = "checkpoints/3_LanguageTranslation.pt"
PLOT_EVERY = 100  # refresh the live loss plot every this many batches

if os.path.exists(CHECKPOINT_PATH):
    # A trained checkpoint already exists: load it instead of retraining.
    model.load_state_dict(torch.load(CHECKPOINT_PATH,map_location=DEVICE))
else:
    losses=[]
    loader_len = len(dataloader)
    model.train()
    plt.ion()
    for epoch in range(EPOCHS):
        running_loss=0.0
        for batch_num,batch in enumerate(dataloader):
            optimizer.zero_grad()
            encoder_input, decoder_input, decoder_out = (tensor.to(DEVICE) for tensor in batch)

            out = model(encoder_input,decoder_input)
            # Flatten all positions of all sequences so the criterion sees one
            # row of scores per target token.
            loss = criterion(out.reshape(-1, out.shape[-1]), decoder_out.reshape(-1))
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            losses.append(batch_loss)
            running_loss+=batch_loss

            if batch_num % PLOT_EVERY == 0 or batch_num == loader_len - 1:
                clear_output(wait=True)
                fig, ax = plt.subplots(figsize=(10, 4))
                ax.plot(losses, label='Batch Loss', alpha=0.7)

                if len(losses) > 10:
                    ewma = pd.Series(losses).ewm(span=200).mean()
                    ax.plot(ewma, label='Smoothed Loss (EWMA)', color='red')

                ax.set_xlabel('Batch')
                ax.set_ylabel('Loss')
                ax.set_title(f'Epoch {epoch+1}/{EPOCHS}')
                ax.legend()
                ax.grid(True)
                plt.show()
                # Release the figure; otherwise one figure per refresh piles up
                # over a long training run.
                plt.close(fig)

            print(f"\rEpoch {epoch+1}/{EPOCHS} Batch {batch_num}/{loader_len} loss {batch_loss:.6f} last lr : {scheduler.get_last_lr()}", end='  ', flush=True)

        # Epoch-level mean, computed once per epoch, drives the plateau scheduler.
        average_train_loss = running_loss / max(loader_len, 1)
        scheduler.step(average_train_loss)

    # Persist the trained weights so the guard above, and the quantization
    # cell further down, can find the checkpoint on the next run.
    os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
    torch.save(model.state_dict(), CHECKPOINT_PATH)
        

# Inference on FP32 model

In [14]:
def pad_sequence(seq, pad_id, max_len=None):
    """Return a copy of `seq` that is exactly `max_len` items long.

    Args:
        seq: List of token ids.
        pad_id: Id appended on the right when `seq` is too short.
        max_len: Target length; defaults to the notebook-wide MAX_LEN.

    Returns:
        A new list: `seq` truncated to `max_len` items if it is longer,
        otherwise `seq` right-padded with `pad_id`. `seq` itself is not modified.
    """
    if max_len is None:
        max_len = MAX_LEN
    if len(seq) > max_len:
        return seq[:max_len]
    return seq + [pad_id] * (max_len - len(seq))


texts = [
    "How are you?",
    "What is your name?",
    "Where do you live?",
    "I am hungry.",
    "Please help me.",
    "Thank you very much.",
    "I love my family.",
    "It is raining today.",
    "I like to read books.",
    "The sun is shining.",
    "Can you speak English?",
    "I am learning Hindi.",
    "Where is the school?",
    "This is my friend.",
    "I want some water.",
    "The sky is blue.",
    "He is my brother.",
    "She is very kind.",
    "I have a cat.",
    "Let’s go to the market."
]

# Switch to inference mode so train-only behaviour in the layers is disabled.
model.eval()

# Special-token ids are loop-invariant, so look them up once.
en_sos_id = english_tokenizer.token_to_id("[SOS]")
en_eos_id = english_tokenizer.token_to_id("[EOS]")
en_pad_id = english_tokenizer.token_to_id("[PAD]")
hi_sos_id = hi_tokenizer.token_to_id("[SOS]")
hi_eos_id = hi_tokenizer.token_to_id("[EOS]")

for text in texts:
    print(f"English Sentence : {text}")
    # The English tokenizer was trained on lower-cased text.
    source_ids = [en_sos_id] + english_tokenizer.encode(text.lower()).ids + [en_eos_id]
    encoder_input = torch.tensor(pad_sequence(source_ids, en_pad_id)).unsqueeze(0).to(DEVICE)

    # Greedy decoding: start from [SOS] and append the best-scoring token each
    # step. The step limit keeps the decoder input within MAX_LEN tokens.
    decoder_in = [hi_sos_id]
    print("Hindi Translation : ",end='')
    for _ in range(MAX_LEN):
        decoder_input = torch.tensor(decoder_in).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            out = model(encoder_input, decoder_input)

        # Plain Python int, so the id list stays homogeneous and the tokenizer
        # receives an int rather than a 0-dim tensor.
        next_id = int(out[0, -1].argmax(dim=-1).item())
        if next_id == hi_eos_id:
            break
        decoder_in.append(next_id)
        print(hi_tokenizer.id_to_token(next_id),end=' ')
    print()



English Sentence : How are you?
Hindi Translation : आप कैसे हैं ? 
English Sentence : What is your name?
Hindi Translation : आपका नाम क्या है ? 
English Sentence : Where do you live?
Hindi Translation : यहाँ कहाँ रहते हैं ? 
English Sentence : I am hungry.
Hindi Translation : मैं भूख हो गया हूँ । 
English Sentence : Please help me.
Hindi Translation : मेरी मदद करें । 
English Sentence : Thank you very much.
Hindi Translation : बहुत बहुत धन्यवाद ! 
English Sentence : I love my family.
Hindi Translation : मुझे अपने परिवार की याद [UNK] । 
English Sentence : It is raining today.
Hindi Translation : आज इस बात का [UNK] है । 
English Sentence : I like to read books.
Hindi Translation : मैं पढ़ पढ़ना पसंद करती हूँ । 
English Sentence : The sun is shining.
Hindi Translation : सूर्य [UNK] है । 
English Sentence : Can you speak English?
Hindi Translation : क्या आप अंग्रेजी पढ़ सकते हैं ? 
English Sentence : I am learning Hindi.
Hindi Translation : मैं हिन्दी बोलने [UNK] हूँ । 
English Sentence : 

# Model size is around 150 MB -> quantizing to INT8 (67 MB)

Not much size reduction, since only the `nn.Linear` layers are quantized here. Next time, use quantization-aware training (QAT) instead of just quantizing the linear layers.

In [15]:
import torch
from torch.quantization import quantize_dynamic
# Rebuild the FP32 architecture, load the trained weights (mapped to the CPU),
# and produce a dynamically quantized copy whose nn.Linear layers use int8.
fp32_hparams = dict(
    emb_dim=EMB_DIM,
    hidden_dim=EMB_DIM,
    n_heads=NUM_HEADS,
    max_len=MAX_LEN,
    num_encoders=6,
    num_decoders=6,
    input_vocab_size=english_tokenizer.get_vocab_size(),
    out_vocab_size=hi_tokenizer.get_vocab_size(),
)
model = TranslationModel(**fp32_hparams)

fp32_weights = torch.load("checkpoints/3_LanguageTranslation.pt",map_location='cpu')
model.load_state_dict(fp32_weights)
model.eval()

quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Only the state dict is stored; loading it later requires rebuilding the same
# quantized module structure first.
int8_weights = quantized_model.state_dict()
torch.save(int8_weights, "checkpoints/3_LanguageTranslationINT8.pt")


# Inference with the INT8 model

In [None]:
# Recreate the quantized module structure from a freshly initialised model,
# then fill it with the saved INT8 state dict.
int8_hparams = dict(
    emb_dim=EMB_DIM,
    hidden_dim=EMB_DIM,
    n_heads=NUM_HEADS,
    max_len=MAX_LEN,
    num_encoders=6,
    num_decoders=6,
    input_vocab_size=english_tokenizer.get_vocab_size(),
    out_vocab_size=hi_tokenizer.get_vocab_size(),
)
model = TranslationModel(**int8_hparams)
model.eval()

# Quantize first so the module layout matches the keys in the INT8 checkpoint.
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

int8_weights = torch.load("checkpoints/3_LanguageTranslationINT8.pt", map_location="cpu")
quantized_model.load_state_dict(int8_weights)
quantized_model.eval()

def pad_sequence(seq, pad_id, max_len=None):
    """Return a copy of `seq` that is exactly `max_len` items long.

    Args:
        seq: List of token ids.
        pad_id: Id appended on the right when `seq` is too short.
        max_len: Target length; defaults to the notebook-wide MAX_LEN.

    Returns:
        A new list: `seq` truncated to `max_len` items if it is longer,
        otherwise `seq` right-padded with `pad_id`. `seq` itself is not modified.
    """
    if max_len is None:
        max_len = MAX_LEN
    if len(seq) > max_len:
        return seq[:max_len]
    return seq + [pad_id] * (max_len - len(seq))


texts = [
    "How are you?",
    "What is your name?",
    "Where do you live?",
    "I am hungry.",
    "Please help me.",
    "Thank you very much.",
    "I love my family.",
    "It is raining today.",
    "I like to read books.",
    "The sun is shining.",
    "Can you speak English?",
    "I am learning Hindi.",
    "Where is the school?",
    "This is my friend.",
    "I want some water.",
    "The sky is blue.",
    "He is my brother.",
    "She is very kind.",
    "I have a cat.",
    "Let’s go to the market."
]

# The dynamically quantized model was built and loaded on the CPU, so inputs
# must stay on the CPU too; sending them to DEVICE would mismatch whenever
# DEVICE is 'cuda'.
# NOTE(review): TranslationModel builds its RoPE angles with the global DEVICE;
# if that is 'cuda' the angles may need moving to the CPU as well - confirm.
INT8_DEVICE = "cpu"

# Special-token ids are loop-invariant, so look them up once.
en_sos_id = english_tokenizer.token_to_id("[SOS]")
en_eos_id = english_tokenizer.token_to_id("[EOS]")
en_pad_id = english_tokenizer.token_to_id("[PAD]")
hi_sos_id = hi_tokenizer.token_to_id("[SOS]")
hi_eos_id = hi_tokenizer.token_to_id("[EOS]")

for text in texts:
    print(f"English Sentence : {text}")
    # The English tokenizer was trained on lower-cased text.
    source_ids = [en_sos_id] + english_tokenizer.encode(text.lower()).ids + [en_eos_id]
    encoder_input = torch.tensor(pad_sequence(source_ids, en_pad_id)).unsqueeze(0).to(INT8_DEVICE)

    # Greedy decoding: start from [SOS] and append the best-scoring token each
    # step. The step limit keeps the decoder input within MAX_LEN tokens.
    decoder_in = [hi_sos_id]
    print("Hindi Translation : ",end='')
    for _ in range(MAX_LEN):
        decoder_input = torch.tensor(decoder_in).unsqueeze(0).to(INT8_DEVICE)
        with torch.no_grad():
            out = quantized_model(encoder_input, decoder_input)

        # Plain Python int, so the id list stays homogeneous and the tokenizer
        # receives an int rather than a 0-dim tensor.
        next_id = int(out[0, -1].argmax(dim=-1).item())
        if next_id == hi_eos_id:
            break
        decoder_in.append(next_id)
        print(hi_tokenizer.id_to_token(next_id),end=' ')
    print()



  device=storage.device,


English Sentence : How are you?
Hindi Translation : आप कौन हैं ? 
English Sentence : What is your name?
Hindi Translation : आपका नाम क्या है ? 
English Sentence : Where do you live?
Hindi Translation : यहाँ कहाँ रहते हैं ? 
English Sentence : I am hungry.
Hindi Translation : मैं [UNK] महसूस कर रही हूँ । 
English Sentence : Please help me.
Hindi Translation : मेरी मदद करें । 
English Sentence : Thank you very much.
Hindi Translation : बहुत बहुत धन्यवाद । 
English Sentence : I love my family.
Hindi Translation : मुझे अपने परिवार की याद [UNK] । 
English Sentence : It is raining today.
Hindi Translation : आज इस बात पर [UNK] है । 
English Sentence : I like to read books.
Hindi Translation : मैं पढ़ पढ़ना पसंद करती हूँ । 
English Sentence : The sun is shining.
Hindi Translation : सूर्य [UNK] है । 
English Sentence : Can you speak English?
Hindi Translation : क्या आप अंग्रेजी पढ़ सकते हैं ? 
English Sentence : I am learning Hindi.
Hindi Translation : मैं हिन्दी बोलने [UNK] हूँ । 
English Sent