In [51]:
import pandas as pd
# Load the parallel corpus; later cells read its 'English' and 'Tamil' columns.
df = pd.read_csv('translation.csv')
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Index,English,Tamil
0,1,Let's try something.,ஏதாவது முயற்சி செய்வோம்.
1,2,I have to go to sleep.,நான் தூங்கப் போக வேண்டும்.
2,3,Today is June 18th and it is Muiriel's birthday!,இன்று ஜூன் 18 மற்றும் அது Muiriel பிறந்த நாள்!
3,4,Muiriel is 20 now.,முய்ரியலுக்கு இப்போது 20 வயதாகிறது.
4,5,"The password is ""Muiriel"".","கடவுச்சொல் ""Muiriel""."


In [52]:
import re

def clean_text(text):
    """Strip every character that is not an ASCII letter, a digit, whitespace,
    or a code point in the Tamil Unicode block (U+0B80-U+0BFF).

    Removed characters are deleted outright rather than replaced by a space,
    so "Let's" becomes "Lets".
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\u0B80-\u0BFF\s]')
    return disallowed.sub('', text)

# Normalise both language columns with the same cleaning rule
for column in ('English', 'Tamil'):
    df[column] = df[column].apply(clean_text)

df

Unnamed: 0,Index,English,Tamil
0,1,Lets try something,ஏதாவது முயற்சி செய்வோம்
1,2,I have to go to sleep,நான் தூங்கப் போக வேண்டும்
2,3,Today is June 18th and it is Muiriels birthday,இன்று ஜூன் 18 மற்றும் அது Muiriel பிறந்த நாள்
3,4,Muiriel is 20 now,முய்ரியலுக்கு இப்போது 20 வயதாகிறது
4,5,The password is Muiriel,கடவுச்சொல் Muiriel
...,...,...,...
497,498,The witnesses were able to refute the false te...,சந்தேக நபரின் பொய் சாட்சியத்தை சாட்சிகளால் மறு...
498,499,The guild leader relegated Vince to a lesser o...,கில்ட் தலைவர் வின்ஸை ஒரு குறைந்த அதிகாரியாக மா...
499,500,Joan of Arc refused to renounce her belief tha...,ஜோன் ஆஃப் ஆர்க் தான் கேட்ட குரல் கடவுளிடமிருந்...
500,501,Shocked by the events of September 11th politi...,செப்டம்பர் 11 சம்பவங்களால் அதிர்ச்சியடைந்த உலக...


In [53]:
# Define a function to check if any English character is present in the Tamil text
def contains_english(text):
    """Return True when `text` holds at least one ASCII letter (a-z or A-Z)."""
    first_letter = re.search(r'[a-zA-Z]', text)
    return first_letter is not None

# Keep only rows whose Tamil text has no ASCII letters (i.e. no untranslated
# English words left in the translation).
# The explicit .copy() makes the result an independent frame: later cells add
# columns to `df`, and assigning into a filtered slice would otherwise raise
# pandas' SettingWithCopyWarning.
df = df[~df['Tamil'].apply(contains_english)].copy()
df

Unnamed: 0,Index,English,Tamil
0,1,Lets try something,ஏதாவது முயற்சி செய்வோம்
1,2,I have to go to sleep,நான் தூங்கப் போக வேண்டும்
3,4,Muiriel is 20 now,முய்ரியலுக்கு இப்போது 20 வயதாகிறது
5,6,I will be back soon,விரைவில் திரும்பி வருவேன்
6,7,Im at a loss for words,எனக்கு வார்த்தைகள் வரவில்லை
...,...,...,...
497,498,The witnesses were able to refute the false te...,சந்தேக நபரின் பொய் சாட்சியத்தை சாட்சிகளால் மறு...
498,499,The guild leader relegated Vince to a lesser o...,கில்ட் தலைவர் வின்ஸை ஒரு குறைந்த அதிகாரியாக மா...
499,500,Joan of Arc refused to renounce her belief tha...,ஜோன் ஆஃப் ஆர்க் தான் கேட்ட குரல் கடவுளிடமிருந்...
500,501,Shocked by the events of September 11th politi...,செப்டம்பர் 11 சம்பவங்களால் அதிர்ச்சியடைந்த உலக...


In [54]:
import pandas as pd

class WordLevelTokenizer:
    """Whitespace-splitting word-level tokenizer with an incrementally built vocabulary.

    IDs are assigned in first-seen order starting at 0; special tokens are
    registered first, so their IDs follow their position in `special_tokens`.
    """

    def __init__(self, special_tokens=None, unk_token="<UNK>"):
        """
        Args:
            special_tokens: Tokens to register before any corpus word
                (None means no special tokens).
            unk_token: Token whose ID stands in for out-of-vocabulary words.
                It is only usable if it is part of the vocabulary, e.g. via
                `special_tokens`.
        """
        self.word_to_id = {}
        self.id_to_word = {}
        self.special_tokens = list(special_tokens or [])
        self.unk_token = unk_token
        self.build_vocab(self.special_tokens)

    def build_vocab(self, tokens):
        """Register every token of `tokens`, skipping ones already known."""
        for token in tokens:
            self.add_token(token)

    def add_token(self, token):
        """Give `token` the next free ID unless it is already in the vocabulary."""
        if token not in self.word_to_id:
            idx = len(self.word_to_id)
            self.word_to_id[token] = idx
            self.id_to_word[idx] = token

    def tokenize(self, text):
        """Split `text` on runs of whitespace."""
        return text.split()

    def encode(self, text):
        """Map `text` to a list of vocabulary IDs.

        Unknown words map to the ID of `unk_token`.

        Raises:
            KeyError: if a word is unknown and `unk_token` is not in the
                vocabulary. Previously this case silently produced `None`
                entries in the returned list.
        """
        unk_id = self.word_to_id.get(self.unk_token)
        ids = []
        for token in self.tokenize(text):
            token_id = self.word_to_id.get(token, unk_id)
            if token_id is None:
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and no "
                    f"{self.unk_token!r} token is registered"
                )
            ids.append(token_id)
        return ids

    def decode(self, token_ids):
        """Join the words for `token_ids` with single spaces.

        Each ID is passed through `int()` so that NumPy scalars and
        single-element tensors look up correctly. IDs not in the vocabulary
        render as `unk_token`.
        """
        return " ".join(self.id_to_word.get(int(idx), self.unk_token) for idx in token_ids)

    def fit_on_texts(self, texts):
        """Extend the vocabulary with every word found in the iterable `texts`."""
        for text in texts:
            self.build_vocab(self.tokenize(text))

    def vocab_size(self):
        """Number of distinct tokens, special tokens included."""
        return len(self.word_to_id)

In [55]:
# Initialize tokenizers with special tokens.
# List order fixes the IDs (add_token numbers tokens in insertion order):
# <UNK>=0, <PAD>=1, <SOS>=2, <EOS>=3. Later cells hard-code 1/2/3 accordingly.
special_tokens = ["<UNK>", "<PAD>", "<SOS>", "<EOS>"]
# One vocabulary per language; both start from the same special-token IDs.
tamil_tokenizer = WordLevelTokenizer(special_tokens=special_tokens)
english_tokenizer = WordLevelTokenizer(special_tokens=special_tokens)

In [56]:
# Build the Tamil vocabulary from the cleaned 'Tamil' column
tamil_tokenizer.fit_on_texts(df['Tamil'])
print("Vocabulary:", tamil_tokenizer.vocab_size())

Vocabulary: 1917


In [57]:
# Build the English vocabulary from the cleaned 'English' column
english_tokenizer.fit_on_texts(df['English'])
print("Vocabulary:", english_tokenizer.vocab_size())

Vocabulary: 1442


In [58]:
# Encode every sentence into a list of vocabulary IDs, one new column per language
df['Tokenized_Tamil'] = df['Tamil'].apply(tamil_tokenizer.encode)
df['Tokenized_English'] = df['English'].apply(english_tokenizer.encode)
# Show the frame with both tokenized columns
df

Unnamed: 0,Index,English,Tamil,Tokenized_Tamil,Tokenized_English
0,1,Lets try something,ஏதாவது முயற்சி செய்வோம்,"[4, 5, 6]","[4, 5, 6]"
1,2,I have to go to sleep,நான் தூங்கப் போக வேண்டும்,"[7, 8, 9, 10]","[7, 8, 9, 10, 9, 11]"
3,4,Muiriel is 20 now,முய்ரியலுக்கு இப்போது 20 வயதாகிறது,"[11, 12, 13, 14]","[12, 13, 14, 15]"
5,6,I will be back soon,விரைவில் திரும்பி வருவேன்,"[15, 16, 17]","[7, 16, 17, 18, 19]"
6,7,Im at a loss for words,எனக்கு வார்த்தைகள் வரவில்லை,"[18, 19, 20]","[20, 21, 22, 23, 24, 25]"
...,...,...,...,...,...
497,498,The witnesses were able to refute the false te...,சந்தேக நபரின் பொய் சாட்சியத்தை சாட்சிகளால் மறு...,"[1752, 1885, 1886, 1887, 1888, 1889, 1890]","[135, 1202, 308, 866, 9, 1415, 41, 1416, 1417,..."
498,499,The guild leader relegated Vince to a lesser o...,கில்ட் தலைவர் வின்ஸை ஒரு குறைந்த அதிகாரியாக மா...,"[1891, 1892, 1893, 29, 1894, 1895, 1896, 1430,...","[135, 1418, 1238, 1419, 1420, 9, 22, 1421, 142..."
499,500,Joan of Arc refused to renounce her belief tha...,ஜோன் ஆஃப் ஆர்க் தான் கேட்ட குரல் கடவுளிடமிருந்...,"[1899, 515, 1900, 721, 1901, 1902, 1903, 349, ...","[1424, 129, 1425, 1426, 9, 1427, 291, 1319, 12..."
500,501,Shocked by the events of September 11th politi...,செப்டம்பர் 11 சம்பவங்களால் அதிர்ச்சியடைந்த உலக...,"[1906, 1907, 1908, 1909, 1405, 758, 1540, 1910...","[1431, 941, 41, 1432, 129, 1433, 1434, 1435, 5..."


In [59]:
# Sanity check on the Tamil encodings:
#   z = number of token IDs equal to 0 (ID 0 is <UNK> given the special-token order)
#   t = total number of Tamil tokens
z = sum(token_ids.count(0) for token_ids in df['Tokenized_Tamil'])
t = sum(len(token_ids) for token_ids in df['Tokenized_Tamil'])
print(z)
print(t)

0
3444


In [60]:
# Row positions (not index labels) whose Tamil sentence is longer than 16 tokens
tamil_idx = [
    position
    for position, token_ids in enumerate(df['Tokenized_Tamil'])
    if len(token_ids) > 16
]

print(len(tamil_idx))

df.shape

11


(499, 5)

In [61]:
# tamil_idx holds positions, so map them to index labels before dropping,
# then renumber the surviving rows from 0.
df = df.drop(index=df.index[tamil_idx]).reset_index(drop=True)

df.shape

(488, 5)

In [62]:

# Row positions (not index labels) whose English sentence is longer than 16 tokens
english_idx = [
    position
    for position, token_ids in enumerate(df['Tokenized_English'])
    if len(token_ids) > 16
]

print(len(english_idx))

df.shape

43


(488, 5)

In [63]:
# english_idx holds positions, so map them to index labels before dropping,
# then renumber the surviving rows from 0.
df = df.drop(index=df.index[english_idx]).reset_index(drop=True)

df.shape

(445, 5)

In [64]:
import numpy as np

# Fixed length of every padded sequence: sentences were capped at 16 tokens
# above, and up to two special tokens (<SOS>, <EOS>) are added around them.
max_pad = 18
# IDs follow the special_tokens order used to build the vocabularies.
cls_token = 2  # <SOS>
sep_token = 3  # <EOS>

# Function to pad sequences
def pad_sequence_source(tokens, max_len, cls_token=2, sep_token=3, pad_token=1):
    """Frame an encoder input as [cls] + tokens + [sep] and fit it to `max_len`.

    Args:
        tokens: Iterable of token IDs for one sentence (not modified).
        max_len: Exact length of the returned list.
        cls_token: ID placed at the start of the sequence.
        sep_token: ID placed right after the last real token.
        pad_token: ID used to fill the sequence up to `max_len`.

    Returns:
        A new list of exactly `max_len` IDs.

    Note:
        A framed sequence longer than `max_len` is cut from the right, which
        drops `sep_token` (and possibly trailing words).
    """
    framed = [cls_token, *tokens, sep_token][:max_len]
    return framed + [pad_token] * (max_len - len(framed))

def pad_sequence_target(tokens, max_len, cls_token=2, pad_token=1):
    """Frame a decoder input as [cls] + tokens and fit it to `max_len`.

    No end-of-sequence ID is appended here: only the start token is prepended
    before padding.

    Args:
        tokens: Iterable of token IDs for one sentence (not modified).
        max_len: Exact length of the returned list.
        cls_token: ID placed at the start of the sequence.
        pad_token: ID used to fill the sequence up to `max_len`.

    Returns:
        A new list of exactly `max_len` IDs; longer inputs are cut from the right.
    """
    framed = [cls_token, *tokens][:max_len]
    return framed + [pad_token] * (max_len - len(framed))

def pad_sequence_label(tokens, max_len, sep_token=3, pad_token=1):
    """Frame a label sequence as tokens + [sep] and fit it to `max_len`.

    No start token is prepended: only the separator is appended before padding.

    Args:
        tokens: Iterable of token IDs for one sentence (not modified).
        max_len: Exact length of the returned list.
        sep_token: ID appended right after the last real token.
        pad_token: ID used to fill the sequence up to `max_len`.

    Returns:
        A new list of exactly `max_len` IDs.

    Note:
        A framed sequence longer than `max_len` is cut from the right, which
        drops `sep_token`.
    """
    framed = [*tokens, sep_token][:max_len]
    return framed + [pad_token] * (max_len - len(framed))

# Build the three fixed-length views of every sentence pair:
#   Padded_English      -> <SOS> + English IDs + <EOS> + padding
#   Padded_Tamil        -> <SOS> + Tamil IDs + padding
#   Padded_Tamil_Target -> Tamil IDs + <EOS> + padding
df['Padded_English'] = df['Tokenized_English'].apply(
    lambda ids: pad_sequence_source(ids, max_pad, cls_token, sep_token)
)
df['Padded_Tamil'] = df['Tokenized_Tamil'].apply(
    lambda ids: pad_sequence_target(ids, max_pad, cls_token)
)
df['Padded_Tamil_Target'] = df['Tokenized_Tamil'].apply(
    lambda ids: pad_sequence_label(ids, max_pad, sep_token)
)
# Display the frame to verify the new columns
df

Unnamed: 0,Index,English,Tamil,Tokenized_Tamil,Tokenized_English,Padded_English,Padded_Tamil,Padded_Tamil_Target
0,1,Lets try something,ஏதாவது முயற்சி செய்வோம்,"[4, 5, 6]","[4, 5, 6]","[2, 4, 5, 6, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 4, 5, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4, 5, 6, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2,I have to go to sleep,நான் தூங்கப் போக வேண்டும்,"[7, 8, 9, 10]","[7, 8, 9, 10, 9, 11]","[2, 7, 8, 9, 10, 9, 11, 3, 1, 1, 1, 1, 1, 1, 1...","[2, 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[7, 8, 9, 10, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,4,Muiriel is 20 now,முய்ரியலுக்கு இப்போது 20 வயதாகிறது,"[11, 12, 13, 14]","[12, 13, 14, 15]","[2, 12, 13, 14, 15, 3, 1, 1, 1, 1, 1, 1, 1, 1,...","[2, 11, 12, 13, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[11, 12, 13, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
3,6,I will be back soon,விரைவில் திரும்பி வருவேன்,"[15, 16, 17]","[7, 16, 17, 18, 19]","[2, 7, 16, 17, 18, 19, 3, 1, 1, 1, 1, 1, 1, 1,...","[2, 15, 16, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[15, 16, 17, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,7,Im at a loss for words,எனக்கு வார்த்தைகள் வரவில்லை,"[18, 19, 20]","[20, 21, 22, 23, 24, 25]","[2, 20, 21, 22, 23, 24, 25, 3, 1, 1, 1, 1, 1, ...","[2, 18, 19, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[18, 19, 20, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...,...,...
440,490,Blind people sometimes develop a compensatory ...,பார்வையற்றவர்கள் சில நேரங்களில் தங்களைச் சுற்ற...,"[1824, 70, 71, 1825, 855, 1826, 1827, 1491, 18...","[1367, 160, 1368, 1369, 22, 1370, 1228, 9, 137...","[2, 1367, 160, 1368, 1369, 22, 1370, 1228, 9, ...","[2, 1824, 70, 71, 1825, 855, 1826, 1827, 1491,...","[1824, 70, 71, 1825, 855, 1826, 1827, 1491, 18..."
441,491,A miser hoards money not because he is prudent...,ஒரு கஞ்சன் பணத்தைப் பதுக்கி வைப்பது அவன் விவேக...,"[29, 1832, 1833, 1834, 661, 159, 1835, 467, 20...","[884, 1374, 1375, 205, 89, 75, 82, 13, 1376, 2...","[2, 884, 1374, 1375, 205, 89, 75, 82, 13, 1376...","[2, 29, 1832, 1833, 1834, 661, 159, 1835, 467,...","[29, 1832, 1833, 1834, 661, 159, 1835, 467, 20..."
442,498,The witnesses were able to refute the false te...,சந்தேக நபரின் பொய் சாட்சியத்தை சாட்சிகளால் மறு...,"[1752, 1885, 1886, 1887, 1888, 1889, 1890]","[135, 1202, 308, 866, 9, 1415, 41, 1416, 1417,...","[2, 135, 1202, 308, 866, 9, 1415, 41, 1416, 14...","[2, 1752, 1885, 1886, 1887, 1888, 1889, 1890, ...","[1752, 1885, 1886, 1887, 1888, 1889, 1890, 3, ..."
443,499,The guild leader relegated Vince to a lesser o...,கில்ட் தலைவர் வின்ஸை ஒரு குறைந்த அதிகாரியாக மா...,"[1891, 1892, 1893, 29, 1894, 1895, 1896, 1430,...","[135, 1418, 1238, 1419, 1420, 9, 22, 1421, 142...","[2, 135, 1418, 1238, 1419, 1420, 9, 22, 1421, ...","[2, 1891, 1892, 1893, 29, 1894, 1895, 1896, 14...","[1891, 1892, 1893, 29, 1894, 1895, 1896, 1430,..."


In [65]:
import pandas as pd

# Row position separating the first 80% of rows (train) from the last 20% (test)
threshold = int(len(df)*0.8)

# Sequential split: no shuffling, rows keep their current order
train, test = df.iloc[:threshold], df.iloc[threshold:]

print("Train DataFrame:")
print(len(train))
print("\nTest DataFrame:")
print(len(test))



Train DataFrame:
356

Test DataFrame:
89


In [66]:
import torch
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Serve pre-padded English/Tamil sentence pairs together with attention masks.

    `dataframe` must provide the columns "English", "Tamil", "Padded_English",
    "Padded_Tamil" and "Padded_Tamil_Target"; the three padded columns hold
    lists of token IDs.
    """

    def __init__(self, dataframe, pad_token=1):
        self.dataframe = dataframe
        self.pad_token = pad_token  # ID treated as padding when building masks

    def __len__(self):
        return len(self.dataframe)

    @staticmethod
    def _causal_mask(size):
        """Boolean (1, size, size) mask that is True on and below the main diagonal."""
        above_diagonal = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
        return above_diagonal == 0

    def __getitem__(self, idx):
        """Return the raw texts, token tensors and masks for the row at position `idx`.

        Returned keys:
            english, tamil:   the sentence strings.
            english_token:    LongTensor (T_english,)
            tamil_token:      LongTensor (T_tamil,)
            tamil_target:     LongTensor (T_tamil_target,)
            encoder_mask:     int tensor (1, 1, T_english), 1 where the token is not padding.
            decoder_mask:     int tensor (1, T_tamil, T_tamil), 1 where the key
                              position is not padding AND is not after the query position.
        """
        # Fetch the row once instead of re-indexing the frame for every field.
        row = self.dataframe.iloc[idx]

        english_tokens = torch.tensor(row["Padded_English"], dtype=torch.long)
        tamil_tokens = torch.tensor(row["Padded_Tamil"], dtype=torch.long)
        tamil_target_tokens = torch.tensor(row["Padded_Tamil_Target"], dtype=torch.long)

        encoder_mask = (english_tokens != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        # (1, T) padding mask broadcast against the (1, T, T) causal mask.
        decoder_mask = (
            (tamil_tokens != self.pad_token).unsqueeze(0).int()
            & self._causal_mask(tamil_tokens.size(0))
        )

        # The tensors above are freshly built on every call, so no defensive
        # copies are needed before handing them out.
        return {
            "english": row["English"],
            "tamil": row["Tamil"],
            "english_token": english_tokens,
            "tamil_token": tamil_tokens,
            "tamil_target": tamil_target_tokens,
            "encoder_mask": encoder_mask,
            "decoder_mask": decoder_mask,
        }

In [73]:
from torch.utils.data import DataLoader

# Wrap both splits in DataLoaders. The test loader must keep batch_size=1
# because run_validation asserts a batch size of 1.
train_dataset = TranslationDataset(train)
train_dataloader = DataLoader(train_dataset, batch_size = 1, shuffle=True)
test_dataset = TranslationDataset(test)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)


def preview_first_batch(dataloader):
    """Print the first batch drawn from `dataloader` as a quick sanity check.

    Most labels say "shape" but the lines print the first element's contents;
    only the 'tamil token' line prints an actual shape. Does nothing when the
    loader is empty.
    """
    for batch_idx, batch in enumerate(dataloader):
        print(f"Batch {batch_idx + 1}:")

        print(f"  English sequence shape: {batch['english'][0]}")
        print(f"  Tamil sequence shape: {batch['tamil'][0]}")
        print(f"  English token shape: {batch['english_token'][0]}")
        print(f"  tamil token shape: {batch['tamil_token'].shape}")
        print(f"  Tamil target sequence shape: {batch['tamil_target'][0]}")
        print(f"  English pad mask shape: {batch['encoder_mask'][0]}")
        print(f"  Combined Tamil mask shape: {batch['decoder_mask'][0]}")
        break  # only the first batch is inspected


# One preview per split, replacing the two copy-pasted loops
preview_first_batch(train_dataloader)
preview_first_batch(test_dataloader)

Batch 1:
  English sequence shape: I didnt want this to happen
  Tamil sequence shape: இது நடக்க நான் விரும்பவில்லை
  English token shape: tensor([  2,   7, 327,  77,  50,   9,  80,   3,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1])
  tamil token shape: torch.Size([1, 18])
  Tamil target sequence shape: tensor([ 21, 524,   7,  68,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1])
  English pad mask shape: tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]],
       dtype=torch.int32)
  Combined Tamil mask shape: tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0, 0, 0, 

In [74]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [75]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch of embeddings.

    Even feature columns carry sin(pos * w_i), odd columns cos(pos * w_i),
    with w_i = 10000^(-2i / d_model). The table is stored as a buffer with
    shape (1, max_len, d_model).
    """

    def __init__(self, d_model, max_len=5000):
        """
        Args:
            d_model: Size of the feature dimension (even or odd).
            max_len: Number of positions precomputed; inputs to `forward`
                must not be longer than this along dimension 1.
        """
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns; slicing keeps the assignment shape-compatible.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return `x` plus the encodings of its first `x.size(1)` positions."""
        return x + self.pe[:, :x.size(1)]

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Queries, keys and values are passed separately, so the same module serves
    as self-attention (q = k = v) and as encoder-decoder attention.
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        """
        Args:
            d_model: Embedding size; must be divisible by `h`.
            h: Number of attention heads.
            dropout: Dropout probability applied to the attention weights.
        """
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"

        # Each head works on a d_k-sized slice of the projected vectors.
        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: Tensors whose last two dims are (seq_len, d_k).
            mask: Optional tensor broadcastable to the score matrix; positions
                where it equals 0 are filled with -1e9 before the softmax.
            dropout: Optional dropout module applied to the weights.

        Returns:
            (weighted values, attention weights after dropout).
        """
        scale = math.sqrt(query.shape[-1])
        scores = torch.matmul(query, key.transpose(-2, -1)) / scale
        if mask is not None:
            # -1e9 stands in for -inf so masked positions vanish after softmax
            scores.masked_fill_(mask == 0, -1e9)
        weights = torch.softmax(scores, dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        """(batch, seq_len, d_model) -> (batch, h, seq_len, d_k)."""
        batch, seq_len = projected.shape[0], projected.shape[1]
        return projected.view(batch, seq_len, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Project q/k/v, attend per head, merge the heads and apply `w_o`.

        The attention weights of the latest call are kept on
        `self.attention_scores` (e.g. for visualization).
        """
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        context, self.attention_scores = MultiHeadSelfAttention.attention(
            query, key, value, mask, self.dropout
        )

        # (batch, h, seq_len, d_k) -> (batch, seq_len, h * d_k)
        batch = context.shape[0]
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)
        return self.w_o(merged)

class FeedForward(nn.Module):
    """Two-layer MLP: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model)."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to d_ff, apply ReLU, project back to d_model."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and LayerNorm (post-norm)."""

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadSelfAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run `x` through both sub-layers; `mask` is forwarded to the attention."""
        # Sub-layer 1: self-attention (queries, keys and values are all x)
        x = self.norm1(x + self.dropout(self.attn(x, x, x, mask)))
        # Sub-layer 2: position-wise feed-forward
        return self.norm2(x + self.dropout(self.ff(x)))

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output, then feed-forward. Each sub-layer is followed by dropout, a
    residual connection and LayerNorm (post-norm)."""

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadSelfAttention(d_model, n_heads, dropout)
        self.enc_attn = MultiHeadSelfAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """
        Args:
            x: Decoder-side hidden states.
            enc_output: Encoder output used as keys and values in the second sub-layer.
            src_mask: Mask applied when attending over `enc_output`.
            tgt_mask: Mask applied in the decoder self-attention.
        """
        # Sub-layer 1: self-attention over the decoder states
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, tgt_mask)))
        # Sub-layer 2: queries from the decoder, keys/values from the encoder
        x = self.norm2(x + self.dropout(self.enc_attn(x, enc_output, enc_output, src_mask)))
        # Sub-layer 3: position-wise feed-forward
        return self.norm3(x + self.dropout(self.ff(x)))

class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of EncoderLayers."""

    def __init__(self, input_dim, d_model, n_heads, d_ff, n_layers, dropout=0.1):
        """
        Args:
            input_dim: Number of rows in the embedding table (source vocabulary size).
            d_model, n_heads, d_ff, dropout: Forwarded to every EncoderLayer.
            n_layers: Number of stacked EncoderLayers.
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)
        )

    def forward(self, src, src_mask):
        """Embed the token IDs in `src` and pass them through every layer in order."""
        hidden = self.positional_encoding(self.embedding(src))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask)
        return hidden

class Decoder(nn.Module):
    """Token embedding + positional encoding, a stack of DecoderLayers, and a
    final linear projection onto the target vocabulary."""

    def __init__(self, output_dim, d_model, n_heads, d_ff, n_layers, dropout=0.1):
        """
        Args:
            output_dim: Target vocabulary size; sizes both the embedding table
                and the output projection.
            d_model, n_heads, d_ff, dropout: Forwarded to every DecoderLayer.
            n_layers: Number of stacked DecoderLayers.
        """
        super().__init__()
        self.embedding = nn.Embedding(output_dim, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList(
            DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)
        )
        self.fc_out = nn.Linear(d_model, output_dim)

    def forward(self, tgt, enc_output, src_mask, tgt_mask):
        """Return the `fc_out` projection (unnormalised scores over the
        `output_dim` classes) for every position of `tgt`."""
        hidden = self.positional_encoding(self.embedding(tgt))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, enc_output, src_mask, tgt_mask)
        return self.fc_out(hidden)

class Transformer(nn.Module):
    """Encoder-decoder wrapper: encodes the source, then decodes the target against it."""

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, n_heads, d_ff, n_enc_layers, n_dec_layers, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, d_model, n_heads, d_ff, n_enc_layers, dropout)
        self.decoder = Decoder(tgt_vocab_size, d_model, n_heads, d_ff, n_dec_layers, dropout)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Return the decoder output for `tgt` conditioned on the encoded `src`."""
        memory = self.encoder(src, src_mask)
        return self.decoder(tgt, memory, src_mask, tgt_mask)

# Vocabulary sizes are read from the fitted tokenizers rather than hard-coded,
# so the embedding tables and output projection always cover every ID the
# tokenizers can produce (1442 English / 1917 Tamil for the current data).
src_vocab_size = english_tokenizer.vocab_size()
tgt_vocab_size = tamil_tokenizer.vocab_size()
d_model = 512      # embedding / hidden size
n_heads = 8        # attention heads (d_model must be divisible by this)
d_ff = 1024        # inner size of each feed-forward block
n_enc_layers = 8   # stacked encoder layers
n_dec_layers = 8   # stacked decoder layers
dropout = 0.1

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, n_heads, d_ff, n_enc_layers, n_dec_layers, dropout).to(device)


In [76]:
import torch

def count_parameters(model):
    """Return the total number of scalar elements across all parameters of `model`."""
    total = 0
    for parameter in model.parameters():
        total += parameter.numel()
    return total
# `model` is the Transformer instantiated in the previous cell.
# The count covers every parameter, whether or not it requires gradients.
total_params = count_parameters(model)
print(f"Total model parameters: {total_params}")

Total model parameters: 44711805


In [71]:
# Vocabulary sizes include the four special tokens
print("English Vocabulary:", english_tokenizer.vocab_size())
print("Tamil Vocabulary:", tamil_tokenizer.vocab_size())

English Vocabulary: 1442
Tamil Vocabulary: 1917


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LambdaLR
from tqdm import tqdm
import os
from pathlib import Path
import warnings
from torch.utils.data import random_split
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace



epochs = 30  # number of full passes over train_dataloader made by train_model
lr = 10**-4  # learning rate handed to Adam in train_model; no scheduler is attached

def causal_mask(size):
    """Return a boolean (1, size, size) mask that is True on and below the
    main diagonal, i.e. position i may attend to positions 0..i only."""
    lower_triangle = torch.tril(torch.ones((1, size, size), dtype=torch.int))
    return lower_triangle == 1

def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence for one source sentence, one token at a time.

    At every step the highest-scoring next token is appended; decoding stops
    after the end-of-sequence token or once `max_len` tokens exist.

    Args:
        model: Object exposing `encoder(src, src_mask)` and
            `decoder(tgt, enc_output, src_mask, tgt_mask)`.
        source: Source token IDs for a single sentence (batch size 1).
        source_mask: Encoder padding mask for `source`.
        tokenizer_src: Unused; kept for interface compatibility.
        tokenizer_tgt: Target tokenizer. If it has a `word_to_id` mapping, the
            IDs of "<SOS>" and "<EOS>" are read from it; otherwise 2 and 3 are used.
        max_len: Maximum length of the returned sequence, start token included.
        device: Device on which decoding runs.

    Returns:
        1-D tensor of token IDs beginning with the start token and, if it was
        produced, ending with the end-of-sequence token.
    """
    vocab = getattr(tokenizer_tgt, "word_to_id", None) or {}
    sos_idx = vocab.get("<SOS>", 2)
    eos_idx = vocab.get("<EOS>", 3)

    # Move the inputs once so the encoder AND every decoder step see tensors
    # on `device` (the mask is reused in each decoder call).
    source = source.to(device)
    source_mask = source_mask.to(device)

    # The source is encoded a single time and reused for every step.
    encoder_output = model.encoder(source, source_mask)
    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while decoder_input.size(1) < max_len:
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)
        out = model.decoder(decoder_input, encoder_output, source_mask, decoder_mask)
        # Only the last position's scores decide the next token.
        next_word = torch.max(out[:, -1], dim=1)[1]
        decoder_input = torch.cat([decoder_input, next_word.view(1, 1).to(device)], dim=1)
        if next_word.item() == eos_idx:
            break

    return decoder_input.squeeze(0)


def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, num_examples=1):
    """Greedy-decode a few validation examples and print source/target/prediction.

    Args:
        model: Model passed on to `greedy_decode`; moved to `device` and put in eval mode.
        validation_ds: Iterable of batches with keys "english_token",
            "encoder_mask", "english" and "tamil"; every batch must have size 1.
        tokenizer_src, tokenizer_tgt: Tokenizers forwarded to `greedy_decode`;
            `tokenizer_tgt.decode` renders the prediction.
        max_len: Maximum decoded length.
        device: Device used for decoding.
        print_msg: Callable receiving each formatted report string.
        num_examples: Stop after this many batches.

    Returns:
        (source_texts, expected, predicted): three parallel lists of strings
        for the examples that were decoded. Earlier versions built these lists
        but discarded them.
    """
    model.to(device)
    model.eval()
    source_texts, expected, predicted = [], [], []
    try:
        console_width = os.get_terminal_size().columns
    except OSError:
        # No attached terminal (e.g. inside a notebook): use a fixed width
        console_width = 80

    with torch.no_grad():
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch["english_token"].to(device)
            encoder_mask = batch["encoder_mask"].to(device)
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"
            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["english"][0]
            target_text = batch["tamil"][0]
            # The decoded text keeps the special tokens the model emitted
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().tolist())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            print_msg(f"{'-'*console_width}\n{'SOURCE:':>12}{source_text}\n{'TARGET:':>12}{target_text}\n{'PREDICTED:':>12}{model_out_text}")
            if count == num_examples:
                break

    return source_texts, expected, predicted

def train_model():
    """Train the global `model` for `epochs` epochs and print one validation
    example after each epoch.

    Reads these notebook-level names: `model`, `train_dataloader`,
    `test_dataloader`, `english_tokenizer`, `tamil_tokenizer`, `epochs`, `lr`
    and `max_pad`.
    """
    # Use a real torch.device. With a plain string, `device.index` resolved to
    # the `str.index` method and the CUDA queries below only worked by accident.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print("Using device:", device)
    if device.type == 'cuda':
        gpu_index = torch.cuda.current_device()
        print(f"Device name: {torch.cuda.get_device_name(gpu_index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(gpu_index).total_memory / 1024 ** 3} GB")

    # Make sure the parameters live on the device the batches are sent to
    # before the optimizer captures them.
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = lr, eps=1e-9)

    # Padding positions must not contribute to the loss.
    pad_id = tamil_tokenizer.word_to_id.get("<PAD>", 1)
    loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id).to(device)

    for epoch in range(epochs):
        if device.type == 'cuda':
            torch.cuda.empty_cache()
        model.train()  # run_validation leaves the model in eval mode
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            encoder_input = batch['english_token'].to(device)
            decoder_input = batch['tamil_token'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            decoder_mask = batch['decoder_mask'].to(device)
            label = batch['tamil_target'].to(device)

            output = model(encoder_input, decoder_input, encoder_mask, decoder_mask)

            # Flatten to (batch * seq_len, vocab) vs (batch * seq_len,)
            loss = loss_fn(output.view(-1, output.size(-1)), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Clear stale gradients first so an interrupted earlier run cannot leak in
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Write through tqdm so the report does not break the progress bar
        run_validation(model, test_dataloader, english_tokenizer, tamil_tokenizer, max_pad, device, batch_iterator.write)

# Starts training whenever this code runs as the main module.
if __name__ == '__main__':
    # Installs a global "ignore" filter: all Python warnings stay hidden from here on.
    warnings.filterwarnings("ignore")
    train_model()


Using device: cuda
Device name: Tesla T4
Device memory: 14.74810791015625 GB


Processing Epoch 00: 100%|██████████| 356/356 [00:24<00:00, 14.76it/s, loss=7.588]


--------------------------------------------------------------------------------
     SOURCE:Most people only want to hear their own truth
     TARGET:பெரும்பாலான மக்கள் தங்கள் சொந்த உண்மையை மட்டுமே கேட்க விரும்புகிறார்கள்
  PREDICTED:<SOS> நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான்


Processing Epoch 01: 100%|██████████| 356/356 [00:24<00:00, 14.41it/s, loss=6.281]


--------------------------------------------------------------------------------
     SOURCE:I slept a little during lunch break because I was so tired
     TARGET:நான் மிகவும் சோர்வாக இருந்ததால் மதிய உணவு இடைவேளையின் போது சிறிது தூங்கினேன்
  PREDICTED:<SOS> நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான் நான்


Processing Epoch 02: 100%|██████████| 356/356 [00:24<00:00, 14.75it/s, loss=5.569]


--------------------------------------------------------------------------------
     SOURCE:I dont want to go to school
     TARGET:நான் பள்ளிக்கு செல்ல விரும்பவில்லை
  PREDICTED:<SOS> நான் நான் நான் நான் நான் <EOS>


Processing Epoch 03: 100%|██████████| 356/356 [00:24<00:00, 14.74it/s, loss=6.705]


--------------------------------------------------------------------------------
     SOURCE:What is your greatest source of inspiration
     TARGET:உங்கள் உத்வேகத்தின் மிகப்பெரிய ஆதாரம் என்ன
  PREDICTED:<SOS> <EOS>


Processing Epoch 04: 100%|██████████| 356/356 [00:24<00:00, 14.72it/s, loss=6.513]


--------------------------------------------------------------------------------
     SOURCE:Freuds insights into human behavior led to him being honored as a profound thinker
     TARGET:மனித நடத்தை பற்றிய பிராய்டின் நுண்ணறிவு அவரை ஒரு ஆழ்ந்த சிந்தனையாளராக கௌரவிக்க வழிவகுத்தது
  PREDICTED:<SOS> <EOS>


Processing Epoch 05: 100%|██████████| 356/356 [00:23<00:00, 14.90it/s, loss=5.344]


--------------------------------------------------------------------------------
     SOURCE:Close the door when you leave
     TARGET:நீங்கள் வெளியேறும்போது கதவை மூடு
  PREDICTED:<SOS> <EOS>


Processing Epoch 06: 100%|██████████| 356/356 [00:23<00:00, 14.92it/s, loss=6.475]


--------------------------------------------------------------------------------
     SOURCE:Even people who dont believe in the Catholic church venerate the Pope as a symbolic leader
     TARGET:கத்தோலிக்க திருச்சபையின் மீது நம்பிக்கை இல்லாத மக்கள் கூட போப்பை ஒரு அடையாளத் தலைவராக மதிக்கிறார்கள்
  PREDICTED:<SOS> <EOS>


Processing Epoch 07: 100%|██████████| 356/356 [00:23<00:00, 14.96it/s, loss=6.025]


--------------------------------------------------------------------------------
     SOURCE:If Spenser doesnt keep adding and translating sentences the other contributors will surely surpass him
     TARGET:ஸ்பென்சர் வாக்கியங்களைச் சேர்த்து மொழிபெயர்க்கவில்லை என்றால் மற்ற பங்களிப்பாளர்கள் நிச்சயமாக அவரை மிஞ்சுவார்கள்
  PREDICTED:<SOS> <EOS>


Processing Epoch 08: 100%|██████████| 356/356 [00:23<00:00, 14.96it/s, loss=6.535]


--------------------------------------------------------------------------------
     SOURCE:From the moment that I knew that the university existed Ive wanted to go there
     TARGET:பல்கலைக்கழகம் இருப்பதை அறிந்த தருணத்திலிருந்து நான் அங்கு செல்ல விரும்பினேன்
  PREDICTED:<SOS> <EOS>


Processing Epoch 09: 100%|██████████| 356/356 [00:23<00:00, 14.95it/s, loss=5.103]


--------------------------------------------------------------------------------
     SOURCE:Jason was a taciturn individual so it was always a real surprise when he said anything
     TARGET:ஜேஸன் அதிகம் பேசாமல் இருப்பவர் அதனால் அவர் ஏதாவது சொன்னால் அது எப்போதுமே ஆச்சரியமாகவே இருக்கும்
  PREDICTED:<SOS> <EOS>


Processing Epoch 10: 100%|██████████| 356/356 [00:23<00:00, 15.29it/s, loss=5.235]


--------------------------------------------------------------------------------
     SOURCE:To him hunger was an abstract concept he always had enough to eat
     TARGET:அவரைப் பொறுத்தவரை பசி என்பது ஒரு அருவமான கருத்தாக்கம் அவனுக்கு எப்போதும் போதுமான அளவு சாப்பாடு இருந்தது
  PREDICTED:<SOS> <EOS>


Processing Epoch 11: 100%|██████████| 356/356 [00:23<00:00, 15.15it/s, loss=5.637]


--------------------------------------------------------------------------------
     SOURCE:It requires wisdom to understand wisdom the music is nothing if the audience is deaf
     TARGET:ஞானத்தைப் புரிந்துகொள்ள ஞானம் தேவை பார்வையாளர்கள் காது கேளாதவர்களாக இருந்தால் இசை ஒன்றுமில்லை
  PREDICTED:<SOS> <EOS>


Processing Epoch 12: 100%|██████████| 356/356 [00:23<00:00, 14.97it/s, loss=3.823]


--------------------------------------------------------------------------------
     SOURCE:Too late
     TARGET:மிகவும் தாமதமாக
  PREDICTED:<SOS> <EOS>


Processing Epoch 13: 100%|██████████| 356/356 [00:23<00:00, 14.99it/s, loss=6.306]


--------------------------------------------------------------------------------
     SOURCE:Anything that is too stupid to be spoken is sung
     TARGET:பேச முடியாத முட்டாள்தனமான எதுவும் பாடப்படுகிறது
  PREDICTED:<SOS> <EOS>


Processing Epoch 14: 100%|██████████| 356/356 [00:23<00:00, 14.95it/s, loss=5.586]


--------------------------------------------------------------------------------
     SOURCE:Politicians are always censured for outrageous or inappropriate behavior
     TARGET:அரசியல்வாதிகள் எப்போதும் மூர்க்கத்தனமான அல்லது பொருத்தமற்ற நடத்தைக்காக கண்டிக்கப்படுகிறார்கள்
  PREDICTED:<SOS> <EOS>


Processing Epoch 15:  27%|██▋       | 96/356 [00:06<00:17, 14.63it/s, loss=5.123]

In [None]:
# Scratch note: a copy of the dictionary entries returned by
# TranslationDataset.__getitem__. This cell is only a string literal and runs no logic.
'''"english": english,
"tamil": tamil,
"english_token": english_tokens.clone(),
"tamil_token": tamil_tokens.clone(),
"tamil_target": tamil_target_tokens.clone(),
"encoder_mask": (english_tokens != self.pad_token).unsqueeze(0).unsqueeze(0).int().clone(),
"decoder_mask": (tamil_tokens != self.pad_token).unsqueeze(0).int() & causal_mask(tamil_tokens.size(0)).clone(), '''
