In [3]:
# Import required libraries for English-Hindi transformer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import os

# Check if CUDA is available
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# List files in current directory
print("\nFiles in current directory:")
for filename in os.listdir('.'):
    print(filename)

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA device: Tesla P100-PCIE-16GB

Files in current directory:
.virtual_documents


In [4]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("preetviradiya/english-hindi-dataset")

# print("Path to dataset files:", path)

In [5]:
# # dataset 2
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("vaibhavkumar11/hindi-english-parallel-corpus")

# print("Path to dataset files:", path)

In [6]:
# Load dataset - compatible with both local and Kaggle environments
import os

# Check if running on Kaggle or locally
if os.path.exists('/kaggle/input'):
    # Running on Kaggle
    print("Running on Kaggle environment")
    # data_path = '/kaggle/input/english-hindi-dataset/Dataset_English_Hindi.csv'
    data_path = '/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv'
else:
    # Running locally
    print("Running on local environment")
    data_path = 'Dataset_English_Hindi.csv'

print(f"Loading data from: {data_path}")
data = pd.read_csv(data_path)
print(f"Dataset loaded successfully! Shape: {data.shape}")

Running on Kaggle environment
Loading data from: /kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv
Dataset loaded successfully! Shape: (1561841, 2)


In [7]:
data.head(5)

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [8]:
data.drop_duplicates(inplace = True)
data.dropna(inplace = True)

In [9]:
data['english'] = data['english'].str.lower()

In [10]:
START_TOKEN = '<START>'
PADDING_TOKEN = '<PAD>'
END_TOKEN = '<END>'
UNK_TOKEN = '<UNK>'

hindi_vocabulary = [
    START_TOKEN, UNK_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', '<', '=', '>', '?', '@', '|', '।',
    '०', '१', '२', '३', '४', '५', '६', '७', '८', '९',
    'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ॠ', 'ऌ', 'ॡ', 'ए', 'ऐ', 'ओ', 'औ',
    'क', 'ख', 'ग', 'घ', 'ङ',
    'च', 'छ', 'ज', 'झ', 'ञ',
    'ट', 'ठ', 'ड', 'ढ', 'ण',
    'त', 'थ', 'द', 'ध', 'न',
    'प', 'फ', 'ब', 'भ', 'म',
    'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह',
    'ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्',
    'क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', 'य़', '…' ,'—',
    PADDING_TOKEN, END_TOKEN
]

english_vocabulary = [
    START_TOKEN, UNK_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', '<', '=', '>', '?', '@', '|',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '[', '\\', ']', '^', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '{', '|', '}', '~','…' ,'—',
    PADDING_TOKEN, END_TOKEN
]


In [11]:
data.shape

(1353877, 2)

In [12]:
english_file = data['english']
hindi_file = data['hindi']

In [13]:
index_to_hindi = {k:v for k,v in enumerate(hindi_vocabulary)}
hindi_to_index = {v:k for k,v in enumerate(hindi_vocabulary)}

index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [14]:

english_sentence = english_file
hindi_sentence = hindi_file
english_sentence = [sentence.rstrip('\n') for sentence in english_sentence]
hindi_sentence = [sentence.rstrip('\n') for sentence in hindi_sentence]

In [15]:
hindi_sentence[:10]

['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
 'एक्सेर्साइसर पहुंचनीयता अन्वेषक',
 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका',
 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका',
 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है',
 'अवधि को हाइलाइट रकें',
 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि',
 'सीमांत (बोर्डर) के रंग को हाइलाइट करें',
 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ',
 'भराई के रंग को हाइलाइट करें']

In [16]:
max(len(x) for x in english_sentence) , max(len(x) for x in hindi_sentence)

(11088, 8000)

In [17]:
PERCENTAGE = 98
print(f"{PERCENTAGE}th length of hindi word :{np.percentile([len(x) for x in hindi_sentence], PERCENTAGE)}")
print(f"{PERCENTAGE}th length of english word :{np.percentile([len(x) for x in english_sentence], PERCENTAGE)}")

98th length of hindi word :318.0
98th length of english word :321.0


In [18]:
max_sequence_length = 300 #increase it afterward


def is_valid_tokens(sentence, vocab):
    # Check if all tokens in the sentence are in the vocabulary
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    # Check if the sentence length is within the allowed limit
    return len(list(sentence)) < (max_sequence_length - 1)

valid_sentence_indices = []
for index in range(len(hindi_sentence)):
    hindi_sent = hindi_sentence[index]
    english_sent = english_sentence[index]
    if (is_valid_length(hindi_sent, max_sequence_length)
        and is_valid_length(english_sent, max_sequence_length)
        and is_valid_tokens(hindi_sent, hindi_vocabulary)):
        valid_sentence_indices.append(index)


print(f"Number of sentences: {len(hindi_sentence)}")
print(f"Number of valid sentences: {len(valid_sentence_indices)}")


Number of sentences: 1353877
Number of valid sentences: 767441


In [19]:
# Add validation and debugging functions
def validate_vocabulary_coverage(sentences, vocabulary, language_name):
    """Check if all characters in sentences are covered by vocabulary"""
    all_chars = set()
    for sentence in sentences:
        all_chars.update(set(sentence))
    
    missing_chars = all_chars - set(vocabulary)
    if missing_chars:
        print(f"Missing characters in {language_name} vocabulary: {missing_chars}")
        return False
    return True

def clean_sentences(sentences, vocabulary, unk_token):
    """Replace unknown characters with UNK token"""
    cleaned = []
    vocab_set = set(vocabulary)
    for sentence in sentences:
        cleaned_sentence = ""
        for char in sentence:
            if char in vocab_set:
                cleaned_sentence += char
            else:
                cleaned_sentence += unk_token
        cleaned.append(cleaned_sentence)
    return cleaned

# Validate vocabulary coverage
print("Validating vocabulary coverage...")
hindi_valid = validate_vocabulary_coverage(hindi_sentence, hindi_vocabulary, "Hindi")
english_valid = validate_vocabulary_coverage(english_sentence, english_vocabulary, "English")

if not hindi_valid:
    print("Cleaning Hindi sentences...")
    hindi_sentence = clean_sentences(hindi_sentence, hindi_vocabulary, UNK_TOKEN)

if not english_valid:
    print("Cleaning English sentences...")
    english_sentence = clean_sentences(english_sentence, english_vocabulary, UNK_TOKEN)

Validating vocabulary coverage...
Missing characters in Hindi vocabulary: {'Ô', '¡', '²', '€', 'а', '\u2003', ';', 'é', '•', '¥', '×', 'آ', '中', 'ت', 'Π', 'þ', '⁶', '⁰', 'θ', 'u', 'w', '⌊', 'র', '॒', '2', 'L', 'ö', '”', '∞', '设', '￼', '॰', '♫', '\x89', 'Ż', '॔', 'Ç', '–', '―', 'č', 'I', 'ુ', '‘', 'β', '́', '̀', 'µ', 'Ň', '˙', '’', 'ţ', '\x98', '0', '़', '⁵', 'У', 'ॅ', '≤', 'দ', '،', 'd', '\u200d', '∂', 'ॊ', '漢', 'ü', 'G', 'и', 'か', 'Ś', '\x81', 'b', '\uf00f', '¶', '°', 'ā', '字', '\x80', 'ॐ', 'Ă', '\x8a', '\x86', 'M', 'Æ', 'j', '˚', 'р', '⅗', '´', 'ॄ', '⅓', '֦', 'ś', 'Ě', '̈', '⅕', '₂', 'λ', 'ç', '→', '˝', '√', 'ی', 'ď', 'ů', 'ù', 'ृ', '\x88', '̄', 'Ð', '\x9b', '\x9f', 'ج', '\x87', 'ॉ', 'া', 'ř', '³', 'Ĺ', 'ق', 'Θ', '6', '5', '̪', 'Ą', 's', '∕', '¤', 'ж', '4', 'º', 'ٓ', '₁', 'α', '\x94', 'Ω', '\x9a', '\x93', 'h', 'ر', 't', '8', 'c', 'N', '☺', 'ী', 'ক', 'Ť', '{', '\x83', 'Ó', 'ι', 'μ', 'R', '⌈', 'q', 'ω', '£', 'x', 'i', 'κ', 'е', '“', 'م', '⏎', 'k', 'ু', 'о', 'm', 'ਿ', '¹', '\uf0a7', 'ʼ'

In [20]:
hindi_sentence = [hindi_sentence[i] for i in valid_sentence_indices]
english_sentence = [english_sentence[i] for i in valid_sentence_indices]

In [21]:
print(len(hindi_sentence))
print(len(english_sentence))

767441
767441


In [22]:

from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    def __init__(self, english_sentence, hindi_sentence):
        self.english_sentence = english_sentence
        self.hindi_sentence = hindi_sentence

    def __len__(self):
        return len(self.english_sentence)

    def __getitem__(self, index):
        english_sentence = self.english_sentence[index]
        hindi_sentence = self.hindi_sentence[index]
        return english_sentence, hindi_sentence



In [23]:
dataset = TextDataset(english_sentence,hindi_sentence)

In [24]:
batch_size = 3
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [25]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('give your application an accessibility workout', 'accerciser accessibility explorer', 'the default plugin layout for the bottom panel'), ('अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'एक्सेर्साइसर पहुंचनीयता अन्वेषक', 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका')]
[('the default plugin layout for the top panel', 'a list of plugins that are disabled by default', 'highlight duration'), ('ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका', 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है', 'अवधि को हाइलाइट रकें')]
[('the duration of the highlight box when selecting accessible nodes', 'highlight border color', 'the color and opacity of the highlight border.'), ('पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि', 'सीमांत (बोर्डर) के रंग को हाइलाइट करें', 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ')]
[('highlight fill color', 'the color and opacity of the highlight fill.', 'api browser'), ('भराई के रंग को हाइलाइट करें', 'हाइलाइट किया गया भराई का रंग और पारदर्शिता। ', 'एपी

In [26]:
def tokenize(sentence, language_to_index, start_token=True, end_token=True):
    sentence_word_indicies = [language_to_index[token] for token in list(sentence)]
    if start_token:
        sentence_word_indicies.insert(0, language_to_index[START_TOKEN])
    if end_token:
        sentence_word_indicies.append(language_to_index[END_TOKEN])
    for _ in range(len(sentence_word_indicies), max_sequence_length):
        sentence_word_indicies.append(language_to_index[PADDING_TOKEN])
    return torch.tensor(sentence_word_indicies)

In [27]:
batch

[('browse the various methods of the current accessible',
  'hide private attributes',
  'method'),
 ('इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें',
  'निजी गुणों को छिपाएं',
  'विधि')]

In [28]:
eng_tokenized, hin_tokenized = [], []
for sentence_num in range(batch_size):
    eng_sentence, hin_sentence = batch[0][sentence_num], batch[1][sentence_num]
    eng_tokenized.append( tokenize(eng_sentence, english_to_index, start_token=False, end_token=False) )
    hin_tokenized.append( tokenize(hin_sentence, hindi_to_index, start_token=True, end_token=True) )
eng_tokenized = torch.stack(eng_tokenized)
hin_tokenized = torch.stack(hin_tokenized)

### Transformer

In [29]:
import numpy as np
import torch
import math
from torch import nn
import torch.nn.functional as F

def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled.permute(1, 0, 2, 3) + mask
        scaled = scaled.permute(1, 0, 2, 3)
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self, device):
        even_i = torch.arange(0, self.d_model, 2).float().to(device)
        denominator = torch.pow(10000, even_i/self.d_model)
        position = (torch.arange(self.max_sequence_length, device=device)
                          .reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE

class SentenceEmbedding(nn.Module):
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token, end_token):
        # Get the device from the embedding layer
        device = next(self.parameters()).device

        def tokenize(sentence, start_token, end_token):
            sentence_word_indicies = []
            for token in list(sentence):
                if token in self.language_to_index:
                    sentence_word_indicies.append(self.language_to_index[token])
                else:
                    # Handle unknown tokens
                    sentence_word_indicies.append(self.language_to_index.get('<UNK>', 1))
            
            if start_token:
                sentence_word_indicies.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])
            
            # Truncate if too long
            if len(sentence_word_indicies) > self.max_sequence_length:
                sentence_word_indicies = sentence_word_indicies[:self.max_sequence_length-1]
                sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])
            
            # Pad to max length
            for _ in range(len(sentence_word_indicies), self.max_sequence_length):
                sentence_word_indicies.append(self.language_to_index[self.PADDING_TOKEN])
            
            # Validate indices
            for idx in sentence_word_indicies:
                if idx >= self.vocab_size or idx < 0:
                    raise ValueError(f"Invalid token index: {idx}")
            
            return torch.tensor(sentence_word_indicies, device=device)

        tokenized = []
        for sentence_num in range(len(batch)):
           tokenized.append( tokenize(batch[sentence_num], start_token, end_token) )
        tokenized = torch.stack(tokenized)
        return tokenized

    def forward(self, x, start_token, end_token): # sentence
        device = next(self.parameters()).device
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder(device)
        x = self.dropout(x + pos)
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        batch_size, sequence_length, d_model = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out


class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x


class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        residual_x = x.clone()
        x = self.attention(x, mask=self_attention_mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)
        residual_x = x.clone()
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x

class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask  = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x

class Encoder(nn.Module):
    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialEncoder(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                      for _ in range(num_layers)])

    def forward(self, x, self_attention_mask, start_token, end_token):
        x = self.sentence_embedding(x, start_token, end_token)
        x = self.layers(x, self_attention_mask)
        return x


class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        batch_size, sequence_length, d_model = x.size()
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, sequence_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, sequence_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, d_model)
        out = self.linear_layer(values)
        return out


class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        _y = y.clone()
        y = self.self_attention(y, mask=self_attention_mask)
        y = self.dropout1(y)
        y = self.layer_norm1(y + _y)

        _y = y.clone()
        y = self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        y = self.dropout2(y)
        y = self.layer_norm2(y + _y)

        _y = y.clone()
        y = self.ffn(y)
        y = self.dropout3(y)
        y = self.layer_norm3(y + _y)
        return y


class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, y, self_attention_mask, cross_attention_mask = inputs
        for module in self._modules.values():
            y = module(x, y, self_attention_mask, cross_attention_mask)
        return y

class Decoder(nn.Module):
    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        y = self.sentence_embedding(y, start_token, end_token)
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        return y


class Transformer(nn.Module):
    def __init__(self,
                d_model,
                ffn_hidden,
                num_heads,
                drop_prob,
                num_layers,
                max_sequence_length,
                hin_vocab_size,
                english_to_index,
                hindi_to_index,
                START_TOKEN,
                END_TOKEN,
                PADDING_TOKEN
                ):
        super().__init__()
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.linear = nn.Linear(d_model, hin_vocab_size)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False,
                dec_end_token=False):
        x = self.encoder(x, encoder_self_attention_mask, start_token=enc_start_token, end_token=enc_end_token)
        out = self.decoder(x, y, decoder_self_attention_mask, decoder_cross_attention_mask, start_token=dec_start_token, end_token=dec_end_token)
        out = self.linear(out)
        return out

### Translation 

In [30]:

d_model = 512
batch_size = 30
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 3 #increase it afterward
max_sequence_length = 300 #increase it afterward
hin_vocab_size = len(hindi_vocabulary)

transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          max_sequence_length,
                          hin_vocab_size,
                          english_to_index,
                          hindi_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

In [31]:
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(100, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attention): MultiHeadA

In [32]:
dataset = TextDataset(english_sentence, hindi_sentence)


In [33]:
len(dataset)


767441

In [34]:
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [35]:
from torch import nn

criterian = nn.CrossEntropyLoss(ignore_index = hindi_to_index[PADDING_TOKEN],
                                reduction='none')

for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [36]:
NEG_INFTY = -1e9

def create_masks(eng_batch, hin_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length], True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length], False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length], False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length], False)

    for idx in range(num_sentences):
        eng_sentence_length, hin_sentence_length = len(eng_batch[idx]), len(hin_batch[idx])
        eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
        hin_chars_to_padding_mask = np.arange(hin_sentence_length + 1, max_sequence_length)
        encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
        encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
        decoder_padding_mask_self_attention[idx, :, hin_chars_to_padding_mask] = True
        decoder_padding_mask_self_attention[idx, hin_chars_to_padding_mask, :] = True
        decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
        decoder_padding_mask_cross_attention[idx, hin_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [38]:
# CUDA-optimized training with proper device management
print("Available device:", get_device())

# Use CUDA if available, otherwise CPU
device = get_device()
print(f"Using device: {device}")

# Use a smaller subset for initial testing
print("Using a smaller subset for faster training...")
dataset_size =  min(10000, len(dataset))  #use like size:batch = 10000,32 or 50000,64 etc if all(list(range(total_size)))
subset_indices = torch.randperm(len(dataset))[:dataset_size]
subset_english = [english_sentence[i] for i in subset_indices]
subset_hindi = [hindi_sentence[i] for i in subset_indices]
test_dataset = TextDataset(subset_english, subset_hindi)

# Use smaller batch size for testing
test_batch_size = 64
test_loader = DataLoader(test_dataset, test_batch_size)

print("Testing model setup...")
transformer.train()

# Move model to device FIRST
transformer.to(device)
print(f"Model moved to {device}")

# Try a small test first
test_iterator = iter(test_loader)
test_batch = next(test_iterator)

try:
    eng_batch, hin_batch = test_batch
    print(f"Test English sentence: {eng_batch[0][:50]}...")
    print(f"Test Hindi sentence: {hin_batch[0][:50]}...")
    
    # Test tokenization
    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, hin_batch)
    
    with torch.no_grad():
        test_output = transformer(eng_batch,
                                 hin_batch,
                                 encoder_self_attention_mask.to(device),
                                 decoder_self_attention_mask.to(device),
                                 decoder_cross_attention_mask.to(device),
                                 enc_start_token=False,
                                 enc_end_token=False,
                                 dec_start_token=True,
                                 dec_end_token=True)
    
    print("Device test successful! Model output shape:", test_output.shape)
    print(f"Training on {device}")
    
except Exception as e:
    print(f"Error during testing: {e}")
    print("Falling back to CPU...")
    device = torch.device('cpu')
    transformer.to(device)

# Training loop with proper device management
total_loss = 0
num_epochs = 20 # Start with just 2 epochs for testing
hin_vocab_size = len(hindi_vocabulary)

print(f"Starting training on {device}")
print(f"Vocab sizes - English: {len(english_vocabulary)}, Hindi: {len(hindi_vocabulary)}")
print(f"Dataset size: {len(test_dataset)} samples")

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    iterator = iter(test_loader)
    epoch_loss = 0
    batch_count = 0
    
    for batch_num, batch in enumerate(iterator):
        try:
            transformer.train()
            eng_batch, hin_batch = batch
            
            # Validate batch
            if len(eng_batch) == 0 or len(hin_batch) == 0:
                continue
                
            # Create masks and move to device
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, hin_batch)
            encoder_self_attention_mask = encoder_self_attention_mask.to(device)
            decoder_self_attention_mask = decoder_self_attention_mask.to(device)
            decoder_cross_attention_mask = decoder_cross_attention_mask.to(device)
            
            optim.zero_grad()
            
            hin_predictions = transformer(eng_batch,
                                         hin_batch,
                                         encoder_self_attention_mask,
                                         decoder_self_attention_mask,
                                         decoder_cross_attention_mask,
                                         enc_start_token=False,
                                         enc_end_token=False,
                                         dec_start_token=True,
                                         dec_end_token=True)
            
            labels = transformer.decoder.sentence_embedding.batch_tokenize(hin_batch, start_token=False, end_token=True)
            
            loss = criterian(
                hin_predictions.view(-1, hin_vocab_size),
                labels.view(-1)
            )
            
            valid_indicies = torch.where(labels.view(-1) == hindi_to_index[PADDING_TOKEN], False, True)
            loss = loss.sum() / valid_indicies.sum()
            
            loss.backward()
            optim.step()
            
            epoch_loss += loss.item()
            batch_count += 1
            
            if batch_num % 100 == 0:  # Show progress every 10,100,1000batches
                print(f"  Batch {batch_num}: Loss = {loss.item():.4f}")
                print(f"  English: {eng_batch[0][:30]}...")
                print(f"  Hindi: {hin_batch[0][:30]}...")
                
                # Show prediction for first sentence
                hin_sentence_predicted = torch.argmax(hin_predictions[0], axis=1)
                predicted_sentence = ""
                for idx in hin_sentence_predicted:
                  if idx == hindi_to_index[END_TOKEN]:
                    break
                  predicted_sentence += index_to_hindi[idx.item()]
                print(f"  Prediction: {predicted_sentence[:30]}...")
                print("  " + "-" * 40)
        
        except Exception as e:
            print(f"Error in batch {batch_num}: {e}")
            continue
    
    avg_loss = epoch_loss / max(batch_count, 1)
    print(f"Epoch {epoch + 1} completed. Average loss: {avg_loss:.4f}")
    print("=" * 50)

print("\nTraining completed!")
print("You can now test the model or continue training with more epochs and larger datasets.")

# Simple translation test
print("\n" + "="*60)
print("TESTING THE MODEL")
print("="*60)

transformer.eval()
with torch.no_grad():
    test_sentence = "hello how are you"
    print(f"Input: {test_sentence}")
    
    # Generate translation
    hin_sentence = ("",)
    eng_sentence = (test_sentence,)
    
    for word_counter in range(min(50, max_sequence_length)):
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, hin_sentence)
        predictions = transformer(eng_sentence,
                                  hin_sentence,
                                  encoder_self_attention_mask.to(device),
                                  decoder_self_attention_mask.to(device),
                                  decoder_cross_attention_mask.to(device),
                                  enc_start_token=False,
                                  enc_end_token=False,
                                  dec_start_token=True,
                                  dec_end_token=False)
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_hindi[next_token_index]
        hin_sentence = (hin_sentence[0] + next_token, )
        if next_token == END_TOKEN:
            break
    
    print(f"Translation: {hin_sentence[0]}")
    print("="*60)

Available device: cuda
Using device: cuda
Using a smaller subset for faster training...
Testing model setup...
Model moved to cuda
Test English sentence: indeed the companions of paradise, that day, will ...
Test Hindi sentence: निश्चय ही जन्नतवाले आज किसी न किसी काम नें व्यस्त ...
Device test successful! Model output shape: torch.Size([64, 300, 108])
Training on cuda
Starting training on cuda
Vocab sizes - English: 101, Hindi: 108
Dataset size: 10000 samples

Epoch 1/20
  Batch 0: Loss = 5.7907
  English: indeed the companions of parad...
  Hindi: निश्चय ही जन्नतवाले आज किसी न ...
  Prediction: ''''ङ'ङ''ङङङ'ङ''ङङङङङङ''ङ'ङङ''...
  ----------------------------------------
Error in batch 8: Invalid token index: 100
  Batch 100: Loss = 3.3982
  English: the uncle and nephew were guru...
  Hindi: चाचा और भतीजा वास्तविक अर्थों ...
  Prediction:                               ...
  ----------------------------------------
Error in batch 145: Invalid token index: 100
Epoch 1 completed. Average

## saving model

In [39]:
# Save the trained model
import pickle
import json
from datetime import datetime

def save_model(model, model_path="/kaggle/working/english_hindi_transformer_10000val_20epoch.pth", config_path="model_config.json"):
    """
    Save the trained transformer model and its configuration
    """
    print("Saving model...")
    
    # Save model state dict
    torch.save({
        'model_state_dict': model.state_dict(),
        'model_config': {
            'd_model': d_model,
            'ffn_hidden': ffn_hidden,
            'num_heads': num_heads,
            'drop_prob': drop_prob,
            'num_layers': num_layers,
            'max_sequence_length': max_sequence_length,
            'hin_vocab_size': hin_vocab_size,
        },
        'vocabularies': {
            'hindi_vocabulary': hindi_vocabulary,
            'english_vocabulary': english_vocabulary,
            'hindi_to_index': hindi_to_index,
            'english_to_index': english_to_index,
            'index_to_hindi': index_to_hindi,
            'index_to_english': index_to_english,
        },
        'tokens': {
            'START_TOKEN': START_TOKEN,
            'END_TOKEN': END_TOKEN,
            'PADDING_TOKEN': PADDING_TOKEN,
            'UNK_TOKEN': UNK_TOKEN,
        },
        'training_info': {
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'num_epochs_trained': num_epochs,
            'final_loss': avg_loss if 'avg_loss' in locals() else None,
        }
    }, model_path)
    
    print(f"Model saved to: {model_path}")
    print("Saved components:")
    print("- Model state dict")
    print("- Model configuration")
    print("- Vocabularies and mappings")
    print("- Special tokens")
    print("- Training information")

# Save the model
save_model(transformer)
print("Model saved successfully!")

Saving model...
Model saved to: /kaggle/working/english_hindi_transformer_10000val_20epoch.pth
Saved components:
- Model state dict
- Model configuration
- Vocabularies and mappings
- Special tokens
- Training information
Model saved successfully!


In [None]:
# Load and use the saved model for translation
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math

def load_model(model_path="/kaggle/working/english_hindi_transformer_fullset_100epoch.pth"):
    """
    Load the saved transformer model and its configuration
    """
    print("Loading model...")
    
    # Load the saved data
    checkpoint = torch.load(model_path, map_location='cpu')
    
    # Extract configuration
    config = checkpoint['model_config']
    vocabularies = checkpoint['vocabularies']
    tokens = checkpoint['tokens']
    
    # Recreate the model architecture (copy the classes from above)
    # You need to make sure all the transformer classes are defined before calling this
    
    # Create model instance
    model = Transformer(
        d_model=config['d_model'],
        ffn_hidden=config['ffn_hidden'],
        num_heads=config['num_heads'],
        drop_prob=config['drop_prob'],
        num_layers=config['num_layers'],
        max_sequence_length=config['max_sequence_length'],
        hin_vocab_size=config['hin_vocab_size'],
        english_to_index=vocabularies['english_to_index'],
        hindi_to_index=vocabularies['hindi_to_index'],
        START_TOKEN=tokens['START_TOKEN'],
        END_TOKEN=tokens['END_TOKEN'],
        PADDING_TOKEN=tokens['PADDING_TOKEN']
    )
    
    # Load the state dict
    model.load_state_dict(checkpoint['model_state_dict'])
    
    # Set to evaluation mode
    model.eval()
    
    print("Model loaded successfully!")
    print(f"Training info: {checkpoint['training_info']}")
    
    return model, vocabularies, tokens, config

def translate_sentence(model, english_sentence, vocabularies, tokens, config, device='cpu'):
    """
    Translate an English sentence to Hindi using the loaded model
    """
    model.to(device)
    model.eval()
    
    # Get vocabularies and tokens
    hindi_to_index = vocabularies['hindi_to_index']
    english_to_index = vocabularies['english_to_index']
    index_to_hindi = vocabularies['index_to_hindi']
    START_TOKEN = tokens['START_TOKEN']
    END_TOKEN = tokens['END_TOKEN']
    PADDING_TOKEN = tokens['PADDING_TOKEN']
    max_sequence_length = config['max_sequence_length']
    
    # Create masks function (simplified version)
    def create_simple_masks(eng_sentence, hin_sentence, max_len):
        NEG_INFTY = -1e9
        num_sentences = 1
        
        look_ahead_mask = torch.full([max_len, max_len], True)
        look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
        
        encoder_padding_mask = torch.full([num_sentences, max_len, max_len], False)
        decoder_padding_mask_self_attention = torch.full([num_sentences, max_len, max_len], False)
        decoder_padding_mask_cross_attention = torch.full([num_sentences, max_len, max_len], False)
        
        eng_sentence_length = len(eng_sentence)
        hin_sentence_length = len(hin_sentence)
        
        eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_len)
        hin_chars_to_padding_mask = np.arange(hin_sentence_length + 1, max_len)
        
        encoder_padding_mask[0, :, eng_chars_to_padding_mask] = True
        encoder_padding_mask[0, eng_chars_to_padding_mask, :] = True
        decoder_padding_mask_self_attention[0, :, hin_chars_to_padding_mask] = True
        decoder_padding_mask_self_attention[0, hin_chars_to_padding_mask, :] = True
        decoder_padding_mask_cross_attention[0, :, eng_chars_to_padding_mask] = True
        decoder_padding_mask_cross_attention[0, hin_chars_to_padding_mask, :] = True
        
        encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
        decoder_self_attention_mask = torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
        decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
        
        return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask
    
    print(f"Translating: '{english_sentence}'")
    
    with torch.no_grad():
        # Generate translation
        hin_sentence = ""
        eng_sentence_tuple = (english_sentence.lower(),)
        
        for word_counter in range(min(100, max_sequence_length)):
            hin_sentence_tuple = (hin_sentence,)
            
            # Create masks
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_simple_masks(
                english_sentence, hin_sentence, max_sequence_length
            )
            
            # Get predictions
            predictions = model(
                eng_sentence_tuple,
                hin_sentence_tuple,
                encoder_self_attention_mask.to(device),
                decoder_self_attention_mask.to(device),
                decoder_cross_attention_mask.to(device),
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=True,
                dec_end_token=False
            )
            
            # Get next token
            next_token_prob_distribution = predictions[0][word_counter]
            next_token_index = torch.argmax(next_token_prob_distribution).item()
            next_token = index_to_hindi[next_token_index]
            
            # Add token to sentence
            hin_sentence += next_token
            
            # Check for end token
            if next_token == END_TOKEN:
                break
    
    return hin_sentence.replace(END_TOKEN, "").strip()

# Example usage - Load model and translate
try:
    # Load the model
    loaded_model, loaded_vocabularies, loaded_tokens, loaded_config = load_model()
    
    # Test translations
    test_sentences = [
        "hello how are you",
        "what is your name",
        "good morning",
        "thank you very much",
        "i love you"
    ]
    
    print("\n" + "="*60)
    print("TESTING LOADED MODEL")
    print("="*60)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    for sentence in test_sentences:
        translation = translate_sentence(loaded_model, sentence, loaded_vocabularies, loaded_tokens, loaded_config, device)
        print(f"English: {sentence}")
        print(f"Hindi: {translation}")
        print("-" * 40)
        
except FileNotFoundError:
    print("Model file not found. Please train and save the model first.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure to run the training cell first to create the model.")