In [3]:
import os
import json
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from nltk.translate.bleu_score import corpus_bleu
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [4]:
def load_training_data(file_path):
    """Load the training data from the JSON file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    train_data = data['English-Bengali']['Train']
    source_sentences = [entry['source'] for entry in train_data.values()]
    target_sentences = [entry['target'] for entry in train_data.values()]
    return source_sentences, target_sentences

def load_validation_data(file_path):
    """Load the validation data from the JSON file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    val_data = data['English-Bengali']['Test']
    val_sentences = {key: entry['source'] for key, entry in val_data.items()}
    return val_sentences

def save_translations(translations, output_file):
    """Save the translated sentences to a JSON file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(translations, f, ensure_ascii=False, indent=4)
    print(f"Translations saved to {output_file}")

def pad_sequences(sequences, maxlen, padding_value=0):
    """Pad sequences to the same length."""
    return np.array([seq + [padding_value] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences])

def train_bpe_tokenizer(sentences, vocab_size, tokenizer_path):
    """Train a BPE tokenizer."""
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["<PAD>", "<UNK>", "<BOS>", "<EOS>"])
    tokenizer.train_from_iterator(sentences, trainer)
    tokenizer.save(tokenizer_path)
    print(f"Tokenizer saved to {tokenizer_path}")
    return tokenizer

def load_bpe_tokenizer(tokenizer_path):
    """Load a pre-trained BPE tokenizer."""
    return Tokenizer.from_file(tokenizer_path)

def encode_sentences(tokenizer, sentences, max_len):
    """Encode and pad sentences."""
    tokenized = [tokenizer.encode(sentence).ids for sentence in sentences]
    return pad_sequences(tokenized, maxlen=max_len)

In [5]:

# 3. Custom Dataset


class TranslationDataset(Dataset):
    """Custom dataset for machine translation."""
    def __init__(self, source_sentences, target_sentences):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.source_sentences[idx]), torch.tensor(self.target_sentences[idx])

def collate_fn(batch):
    """Collate function for dynamic padding in DataLoader."""
    src, tgt = zip(*batch)
    src = torch.nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=0)
    tgt = torch.nn.utils.rnn.pad_sequence(tgt, batch_first=True, padding_value=0)
    return src, tgt

In [6]:

# 2. Attention Mechanisms


class ScaledDotProductAttention(nn.Module):
    """Scaled Dot-Product Attention"""
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, query, key, value, mask=None):
        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = torch.softmax(scores, dim=-1)
        output = torch.matmul(attention, value)
        return output, attention

class MultiHeadAttention(nn.Module):
    """Multi-Head Attention"""
    def __init__(self, embed_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert embed_size % num_heads == 0, "Embedding size must be divisible by the number of heads"
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads
        self.query_linear = nn.Linear(embed_size, embed_size)
        self.key_linear = nn.Linear(embed_size, embed_size)
        self.value_linear = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        query = self.query_linear(query)
        key = self.key_linear(key)
        value = self.value_linear(value)

        query = query.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        attention_output, _ = ScaledDotProductAttention()(query, key, value, mask)

        attention_output = attention_output.transpose(1, 2).contiguous()
        attention_output = attention_output.view(batch_size, -1, self.num_heads * self.head_dim)

        output = self.fc_out(attention_output)
        return output



In [7]:

# 3. Transformer Components


class FeedForward(nn.Module):
    """Feedforward Neural Network Layer"""
    def __init__(self, embed_size, ff_hidden_size, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_size, ff_hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(ff_hidden_size, embed_size)

    def forward(self, x):
        return self.linear2(self.dropout(torch.relu(self.linear1(x))))

class PositionalEncoding(nn.Module):
    """Positional Encoding for sequences"""
    def __init__(self, embed_size, max_len=1000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-np.log(10000.0) / embed_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe.unsqueeze(0))  # Store positional encoding on the correct device

    def forward(self, x):
        """Add positional encoding to input tensor."""
        return x + self.pe[:, :x.size(1), :].to(x.device)

class TransformerBlock(nn.Module):
    """Single Transformer Block"""
    def __init__(self, embed_size, num_heads, ff_hidden_size, dropout):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(embed_size, num_heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = FeedForward(embed_size, ff_hidden_size, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask):
        attention_output = self.attention(query, key, value, mask)
        x = self.norm1(query + self.dropout(attention_output))
        ff_output = self.feed_forward(x)
        return self.norm2(x + self.dropout(ff_output))



In [8]:

# 4. Complete Transformer Model


class Transformer(nn.Module):
    """Complete Transformer Model"""
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size, num_heads, num_layers, ff_hidden_size, dropout=0.1):
        super(Transformer, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size)
        self.encoder_layers = nn.ModuleList([
            TransformerBlock(embed_size, num_heads, ff_hidden_size, dropout) for _ in range(num_layers)
        ])
        self.decoder_layers = nn.ModuleList([
            TransformerBlock(embed_size, num_heads, ff_hidden_size, dropout) for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_size, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.positional_encoding(self.src_embedding(src))
        tgt = self.positional_encoding(self.tgt_embedding(tgt))
        for layer in self.encoder_layers:
            src = layer(src, src, src, src_mask)
        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src, tgt_mask)
        return self.fc_out(tgt)



In [9]:

# 5. Training Loop


def train_model(model, dataloader, optimizer, criterion, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for src, tgt in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            outputs = model(src, tgt_input)
            loss = criterion(outputs.view(-1, outputs.size(-1)), tgt_output.contiguous().view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


In [24]:

# 6. Evaluation and Inference


def translate_validation_data(model, tokenizer_src, tokenizer_tgt, val_sentences):
    model.eval()
    translations = {}
    with torch.no_grad():
        for key, sentence in tqdm(val_sentences.items(), desc="Translating"):
            encoded_sentence = torch.tensor(tokenizer_src.encode(sentence).ids).unsqueeze(0).to(device)
            translation_ids = greedy_decode(model, encoded_sentence)
            translation = tokenizer_tgt.decode(translation_ids, skip_special_tokens=True)
            translations[key] = translation

            # Print the translation for each sentence as it is processed
            #print(f"Source: {sentence}")
            #print(f"Translation: {translation}")
            #print("-" * 50)
    return translations



In [11]:
def greedy_decode(model, src_sentence, max_len=50, start_token_id=2, end_token_id=3):
    model.eval()

    # Ensure src_sentence is a long tensor
    src_sentence = src_sentence.to(device).long()  # Convert to torch.long

    tgt_sequence = torch.tensor([[start_token_id]], device=device, dtype=torch.long)  # Start token

    with torch.no_grad():
        for _ in range(max_len):
            # Pass the source sentence and target sequence to the model
            output = model(src_sentence, tgt_sequence)

            # Get the token ID with the highest probability for the next position
            next_token_id = output[:, -1, :].argmax(dim=-1).item()

            # Append the new token to the sequence
            tgt_sequence = torch.cat(
                [tgt_sequence, torch.tensor([[next_token_id]], device=device, dtype=torch.long)], dim=1
            )

            # Stop if the end token is generated
            if next_token_id == end_token_id:
                break

    return tgt_sequence.squeeze(0).tolist()  # Return the decoded sequence as a list of token IDs


In [19]:
def translate_validation_file(
    val_file_path,
    output_file_base,
    model,
    tokenizer_src,
    tokenizer_tgt,
    device,
    max_len=50
):
    """Translate validation data and save results."""
    # Load validation data
    val_data = load_validation_data(val_file_path)
    val_df = pd.DataFrame(list(val_data.items()), columns=["id", "source"])

    translated_sentences = []

    for idx, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Translating"):
        text_id = row['id']
        source_sentence = row['source']

        # Translate the sentence using greedy_decode
        src_encoded = torch.tensor(tokenizer_src.encode(source_sentence).ids).unsqueeze(0).to(device)
        translation_ids = greedy_decode(model, src_encoded, max_len=max_len)
        translated_sentence = tokenizer_tgt.decode(translation_ids, skip_special_tokens=True)

        translated_sentences.append({"id": text_id, "translated": translated_sentence})

        # Print progress
        #print(f"[ID: {text_id}] Source: {source_sentence}\nTranslated: {translated_sentence}\n")

    # Create output DataFrame
    output_df = pd.DataFrame(translated_sentences)

    # Save results
    output_df.to_csv(f"{output_file_base}_encoded.csv", index=False, encoding='utf-8')
    output_df.to_csv(f"{output_file_base}_unencoded.csv", index=False, encoding=None)
    output_df.to_json(f"{output_file_base}_encoded.json", orient='records', lines=True, force_ascii=True)
    output_df.to_json(f"{output_file_base}_unencoded.json", orient='records', lines=True, force_ascii=False)

    print(f"Translations saved as:\n"
          f"  CSV (encoded): {output_file_base}_encoded.csv\n"
          f"  CSV (unencoded): {output_file_base}_unencoded.csv\n"
          f"  JSON (encoded): {output_file_base}_encoded.json\n"
          f"  JSON (unencoded): {output_file_base}_unencoded.json")


In [14]:

# 6. Main Workflow


# Paths
train_file = "/content/train_data1.json"

# Step 1: Load training and validation data
source_sentences, target_sentences = load_training_data(train_file)

# Step 2: Tokenize and preprocess
tokenizer_src = train_bpe_tokenizer(source_sentences, 30000, "tokenizer_src.json")
tokenizer_tgt = train_bpe_tokenizer(target_sentences, 30000, "tokenizer_tgt.json")
src_encoded = encode_sentences(tokenizer_src, source_sentences, max_len=50)
tgt_encoded = encode_sentences(tokenizer_tgt, target_sentences, max_len=50)

# Step 3: Train model
model = Transformer(30000, 30000, 256, 8, 6, 2048, 0.1).to(device)
dataloader = DataLoader(TranslationDataset(src_encoded, tgt_encoded), batch_size=32, collate_fn=collate_fn)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0)
train_model(model, dataloader, optimizer, criterion, 10)


Tokenizer saved to tokenizer_src.json
Tokenizer saved to tokenizer_tgt.json


Epoch 1/10: 100%|██████████| 2152/2152 [03:23<00:00, 10.58it/s]


Epoch 1, Loss: 7.8026


Epoch 2/10: 100%|██████████| 2152/2152 [03:29<00:00, 10.26it/s]


Epoch 2, Loss: 6.9923


Epoch 3/10: 100%|██████████| 2152/2152 [03:29<00:00, 10.28it/s]


Epoch 3, Loss: 6.4522


Epoch 4/10: 100%|██████████| 2152/2152 [03:29<00:00, 10.27it/s]


Epoch 4, Loss: 5.9993


Epoch 5/10: 100%|██████████| 2152/2152 [03:28<00:00, 10.31it/s]


Epoch 5, Loss: 5.6031


Epoch 6/10: 100%|██████████| 2152/2152 [03:28<00:00, 10.31it/s]


Epoch 6, Loss: 5.2459


Epoch 7/10: 100%|██████████| 2152/2152 [03:28<00:00, 10.31it/s]


Epoch 7, Loss: 4.9136


Epoch 8/10: 100%|██████████| 2152/2152 [03:28<00:00, 10.31it/s]


Epoch 8, Loss: 4.6023


Epoch 9/10: 100%|██████████| 2152/2152 [03:28<00:00, 10.30it/s]


Epoch 9, Loss: 4.3055


Epoch 10/10: 100%|██████████| 2152/2152 [03:28<00:00, 10.31it/s]

Epoch 10, Loss: 4.0276





In [25]:
# Step 4: Validate and save translations

train_file = "/content/train_data1.json"
val_file = "/content/test_data1_final.json"
val_sentences = load_validation_data(val_file)
translations = translate_validation_data(model, tokenizer_src, tokenizer_tgt, val_sentences)
save_translations(translations, "translated_test_data1.json")

# # Step 5: Translate Validation File (newly integrated)
# translate_validation_file(
#     val_file_path='/content/test_data1_final.json',
#     output_file_base='/content/translated_test',
#     model=model,
#     source_vocab=tokenizer_src.get_vocab(),
#     target_vocab=tokenizer_tgt.get_vocab(),
#     device=device
# )

Translating: 100%|██████████| 19672/19672 [2:15:55<00:00,  2.41it/s]

Translations saved to translated_test_data1.json



