# In [None]:
# Step 1: Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, models, pre_tokenizers, processors, trainers
from torch.utils.data import DataLoader, Dataset

# Step 2: Load the dataset
def load_sentences(file_path):
    """Read a UTF-8 text file and return its lines, whitespace-trimmed.

    Every line of the file produces exactly one list entry, so blank lines
    come back as empty strings and the result has one item per file line.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Iterating the handle yields the same lines readlines() would.
        return list(map(str.strip, handle))

train_en_path = '/mnt/data/train.en'
train_ur_path = '/mnt/data/train.ur'
train_en_sentences = load_sentences(train_en_path)
train_ur_sentences = load_sentences(train_ur_path)

# Ensure data alignment: line i of one file is paired with line i of the
# other, so differing line counts mean the pairs cannot be trusted. This is
# raised explicitly (not asserted) so the check still runs under `python -O`.
if len(train_en_sentences) != len(train_ur_sentences):
    raise ValueError(
        "Mismatch in sentence counts: "
        f"{train_en_path} has {len(train_en_sentences)} lines, "
        f"{train_ur_path} has {len(train_ur_sentences)} lines"
    )

# Step 3: Preprocess the data (clean and prepare)
def preprocess_lines(lines):
    """Return a new list with BOM characters and outer whitespace removed.

    Every U+FEFF character is dropped wherever it occurs in a line, and the
    result is then stripped of leading and trailing whitespace. The output
    has one entry per input line, in the same order.
    """
    bom = '\ufeff'
    cleaned = []
    for raw in lines:
        cleaned.append(raw.replace(bom, '').strip())
    return cleaned

# Run the same cleanup over both sides of the corpus. preprocess_lines keeps
# one output per input line, so the two lists stay index-aligned.
train_en_cleaned, train_ur_cleaned = map(
    preprocess_lines, (train_en_sentences, train_ur_sentences)
)

# Step 4: Train tokenizers and save them
_SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>"]


def _train_bpe_tokenizer(sentences, trainer, save_path):
    """Train a whitespace-pre-tokenized BPE tokenizer and save it to *save_path*.

    Args:
        sentences: Iterable of training strings.
        trainer: A ``trainers.BpeTrainer`` whose special tokens include
            ``<s>``, ``</s>`` and ``<unk>``.
        save_path: Destination of the serialized tokenizer JSON.

    Returns:
        The trained ``Tokenizer``.
    """
    # Without unk_token the BPE model has no token to emit for unseen
    # symbols, so the "<unk>" entry the trainer adds would never be used.
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer.train_from_iterator(sentences, trainer)

    # Special-token ids only exist after training. Wrapping every encoding in
    # <s> ... </s> gives the decoder a start symbol to shift against and an
    # end symbol to learn; the training loop feeds tgt[:, :-1] and predicts
    # tgt[:, 1:], which presumes exactly this layout.
    bos_id = tokenizer.token_to_id("<s>")
    eos_id = tokenizer.token_to_id("</s>")
    tokenizer.post_processor = processors.TemplateProcessing(
        single="<s> $A </s>",
        pair="<s> $A </s> $B:1 </s>:1",
        special_tokens=[("<s>", bos_id), ("</s>", eos_id)],
    )
    tokenizer.save(save_path)
    return tokenizer


trainer_en = trainers.BpeTrainer(special_tokens=_SPECIAL_TOKENS)
tokenizer_en = _train_bpe_tokenizer(
    train_en_cleaned, trainer_en, "/mnt/data/tokenizer_en.json"
)

trainer_ur = trainers.BpeTrainer(special_tokens=_SPECIAL_TOKENS)
tokenizer_ur = _train_bpe_tokenizer(
    train_ur_cleaned, trainer_ur, "/mnt/data/tokenizer_ur.json"
)

# Load tokenizers back from disk so the rest of the script runs on exactly
# what was serialized (post-processor included).
tokenizer_en = Tokenizer.from_file("/mnt/data/tokenizer_en.json")
tokenizer_ur = Tokenizer.from_file("/mnt/data/tokenizer_ur.json")

# Step 5: Tokenize the sentences for model input
def tokenize_sentences(tokenizer, sentences, max_length=50, pad_id=0):
    """Encode sentences to fixed-length lists of token ids.

    Args:
        tokenizer: Object whose ``encode(sentence)`` returns something with
            an ``ids`` list.
        sentences: Iterable of strings to encode.
        max_length: Exact length of every returned sequence. Longer
            encodings are truncated on the right, shorter ones are
            right-padded.
        pad_id: Id used for padding. Defaults to 0, the value this file's
            loss also uses as ``ignore_index``.

    Returns:
        A list with one ``max_length``-long list of ints per sentence.
    """
    padded = []
    for sentence in sentences:
        ids = tokenizer.encode(sentence).ids[:max_length]
        padded.append(ids + [pad_id] * (max_length - len(ids)))
    return padded

# Encode each side with its own tokenizer; X holds the source ids and y the
# target ids, index-aligned with the cleaned sentence lists.
X = tokenize_sentences(tokenizer=tokenizer_en, sentences=train_en_cleaned)
y = tokenize_sentences(tokenizer=tokenizer_ur, sentences=train_ur_cleaned)

# Step 6: Hold out 20% of the pairs for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 7: Create a PyTorch Dataset and DataLoader
class TranslationDataset(Dataset):
    """Index-aligned pairs of source and target token-id sequences.

    Both inputs are converted to ``torch.long`` tensors once at construction,
    so item lookup is a plain tensor index.
    """

    def __init__(self, source, target):
        """Store *source* and *target* as long tensors.

        Args:
            source: Rectangular nested sequence of ints, one row per example.
            target: Same, with as many rows as *source*.

        Raises:
            ValueError: If the two inputs hold a different number of rows;
                pairing them by index would silently mismatch examples.
        """
        self.source = torch.tensor(source, dtype=torch.long)
        self.target = torch.tensor(target, dtype=torch.long)
        if len(self.source) != len(self.target):
            raise ValueError(
                f"source has {len(self.source)} examples but target has "
                f"{len(self.target)}"
            )

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.source)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` tensors stored at *idx*."""
        return self.source[idx], self.target[idx]

# Wrap each split in a dataset of (source, target) tensor pairs.
train_dataset = TranslationDataset(source=X_train, target=y_train)
val_dataset = TranslationDataset(source=X_val, target=y_val)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

# Step 8: Implement a basic Transformer model (Encoder-Decoder)
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer over token ids, with batch-first inputs.

    ``forward`` takes ``src`` of shape ``(batch, src_len)`` and ``tgt`` of
    shape ``(batch, tgt_len)`` and returns logits of shape
    ``(batch, tgt_len, output_dim)``.

    Args:
        input_dim: Source vocabulary size.
        output_dim: Target vocabulary size.
        emb_dim: Embedding / model width (must be divisible by ``nhead``).
        nhead: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        pad_idx: Id treated as padding and hidden from attention. Pass
            ``None`` to disable padding masks.
    """

    def __init__(self, input_dim, output_dim, emb_dim=256, nhead=8, num_layers=3,
                 pad_idx=0):
        super().__init__()
        self.emb_dim = emb_dim
        self.pad_idx = pad_idx
        self.encoder = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.decoder = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.transformer = nn.Transformer(
            d_model=emb_dim,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            # nn.Transformer defaults to (seq, batch, feature). The inputs
            # here are (batch, seq), so without this flag the batch and
            # sequence axes are swapped and src/tgt of different lengths
            # are rejected.
            batch_first=True,
        )
        self.fc_out = nn.Linear(emb_dim, output_dim)
        # Never read by this class; kept so existing attribute access works.
        self.src_mask = None

    def _positional_encoding(self, length, like):
        """Return a ``(length, emb_dim)`` sinusoidal position table.

        The table is built on ``like``'s device and cast to its dtype.
        """
        position = torch.arange(
            length, device=like.device, dtype=torch.float32
        ).unsqueeze(1)
        even_dims = torch.arange(
            0, self.emb_dim, 2, device=like.device, dtype=torch.float32
        )
        angles = position / (10000.0 ** (even_dims / self.emb_dim))
        encoding = torch.zeros(length, self.emb_dim, device=like.device)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd emb_dim there is one fewer odd column than even columns.
        encoding[:, 1::2] = torch.cos(angles)[:, : self.emb_dim // 2]
        return encoding.to(like.dtype)

    def _padding_mask(self, tokens):
        """Return a bool mask that is True at padding positions, or None."""
        if self.pad_idx is None:
            return None
        mask = tokens == self.pad_idx
        # A row with every key masked makes softmax return NaN. Keep the
        # first position visible; this only changes rows that are all padding.
        mask[:, :1] = False
        return mask

    def forward(self, src, tgt):
        """Compute next-token logits for *tgt* given *src*.

        Args:
            src: Long tensor ``(batch, src_len)`` of source ids.
            tgt: Long tensor ``(batch, tgt_len)`` of decoder-input ids.

        Returns:
            Float tensor ``(batch, tgt_len, output_dim)``.
        """
        scale = self.emb_dim ** 0.5
        src_emb = self.encoder(src) * scale
        tgt_emb = self.decoder(tgt) * scale
        # Attention is order-agnostic; add explicit position information.
        src_emb = src_emb + self._positional_encoding(src.size(1), src_emb)
        tgt_emb = tgt_emb + self._positional_encoding(tgt.size(1), tgt_emb)

        # Causal mask: position i may not attend to positions after i,
        # otherwise the decoder can read the token it is asked to predict.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
            diagonal=1,
        )
        src_padding = self._padding_mask(src)
        output = self.transformer(
            src_emb,
            tgt_emb,
            tgt_mask=causal_mask,
            src_key_padding_mask=src_padding,
            tgt_key_padding_mask=self._padding_mask(tgt),
            memory_key_padding_mask=src_padding,
        )
        return self.fc_out(output)

# Step 9: Initialize the model, loss function, and optimizer
# Vocabulary sizes set the embedding table sizes and the output layer width.
input_dim = len(tokenizer_en.get_vocab())
output_dim = len(tokenizer_ur.get_vocab())

model = TransformerModel(input_dim=input_dim, output_dim=output_dim)
# Targets equal to 0 (the value used for padding) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=0.0005)

# Step 10: Training loop
def train(model, iterator, optimizer, criterion):
    """Run one teacher-forced training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(src, decoder_input)`` that returns
            logits whose last dimension is the vocabulary size.
        iterator: Iterable of ``(src, tgt)`` batches; ``tgt`` is indexed as
            ``(batch, seq)``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.

    Returns:
        The average of the per-batch losses as a float, or ``0.0`` when the
        iterator yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for src, tgt in iterator:
        optimizer.zero_grad()
        # Teacher forcing: feed every token but the last, predict every
        # token but the first.
        decoder_input = tgt[:, :-1]
        expected = tgt[:, 1:]
        logits = model(src, decoder_input)
        # Take the vocabulary size from the logits themselves rather than
        # from a module-level global.
        loss = criterion(
            logits.reshape(-1, logits.size(-1)), expected.reshape(-1)
        )
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    # Counting batches avoids len(), which unsized iterables lack, and
    # avoids a ZeroDivisionError on an empty iterator.
    return epoch_loss / num_batches if num_batches else 0.0

# Training the model (use a smaller loop for initial testing)
num_epochs = 10
for epoch in range(num_epochs):
    # One full pass over the training batches; train() returns the mean loss.
    train_loss = train(
        model=model,
        iterator=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    # epoch is zero-based, so it is shifted by one for display.
    print(f'Epoch {epoch + 1}, Training Loss: {train_loss:.4f}')

print("Model training complete!")
