In [None]:
import kagglehub

# Fetch the most recent published version of the dataset. The returned value
# is the local directory holding the extracted files, which later cells use
# to locate the CSV and vocabulary files.
DATASET_HANDLE = "shusrith/machine-trainslation"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shusrith/machine-trainslation?dataset_version_number=6...


100%|██████████| 176M/176M [00:01<00:00, 102MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/shusrith/machine-trainslation/versions/6


In [None]:
# List the dataset folder. The interpolated path is quoted so a download
# directory containing spaces is still passed to `ls` as a single argument.
!ls "{path}/EnglishOrSpanish"

eng_vocab_bpe.json  output1_bpe.csv  output2_bpe.csv  output_bpe.csv  spa_vocab_bpe.json
eng_vocab.json	    output1.csv      output2.csv      output.csv      spa_vocab.json


In [None]:
import pandas as pd

# The first, unnamed CSV column only repeats the row number, so it is loaded
# as the index instead of surfacing as a stray "Unnamed: 0" data column.
df = pd.read_csv(f"{path}/EnglishOrSpanish/output1_bpe.csv", index_col=0)
# Preview a handful of rows rather than rendering the whole frame.
df.head()

Unnamed: 0,English,Spanish
0,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 491, 23722, 23723, 23723, 23723, 23723..."
1,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 10639, 23722, 23723, 23723, 23723, 237..."
2,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 23720, 23722, 23723, 23723, 23723, 237..."
3,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 23720, 23722, 23723, 23723, 23723, 237..."
4,"[7954, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 23720, 23722, 23723, 23723, 23723, 237..."
...,...,...
132047,"[14450, 462, 14450, 8559, 14450, 1384, 14450, ...","[23720, 932, 23720, 88, 23720, 23720, 23720, 2..."
132048,"[539, 11964, 14450, 5507, 14450, 9330, 64, 103...","[23721, 2719, 23720, 23720, 23720, 281, 23720,..."
132049,"[14450, 5507, 14450, 14450, 14450, 14450, 1445...","[23720, 23720, 312, 23720, 23720, 8327, 88, 23..."
132050,"[14450, 1972, 14450, 14450, 14450, 14450, 1445...","[23720, 268, 568, 281, 23720, 331, 23720, 2372..."


In [None]:
import ast

def _parse_token_ids(value):
    """Turn a stringified list such as ``"[1, 2, 3]"`` into a real list.

    Values that are not strings are returned untouched, so re-running this
    cell after the columns were already parsed is a no-op instead of an error
    raised by ``ast.literal_eval`` on a list.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# read_csv loads the token-id lists as plain strings; convert both columns.
for column in ("English", "Spanish"):
    df[column] = df[column].apply(_parse_token_ids)
df.head()

Unnamed: 0,English,Spanish
0,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 491, 23722, 23723, 23723, 23723, 23723..."
1,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 10639, 23722, 23723, 23723, 23723, 237..."
2,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 23720, 23722, 23723, 23723, 23723, 237..."
3,"[4024, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 23720, 23722, 23723, 23723, 23723, 237..."
4,"[7954, 14452, 14453, 14453, 14453, 14453, 1445...","[23721, 23720, 23722, 23723, 23723, 23723, 237..."
...,...,...
132047,"[14450, 462, 14450, 8559, 14450, 1384, 14450, ...","[23720, 932, 23720, 88, 23720, 23720, 23720, 2..."
132048,"[539, 11964, 14450, 5507, 14450, 9330, 64, 103...","[23721, 2719, 23720, 23720, 23720, 281, 23720,..."
132049,"[14450, 5507, 14450, 14450, 14450, 14450, 1445...","[23720, 23720, 312, 23720, 23720, 8327, 88, 23..."
132050,"[14450, 1972, 14450, 14450, 14450, 14450, 1445...","[23720, 268, 568, 281, 23720, 331, 23720, 2372..."


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

TRAIN_ROWS = 100000  # rows before this position are for training, the rest for evaluation
BATCH_SIZE = 512


def _as_dataset(rows):
    """Wrap one slice of ``df`` as a dataset of (English, Spanish) int64 tensors."""
    english, spanish = (
        torch.tensor(df[column][rows].tolist(), dtype=torch.long)
        for column in ("English", "Spanish")
    )
    return TensorDataset(english, spanish)


train = _as_dataset(slice(None, TRAIN_ROWS))
test = _as_dataset(slice(TRAIN_ROWS, None))

# Both loaders discard the final incomplete batch; only the training loader shuffles.
train_loader = DataLoader(train, shuffle=True, drop_last=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test, shuffle=False, drop_last=True, batch_size=BATCH_SIZE)

In [None]:
import json
def _load_vocab(file_name):
    """Read one vocabulary JSON file from the dataset's EnglishOrSpanish folder.

    The file is opened as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(f"{path}/EnglishOrSpanish/{file_name}", "r", encoding="utf-8") as f:
        return json.load(f)


eng_vocab = _load_vocab("eng_vocab_bpe.json")
spa_vocab = _load_vocab("spa_vocab_bpe.json")

In [None]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    """Bidirectional multi-layer LSTM encoder over source token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        hidden_size: Embedding width and per-direction LSTM hidden width.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            stacked LSTM layers.
        padding_idx: Token id whose embedding is pinned to zeros. Defaults to
            14453, the value that used to be hard-coded here.
    """

    def __init__(self, vocab_size, hidden_size, num_layers=4, dropout=0.3, padding_idx=14453):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx

        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=padding_idx)
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is requested for a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers,
                            batch_first=True, bidirectional=True,
                            dropout=dropout if num_layers > 1 else 0.0)

        self.layer_norm = nn.LayerNorm(2 * hidden_size)
        self.hidden_norm = nn.LayerNorm(hidden_size)
        self.cell_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tuple ``(output, hidden, cell)``: ``output`` is the layer-normed
            LSTM output of shape (batch, seq_len, 2 * hidden_size); ``hidden``
            and ``cell`` are the layer-normed final states with both
            directions summed, each of shape (num_layers, batch, hidden_size).
        """
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.lstm(embedded)
        output = self.layer_norm(output)
        hidden = self.hidden_norm(self._combine_bidirectional(hidden))
        cell = self.cell_norm(self._combine_bidirectional(cell))
        return output, hidden, cell

    def _combine_bidirectional(self, tensor):
        """Sum the forward and backward final states of every layer.

        Takes a (num_layers * 2, batch, hidden_size) state tensor and returns
        one of shape (num_layers, batch, hidden_size).
        """
        batch_size = tensor.shape[1]
        # Split the leading axis into (layer, direction) before summing directions.
        tensor = tensor.reshape(self.num_layers, 2, batch_size, self.hidden_size)
        return tensor[:, 0, :, :] + tensor[:, 1, :, :]


In [None]:
import torch
import torch.nn as nn

class Decoder(nn.Module):
    """One-step LSTM decoder that scores the next target token.

    The LSTM is bidirectional, so it needs an initial state with
    ``2 * num_layers`` slices, while ``forward`` receives and returns states
    with ``num_layers`` slices (both directions summed). ``forward`` therefore
    expands the incoming state before calling the LSTM; without that step the
    LSTM rejects the state with a size-mismatch error.

    Args:
        vocab_size: Number of rows in the embedding table and of output logits.
        hidden_size: Embedding width and per-direction LSTM hidden width.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            stacked LSTM layers.
        padding_idx: Token id whose embedding is pinned to zeros. Defaults to
            23723, the id that fills the tail of the Spanish sequences in the
            dataset preview (the previously hard-coded 23722 is the token that
            precedes that padding run).
    """

    def __init__(self, vocab_size, hidden_size, num_layers=4, dropout=0.3, padding_idx=23723):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.num_directions = 2
        self.padding_idx = padding_idx

        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=padding_idx)

        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is requested for a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers,
                            batch_first=True, bidirectional=True,
                            dropout=dropout if num_layers > 1 else 0.0)

        self.layer_norm = nn.LayerNorm(hidden_size * 2)
        self.hidden_norm = nn.LayerNorm(hidden_size)
        self.cell_norm = nn.LayerNorm(hidden_size)

        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden, cell):
        """Run a single decoding step.

        Args:
            x: Integer tensor of current input token ids, shape (batch,).
            hidden: LSTM hidden state, shape (num_layers, batch, hidden_size);
                a state that already has ``2 * num_layers`` slices is passed
                to the LSTM unchanged.
            cell: LSTM cell state, same shape rules as ``hidden``.

        Returns:
            Tuple ``(output, hidden, cell)``: ``output`` holds the vocabulary
            logits of shape (batch, vocab_size); ``hidden`` and ``cell`` are
            the layer-normed new states with both directions summed, each of
            shape (num_layers, batch, hidden_size).
        """
        x = x.unsqueeze(1)  # (batch,) -> (batch, 1): a sequence of length one
        embedded = self.dropout(self.embedding(x))
        hidden = self._expand_state(hidden)
        cell = self._expand_state(cell)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        output = self.layer_norm(output)
        hidden = self.hidden_norm(self._combine_bidirectional(hidden))
        cell = self.cell_norm(self._combine_bidirectional(cell))
        output = self.fc(output).squeeze(1)
        return output, hidden, cell

    def _expand_state(self, tensor):
        """Give a per-layer state one slice per (layer, direction) pair.

        A state with ``num_layers`` leading slices is duplicated so that both
        directions of each layer start from the same values; the copies of a
        layer are adjacent, matching the (layer, direction) ordering that
        ``_combine_bidirectional`` unpacks. Other inputs are returned as is.
        """
        if tensor.shape[0] == self.num_layers:
            tensor = tensor.repeat_interleave(self.num_directions, dim=0)
        return tensor.contiguous()

    def _combine_bidirectional(self, tensor):
        """Sum the forward and backward states of every layer.

        Takes a (num_layers * 2, batch, hidden_size) state tensor and returns
        one of shape (num_layers, batch, hidden_size).
        """
        batch_size = tensor.shape[1]
        tensor = tensor.reshape(self.num_layers, self.num_directions, batch_size, self.hidden_size)
        return tensor[:, 0, :, :] + tensor[:, 1, :, :]


In [None]:
class Seq2Seq():
    """Training/validation driver for an encoder-decoder translation model.

    Args:
        encoder: Module called as ``encoder(src)`` returning
            ``(outputs, hidden, cell)``.
        decoder: Module called as ``decoder(tokens, hidden, cell)`` returning
            ``(logits, hidden, cell)``; must expose ``vocab_size``.
        device: Stored on the instance as ``self.device``.
        teacher_forcing_ratio: Probability, per decoding step during training,
            of feeding the ground-truth token instead of the model's own
            prediction as the next input.
    """

    def __init__(self, encoder, decoder, device, teacher_forcing_ratio):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.vocab_size = self.decoder.vocab_size
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def _decode(self, src, trg, teacher_forcing_ratio):
        """Encode ``src`` and decode step by step, returning the step logits.

        The first target column seeds the decoder and is never predicted, so
        the result has shape (batch, trg_len - 1, vocab_size) and lines up
        with ``trg[:, 1:]``.
        """
        batch_size, trg_len = trg.shape
        _, hidden, cell = self.encoder(src)
        step_logits = []
        x = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            step_logits.append(output)
            # Only draw a random number when teacher forcing is possible at all.
            teacher_force = teacher_forcing_ratio > 0 and torch.rand(1).item() < teacher_forcing_ratio
            x = trg[:, t] if teacher_force else output.argmax(dim=1)
        if not step_logits:
            # Targets of length one leave nothing to predict.
            return torch.zeros(batch_size, 0, self.vocab_size, device=trg.device)
        return torch.stack(step_logits, dim=1)

    def _loss_and_accuracy(self, logits, trg, criterion):
        """Score step logits against ``trg[:, 1:]``.

        Accuracy is measured over the same positions the loss is: the seed
        column is excluded, and so are targets equal to the criterion's
        ``ignore_index`` (when it has one).
        """
        targets = trg[:, 1:]
        loss = criterion(logits.reshape(-1, self.vocab_size), targets.reshape(-1))

        counted = torch.ones_like(targets, dtype=torch.bool)
        ignore_index = getattr(criterion, "ignore_index", None)
        if ignore_index is not None:
            counted = targets != ignore_index
        total = counted.sum().item()
        correct = ((logits.argmax(dim=2) == targets) & counted).sum().item()
        accuracy = correct / total if total else 0.0
        return loss, accuracy

    def train(self, train_loader, test_loader, enc_optimizer, dec_optimizer, criterion, device, encoder_scheduler, decoder_scheduler, num_epochs):
        """Train for ``num_epochs`` epochs, validating after each one.

        ``tqdm`` is looked up in the notebook's global namespace, so it must
        have been imported before this method is called.

        Args:
            train_loader: Iterable of ``(src, trg)`` batches used for training.
            test_loader: Iterable of ``(src, trg)`` batches used for validation.
            enc_optimizer: Optimizer stepping the encoder parameters.
            dec_optimizer: Optimizer stepping the decoder parameters.
            criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
            device: Device every batch is moved to.
            encoder_scheduler: Scheduler stepped with the validation loss.
            decoder_scheduler: Scheduler stepped with the validation loss.
            num_epochs: Number of epochs to run.

        Returns:
            ``(train_loss, train_acc, val_loss, val_acc)`` of the last epoch,
            each averaged over that epoch's batches; all NaN when
            ``num_epochs`` is 0.
        """
        train_loss = train_acc = val_loss = val_acc = float("nan")

        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch+1}/{num_epochs}")
            self.encoder.train()
            self.decoder.train()
            epoch_loss = 0
            epoch_acc = 0
            num_batches = 0
            progress_bar = tqdm(train_loader, desc="Training", leave=False)

            for src, trg in progress_bar:
                src, trg = src.to(device), trg.to(device)

                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                logits = self._decode(src, trg, self.teacher_forcing_ratio)
                loss, batch_acc = self._loss_and_accuracy(logits, trg, criterion)
                loss.backward()
                enc_optimizer.step()
                dec_optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += batch_acc
                num_batches += 1

                progress_bar.set_postfix(loss=f"{loss.item():.4f}", acc=f"{batch_acc:.4f}")

            train_loss = epoch_loss / max(num_batches, 1)
            train_acc = epoch_acc / max(num_batches, 1)

            # ---------------- VALIDATION ---------------- #
            self.encoder.eval()
            self.decoder.eval()
            val_epoch_loss = 0
            val_epoch_acc = 0
            num_batches = 0
            progress_bar = tqdm(test_loader, desc="Validating", leave=False)

            with torch.no_grad():
                for src, trg in progress_bar:
                    src, trg = src.to(device), trg.to(device)

                    # Ratio 0: the decoder always consumes its own predictions.
                    logits = self._decode(src, trg, 0.0)
                    loss, batch_acc = self._loss_and_accuracy(logits, trg, criterion)

                    val_epoch_loss += loss.item()
                    val_epoch_acc += batch_acc
                    num_batches += 1

                    progress_bar.set_postfix(loss=f"{loss.item():.4f}", acc=f"{batch_acc:.4f}")

            val_loss = val_epoch_loss / max(num_batches, 1)
            val_acc = val_epoch_acc / max(num_batches, 1)
            print(f"Train loss : {train_loss}, Train accuracy : {train_acc}, Val_loss : {val_loss}, val accuracy : {val_acc}")
            encoder_scheduler.step(val_loss)
            decoder_scheduler.step(val_loss)
        return train_loss, train_acc, val_loss, val_acc

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the +1 presumably reserves an embedding row for the padding id,
# which would sit one past the ids stored in the JSON vocabularies — confirm.
eng_vocab_size = len(eng_vocab) + 1
spa_vocab_size = len(spa_vocab) + 1

# In the dataframe preview the Spanish rows end with a single 23722 followed by
# a run of 23723, so 23723 is the padding id. The loss must skip that id;
# ignoring 23722 instead would train the model mostly on padding positions.
SPA_PAD_IDX = 23723

hidden_size = 256
encoder = Encoder(eng_vocab_size, hidden_size).to(device)
decoder = Decoder(spa_vocab_size, hidden_size).to(device)
seq2seq = Seq2Seq(encoder, decoder, device, 0.5)
criterion = nn.CrossEntropyLoss(ignore_index=SPA_PAD_IDX)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
# Each scheduler multiplies its optimizer's learning rate by 0.9 once the
# monitored value has stopped improving for more than `patience` epochs.
encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode='min', factor=0.9, patience=2)
decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode='min', factor=0.9, patience=2)

In [None]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

NUM_EPOCHS = 20

# Keyword arguments keep the long train() signature readable at the call site;
# the returned metrics tuple is left as the cell's displayed value.
seq2seq.train(
    train_loader=train_loader,
    test_loader=test_loader,
    enc_optimizer=encoder_optimizer,
    dec_optimizer=decoder_optimizer,
    criterion=criterion,
    device=device,
    encoder_scheduler=encoder_scheduler,
    decoder_scheduler=decoder_scheduler,
    num_epochs=NUM_EPOCHS,
)


Epoch 1/20




Train loss : 2.6090903300505417, Train accuracy : 0.6327254376232742, Val_loss : 2.1514345138303694, val accuracy : 0.5298905668424319

Epoch 2/20




Train loss : 1.1609265419153068, Train accuracy : 0.6595067492603547, Val_loss : 2.0911920724376554, val accuracy : 0.5103956653225806

Epoch 3/20


Training:  16%|█▌        | 31/195 [00:23<02:05,  1.31it/s, acc=0.6660, loss=1.0788]