In [1]:
import kagglehub

# Download the latest version of the dataset through kagglehub. The returned
# `path` is the location of the dataset files and is reused by later cells to
# build the CSV, vocab and merges file paths.
path = kagglehub.dataset_download("shusrith/machine-trainslation")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/


In [2]:
import pandas as pd
from sklearn.utils import shuffle
df = pd.read_csv(f"{path}/EnglishOrSpanish/EnglishOrSpanish/output_joint1.csv")
# Shuffle the rows once with a fixed seed: the train/test split made later in
# the notebook is positional, so an unseeded shuffle would put different rows
# in each split on every Restart & Run All.
df = shuffle(df, random_state=42)
df

Unnamed: 0,English,Spanish
32322,"[2, 300, 719, 334, 24105, 609, 224, 3, 0, 0, 0...","[2, 331, 874, 336, 13598, 4587, 224, 3, 0, 0, ..."
107322,"[2, 964, 568, 987, 395, 461, 341, 272, 718, 29...","[2, 302, 664, 761, 316, 7358, 550, 470, 1688, ..."
70484,"[2, 270, 588, 2137, 404, 1291, 2622, 224, 3, 0...","[2, 302, 665, 5163, 589, 4213, 33221, 224, 3, ..."
65539,"[2, 696, 304, 5732, 609, 300, 13745, 224, 3, 0...","[2, 2419, 13374, 321, 2739, 224, 3, 0, 0, 0, 0..."
65750,"[2, 2091, 384, 1598, 398, 290, 224, 3, 0, 0, 0...","[2, 691, 331, 1423, 621, 2526, 265, 290, 224, ..."
...,...,...
96787,"[2, 290, 1346, 718, 10408, 611, 369, 224, 3, 0...","[2, 290, 302, 2003, 4223, 8339, 387, 369, 224,..."
22019,"[2, 565, 384, 1055, 1650, 224, 3, 0, 0, 0, 0, ...","[2, 444, 1819, 2297, 321, 1729, 294, 290, 224,..."
21629,"[2, 1606, 506, 15041, 224, 3, 0, 0, 0, 0, 0, 0...","[2, 746, 1815, 12533, 224, 3, 0, 0, 0, 0, 0, 0..."
61011,"[2, 481, 426, 341, 270, 411, 272, 341, 783, 22...","[2, 292, 444, 2498, 294, 550, 872, 224, 3, 0, ..."


In [3]:
import ast

def _parse_token_ids(value):
    """Return ``value`` as a Python object, parsing it only if it is still text.

    The CSV stores each token-id sequence as the text of a Python list
    literal. Values that are no longer strings are returned unchanged, so this
    cell can be re-run without ``ast.literal_eval`` raising on a list that was
    already parsed by a previous run.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


df["English"] = df["English"].apply(_parse_token_ids)
df["Spanish"] = df["Spanish"].apply(_parse_token_ids)
df

Unnamed: 0,English,Spanish
32322,"[2, 300, 719, 334, 24105, 609, 224, 3, 0, 0, 0...","[2, 331, 874, 336, 13598, 4587, 224, 3, 0, 0, ..."
107322,"[2, 964, 568, 987, 395, 461, 341, 272, 718, 29...","[2, 302, 664, 761, 316, 7358, 550, 470, 1688, ..."
70484,"[2, 270, 588, 2137, 404, 1291, 2622, 224, 3, 0...","[2, 302, 665, 5163, 589, 4213, 33221, 224, 3, ..."
65539,"[2, 696, 304, 5732, 609, 300, 13745, 224, 3, 0...","[2, 2419, 13374, 321, 2739, 224, 3, 0, 0, 0, 0..."
65750,"[2, 2091, 384, 1598, 398, 290, 224, 3, 0, 0, 0...","[2, 691, 331, 1423, 621, 2526, 265, 290, 224, ..."
...,...,...
96787,"[2, 290, 1346, 718, 10408, 611, 369, 224, 3, 0...","[2, 290, 302, 2003, 4223, 8339, 387, 369, 224,..."
22019,"[2, 565, 384, 1055, 1650, 224, 3, 0, 0, 0, 0, ...","[2, 444, 1819, 2297, 321, 1729, 294, 290, 224,..."
21629,"[2, 1606, 506, 15041, 224, 3, 0, 0, 0, 0, 0, 0...","[2, 746, 1815, 12533, 224, 3, 0, 0, 0, 0, 0, 0..."
61011,"[2, 481, 426, 341, 270, 411, 272, 341, 783, 22...","[2, 292, 444, 2498, 294, 550, 872, 224, 3, 0, ..."


In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Rows [0, TRAIN_SIZE) of the shuffled frame are used for training; the
# remaining rows are held out for validation.
TRAIN_SIZE = 100000


def _to_dataset(frame):
    """Build a TensorDataset of (English, Spanish) token-id tensors from ``frame``."""
    return TensorDataset(
        torch.tensor(frame["English"].tolist(), dtype=torch.long),
        torch.tensor(frame["Spanish"].tolist(), dtype=torch.long),
    )


# Named *_dataset instead of `train`/`test`: a later cell defines a function
# called `train`, and sharing that name would make re-running this cell
# silently replace the function with a dataset.
# `.iloc` makes the split explicitly positional (the index is shuffled).
train_dataset = _to_dataset(df.iloc[:TRAIN_SIZE])
test_dataset = _to_dataset(df.iloc[TRAIN_SIZE:])
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, drop_last=True)

In [5]:
import json
# Read with an explicit encoding: JSON text is UTF-8, whereas open() without
# `encoding` falls back to the platform's locale encoding.
with open(
    f"{path}/EnglishOrSpanish/EnglishOrSpanish/vocab.json", "r", encoding="utf-8"
) as f:
    vocab = json.load(f)

In [6]:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    """Embed token ids and run them through a stacked, batch-first LSTM.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout value passed to ``nn.LSTM``.
    """

    def __init__(
        self, vocab_size, embedding_dim, hidden_size, num_layers=2, dropout=0.3
    ):
        super().__init__()
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Token id 0 is registered as the padding index of the embedding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Re-initialise the LSTM parameters after the default construction.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise LSTM weights and zero LSTM biases.

        Modules that are not an ``nn.LSTM`` are left untouched.
        """
        if not isinstance(module, nn.LSTM):
            return
        for param_name, tensor in module.named_parameters():
            if "weight" in param_name:
                nn.init.xavier_uniform_(tensor)
            elif "bias" in param_name:
                nn.init.zeros_(tensor)

    def forward(self, x):
        """Return ``(output, hidden, cell)`` of the LSTM for token ids ``x``."""
        output, (hidden, cell) = self.lstm(self.embedding(x))
        return output, hidden, cell

In [7]:
import torch
import torch.nn as nn


class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per sequence to vocabulary logits.

    Args:
        vocab_size: Number of entries in the embedding table and size of the
            output logits.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout value used inside ``nn.LSTM`` and on the LSTM output.
    """

    def __init__(
        self, vocab_size, embedding_dim, hidden_size, num_layers=2, dropout=0.3
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Token id 0 is registered as the padding index of the embedding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Projects each LSTM output vector onto the vocabulary.
        self.fc = nn.Linear(hidden_size, vocab_size)
        # Re-initialise the LSTM parameters after the default construction.
        self.apply(self._init_weights)
        self.dropout = nn.Dropout(dropout)

    def _init_weights(self, module):
        """Xavier-initialise LSTM weights and zero LSTM biases.

        Modules that are not an ``nn.LSTM`` are left untouched.
        """
        if not isinstance(module, nn.LSTM):
            return
        for param_name, tensor in module.named_parameters():
            if "weight" in param_name:
                nn.init.xavier_uniform_(tensor)
            elif "bias" in param_name:
                nn.init.zeros_(tensor)

    def forward(self, x, hidden, cell):
        """Decode one step.

        ``x`` holds one token id per sequence; a length-1 time axis is
        inserted at dim 1 before the batch-first LSTM is called.

        Returns:
            ``(logits, hidden, cell)`` where ``logits`` keeps the length-1
            time axis and ``hidden``/``cell`` are the updated LSTM state.
        """
        step_input = self.embedding(x).unsqueeze(1)
        lstm_out, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        logits = self.fc(self.dropout(lstm_out))
        return logits, hidden, cell

In [8]:
import torch.nn as nn
import torch
import random

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: Module called as ``encoder(src)`` that returns
            ``(output, hidden, cell)``.
        decoder: Module called as ``decoder(x, hidden, cell)`` that returns
            ``(logits, hidden, cell)`` with a length-1 axis at dim 1 of
            ``logits``; it must expose a ``vocab_size`` attribute.
        vocab_size: Vocabulary size, stored for use by callers.
        teacher_forcing_ratio: Probability, drawn independently at every
            decoding step, of feeding the ground-truth token instead of the
            model's own argmax prediction as the next decoder input.
    """

    def __init__(self, encoder, decoder, vocab_size, teacher_forcing_ratio=0.5):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = vocab_size
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, src, trg):
        """Decode ``trg.shape[1] - 1`` steps with stochastic teacher forcing.

        Returns:
            Tensor of shape ``(batch, trg_len, decoder.vocab_size)``.
            Position 0 is never written and stays all zeros; positions
            ``1..trg_len-1`` hold the decoder logits.
        """
        _, hidden, cell = self.encoder(src)
        # Allocate the buffer on the inputs' device instead of a hard-coded
        # "cuda", so the model also runs when only a CPU is available.
        outputs = torch.zeros(
            trg.shape[0], trg.shape[1], self.decoder.vocab_size, device=trg.device
        )
        x = trg[:, 0]
        for t in range(1, trg.shape[1]):
            output, hidden, cell = self.decoder(x, hidden, cell)
            output = output.squeeze(1)
            outputs[:, t, :] = output
            use_teacher_forcing = random.random() < self.teacher_forcing_ratio
            x = trg[:, t] if use_teacher_forcing else output.argmax(dim=1)

        return outputs

    def predict(self, src):
        """Greedy-decode ``src.shape[1] - 1`` steps without any target.

        The first decoder input is ``src[:, 0]``, i.e. the source's first
        token is reused as the start token. Decoding always runs for the
        full length; it does not stop early at an end-of-sequence token.

        Returns:
            Tensor of shape ``(batch, src_len, decoder.vocab_size)``.
            Position 0 is never written and stays all zeros, so an argmax
            over the last axis yields index 0 there.
        """
        _, hidden, cell = self.encoder(src)
        # Same device handling as in forward(): follow the input tensor.
        outputs = torch.zeros(
            src.shape[0], src.shape[1], self.decoder.vocab_size, device=src.device
        )
        x = src[:, 0]
        for i in range(1, src.shape[1]):
            output, hidden, cell = self.decoder(x, hidden, cell)
            output = output.squeeze(1)
            outputs[:, i, :] = output
            x = output.argmax(dim=1)

        return outputs

In [9]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def _batch_loss(model, criterion, src, trg):
    """Run one forward pass and score every target position except the first.

    Position 0 of the model output is skipped, together with the matching
    first target token, before both are flattened for ``criterion``.
    """
    outputs = model(src, trg)
    return criterion(
        outputs[:, 1:].reshape(-1, model.vocab_size), trg[:, 1:].reshape(-1)
    )


def train(
    model,
    train_loader,
    test_loader,
    optimizer,
    criterion,
    device,
    scheduler,
    num_epochs,
):
    """Train ``model`` and validate it after every epoch.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``test_loader``, steps ``scheduler`` with the mean
    validation loss, and logs both losses and the learning rate to
    TensorBoard and stdout.

    Args:
        model: Module called as ``model(src, trg)``; must expose ``vocab_size``.
        train_loader: Iterable of ``(src, trg)`` training batches.
        test_loader: Iterable of ``(src, trg)`` validation batches.
        optimizer: Optimizer over the model parameters.
        criterion: Loss called with flattened logits and flattened targets.
        device: Device each batch is moved to.
        scheduler: Scheduler stepped with the validation loss; must provide
            ``get_last_lr()``.
        num_epochs: Number of epochs to run.

    Returns:
        ``(train_loss, val_loss)`` of the last epoch, each the mean of the
        per-batch losses; ``(nan, nan)`` when ``num_epochs`` is 0.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail before training instead of dividing by zero after a full pass.
    if len(train_loader) == 0 or len(test_loader) == 0:
        raise ValueError(
            "train_loader and test_loader must each yield at least one batch"
        )

    # Defined up front so the return statement is valid when no epoch runs.
    train_loss = val_loss = float("nan")
    writer = SummaryWriter()

    # try/finally closes the writer even if training is interrupted.
    try:
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch+1}/{num_epochs}")
            model.train()
            epoch_loss = 0
            progress_bar = tqdm(train_loader, desc="Training", leave=False)

            for src, trg in progress_bar:
                src, trg = src.to(device), trg.to(device)
                optimizer.zero_grad()
                loss = _batch_loss(model, criterion, src, trg)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                progress_bar.set_postfix(loss=f"{loss.item():.4f}")

            train_loss = epoch_loss / len(train_loader)

            # ---------------- VALIDATION ---------------- #
            model.eval()
            val_epoch_loss = 0
            progress_bar = tqdm(test_loader, desc="Validating", leave=False)

            with torch.no_grad():
                for src, trg in progress_bar:
                    src, trg = src.to(device), trg.to(device)
                    loss = _batch_loss(model, criterion, src, trg)
                    val_epoch_loss += loss.item()
                    progress_bar.set_postfix(loss=f"{loss.item():.4f}")

            val_loss = val_epoch_loss / len(test_loader)
            scheduler.step(val_loss)
            current_lr = scheduler.get_last_lr()[0]

            writer.add_scalar("Loss/Train", train_loss, epoch)
            writer.add_scalar("Loss/Validation", val_loss, epoch)
            writer.add_scalar("Learning Rate", current_lr, epoch)

            print(
                f"Train loss : {train_loss}, Val_loss : {val_loss}, lr: {current_lr}"
            )
    finally:
        writer.close()

    return train_loss, val_loss

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model dimensions; the vocabulary size comes from the loaded vocab mapping.
vocab_size = len(vocab)
embedding_dim = 512
hidden_size = 64

# Encoder and decoder use the same dimensions.
encoder = Encoder(
    vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_size=hidden_size
).to(device)
decoder = Decoder(
    vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_size=hidden_size
).to(device)
seq2seq = Seq2Seq(encoder=encoder, decoder=decoder, vocab_size=vocab_size).to(device)

# Targets equal to 0 are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(seq2seq.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate when the monitored loss stops improving for 2 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=2
)

In [11]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

# Run 30 epochs; the cell's value is the (train_loss, val_loss) tuple that
# train() returns for the final epoch.
train(
    model=seq2seq,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    scheduler=scheduler,
    num_epochs=30,
)


Epoch 1/30


                                                                        

Train loss : 6.687192748143123, Val_loss : 5.654639618737357, lr: 0.001

Epoch 2/30


                                                                        

Train loss : 5.6322522994799495, Val_loss : 5.445596354348319, lr: 0.001

Epoch 3/30


                                                                        

Train loss : 5.444505283160088, Val_loss : 5.27954580954143, lr: 0.001

Epoch 4/30


                                                                        

Train loss : 5.288971583048503, Val_loss : 5.134128902639661, lr: 0.001

Epoch 5/30


                                                                        

Train loss : 5.161596105037591, Val_loss : 5.02119197164263, lr: 0.001

Epoch 6/30


                                                                        

Train loss : 5.0438000679016115, Val_loss : 4.901165851524898, lr: 0.001

Epoch 7/30


                                                                        

Train loss : 4.948218866494986, Val_loss : 4.792365610599518, lr: 0.001

Epoch 8/30


                                                                        

Train loss : 4.861228346213316, Val_loss : 4.702058093888419, lr: 0.001

Epoch 9/30


                                                                        

Train loss : 4.752602577209473, Val_loss : 4.596114567347935, lr: 0.001

Epoch 10/30


                                                                        

Train loss : 4.660875335106483, Val_loss : 4.4997197559901645, lr: 0.001

Epoch 11/30


                                                                        

Train loss : 4.566570790608724, Val_loss : 4.402465045452118, lr: 0.001

Epoch 12/30


                                                                        

Train loss : 4.479857266254914, Val_loss : 4.333659657410213, lr: 0.001

Epoch 13/30


                                                                        

Train loss : 4.400081375317696, Val_loss : 4.237634181976318, lr: 0.001

Epoch 14/30


                                                                        

Train loss : 4.32980751869006, Val_loss : 4.17634516954422, lr: 0.001

Epoch 15/30


                                                                        

Train loss : 4.243819275880472, Val_loss : 4.126359177487237, lr: 0.001

Epoch 16/30


                                                                        

Train loss : 4.178406831545708, Val_loss : 4.059587312596185, lr: 0.001

Epoch 17/30


                                                                        

Train loss : 4.118538665771484, Val_loss : 3.972640429224287, lr: 0.001

Epoch 18/30


                                                                        

Train loss : 4.05635556441087, Val_loss : 3.906147850411279, lr: 0.001

Epoch 19/30


                                                                        

Train loss : 4.010618190276317, Val_loss : 3.8783247896603177, lr: 0.001

Epoch 20/30


                                                                        

Train loss : 3.9558753343728874, Val_loss : 3.82585945725441, lr: 0.001

Epoch 21/30


                                                                        

Train loss : 3.891662178284083, Val_loss : 3.7950908669403622, lr: 0.001

Epoch 22/30


                                                                        

Train loss : 3.87375993117308, Val_loss : 3.7617944606712888, lr: 0.001

Epoch 23/30


                                                                        

Train loss : 3.7953803539276123, Val_loss : 3.7227149648325786, lr: 0.001

Epoch 24/30


                                                                        

Train loss : 3.7668511207287128, Val_loss : 3.6802377743380412, lr: 0.001

Epoch 25/30


                                                                        

Train loss : 3.713581729546571, Val_loss : 3.6232282519340515, lr: 0.001

Epoch 26/30


                                                                        

Train loss : 3.6682197350722094, Val_loss : 3.585292726755142, lr: 0.001

Epoch 27/30


                                                                        

Train loss : 3.623416779591487, Val_loss : 3.584328361919948, lr: 0.001

Epoch 28/30


                                                                        

Train loss : 3.6013498257368037, Val_loss : 3.5853850288050517, lr: 0.001

Epoch 29/30


                                                                        

Train loss : 3.5387104499034394, Val_loss : 3.490789668900626, lr: 0.001

Epoch 30/30


                                                                        

Train loss : 3.518312445665017, Val_loss : 3.470277441399438, lr: 0.001




(3.518312445665017, 3.470277441399438)

In [12]:
# Persist only the model's state_dict (not the module object) to a file in the
# current working directory.
torch.save(seq2seq.state_dict(), "seq2seq-noattention.pth")

In [13]:
# Rebuild the same architecture with fresh weights, then restore the saved
# state_dict into it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(vocab)
embedding_dim = 512
hidden_size = 64
encoder = Encoder(vocab_size, embedding_dim, hidden_size).to(device)
decoder = Decoder(vocab_size, embedding_dim, hidden_size).to(device)
seq2seq = Seq2Seq(encoder, decoder, vocab_size).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a CUDA model also loads on a CPU-only machine.
seq2seq.load_state_dict(
    torch.load("seq2seq-noattention.pth", map_location=device, weights_only=True)
)

<All keys matched successfully>

In [14]:
# Switch to inference mode and greedy-decode one batch drawn from the
# (shuffled) training loader; `x` holds the argmax token id per position.
seq2seq.eval()
a, b = (t.to(device) for t in next(iter(train_loader)))
with torch.no_grad():
    x = seq2seq.predict(a).argmax(dim=2)
x.shape

torch.Size([512, 18])

In [15]:
from tokenizers import ByteLevelBPETokenizer

# Build the tokenizer from the vocab and merges files found under the
# downloaded dataset path.
tokenizer = ByteLevelBPETokenizer(
    f"{path}/EnglishOrSpanish/EnglishOrSpanish/vocab.json",
    f"{path}/EnglishOrSpanish/EnglishOrSpanish/merges.txt",
)

In [16]:
# Decode row 2 of the batch: first the reference Spanish tokens (b), then the
# model's predicted tokens (x).
print(tokenizer.decode(b[2].tolist()))
print(tokenizer.decode(x[2].tolist()))

<SOS> tengo malas noticias para vosotros <EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<PAD> tengo mucho de por <EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS> <EOS><EOS><EOS>
