In [1]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("shusrith/machine-trainslation")

# print("Path to dataset files:", path)

In [2]:
import pandas as pd
from sklearn.utils import shuffle

df = pd.read_csv(f"EnglishOrSpanish/output_joint1.csv")
df = shuffle(df)
df

Unnamed: 0,English,Spanish
12646,"[2, 744, 700, 606, 224, 3, 0, 0, 0, 0, 0, 0, 0...","[2, 387, 636, 3573, 627, 224, 3, 0, 0, 0, 0, 0..."
41508,"[2, 1638, 1598, 398, 304, 224, 3, 0, 0, 0, 0, ...","[2, 1420, 918, 2526, 224, 3, 0, 0, 0, 0, 0, 0,..."
5372,"[2, 1823, 272, 290, 224, 3, 0, 0, 0, 0, 0, 0, ...","[2, 5328, 265, 290, 224, 3, 0, 0, 0, 0, 0, 0, ..."
95584,"[2, 270, 2038, 356, 1632, 272, 572, 304, 265, ...","[2, 265, 349, 950, 665, 15003, 323, 14194, 224..."
72045,"[2, 7832, 13583, 437, 300, 1366, 224, 3, 0, 0,...","[2, 3428, 284, 265, 321, 1429, 224, 3, 0, 0, 0..."
...,...,...
14008,"[2, 448, 2408, 334, 2246, 224, 3, 0, 0, 0, 0, ...","[2, 886, 321, 2834, 17753, 883, 6337, 224, 3, ..."
37021,"[2, 404, 334, 14295, 8359, 224, 3, 0, 0, 0, 0,...","[2, 690, 302, 590, 331, 4757, 3508, 224, 3, 0,..."
77830,"[2, 304, 803, 1430, 332, 272, 7519, 224, 3, 0,...","[2, 960, 831, 22894, 265, 321, 7205, 224, 3, 0..."
37688,"[2, 395, 595, 930, 18319, 224, 3, 0, 0, 0, 0, ...","[2, 6361, 5236, 16994, 224, 3, 0, 0, 0, 0, 0, ..."


In [None]:
import ast

df["English"] = df["English"].apply(ast.literal_eval)
df["Spanish"] = df["Spanish"].apply(ast.literal_eval)
df

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

train_data = TensorDataset(
    torch.tensor(df["English"][:100000].tolist(), dtype=torch.long),
    torch.tensor(df["Spanish"][:100000].tolist(), dtype=torch.long),
)
test_data = TensorDataset(
    torch.tensor(df["English"][100000:].tolist(), dtype=torch.long),
    torch.tensor(df["Spanish"][100000:].tolist(), dtype=torch.long),
)
train_loader = DataLoader(train_data, batch_size=256, shuffle=True, drop_last=True)
test_loader = DataLoader(test_data, batch_size=256, shuffle=False, drop_last=True)

In [None]:
import json

with open(f"EnglishOrSpanish/vocab.json", "r") as f:
    vocab = json.load(f)

In [None]:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(
        self, vocab_size, embedding_dim, hidden_size, num_layers=2, dropout=0.3
    ):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.LSTM):
            for name, param in module.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.zeros_(param)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.lstm(embedded)
        return output, hidden, cell

In [None]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size

        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, enc_out):
        hidden = hidden.unsqueeze(1)
        score = self.v(torch.tanh(self.W1(hidden) + self.W2(enc_out)))
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * enc_out, dim=1)
        return context_vector, attention_weights

In [None]:
import torch
import torch.nn as nn


class Decoder(nn.Module):
    def __init__(
        self, vocab_size, embedding_dim, hidden_size, num_layers=2, dropout=0.3
    ):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            embedding_dim + hidden_size * 2,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.LSTM):
            for name, param in module.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.zeros_(param)

    def forward(self, x, hidden, cell, enc_out):
        attention_vector, _ = self.attention(hidden[0], enc_out)
        x = self.embedding(x)
        x = torch.cat((attention_vector, x), dim=-1)
        x = x.unsqueeze(1)
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        output = self.dropout(output)
        output = self.fc(output)
        return output, hidden, cell

In [None]:
import torch.nn as nn
import torch
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, vocab_size, teacher_forcing_ratio=0.5):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = vocab_size
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, src, trg):
        enc_out, hidden, cell = self.encoder(src)
        outputs = torch.zeros(trg.shape[0], trg.shape[1], self.decoder.vocab_size).to("cuda")
        x = trg[:, 0]
        for t in range(1, trg.shape[1]):
            output, hidden, cell = self.decoder(x, hidden, cell, enc_out)
            output = output.squeeze(1)
            outputs[:, t, :] = output
            use_teacher_forcing = random.random() < self.teacher_forcing_ratio
            x = trg[:, t] if use_teacher_forcing else output.argmax(dim=1)

        return outputs
    
    def predict(self, src):
        enc_out, hidden, cell = self.encoder(src)
        outputs = torch.zeros(src.shape[0], src.shape[1], self.decoder.vocab_size).to("cuda")
        x = src[:, 0]
        for i in range(1, src.shape[1]):
            output, hidden, cell = self.decoder(x, hidden, cell, enc_out)
            output = output.squeeze(1)
            outputs[:, i, :] = output
            x = output.argmax(dim=1)
        
        return outputs

In [None]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

def train(
        model,
        train_loader,
        test_loader,
        optimizer,
        criterion,
        device,
        scheduler, 
        num_epochs,
    ):
    writer = SummaryWriter()

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        model.encoder.train()
        model.decoder.train()
        epoch_loss = 0
        progress_bar = tqdm(train_loader, desc="Training", leave=False)

        for src, trg in progress_bar:
            src, trg = src.to(device), trg.to(device)
            optimizer.zero_grad()
            outputs = model(src, trg)
            loss = criterion(
                    outputs[:, 1:].reshape(-1, model.vocab_size), trg[:, 1:].reshape(-1)
                )
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            progress_bar.set_postfix(
                    loss=f"{loss.item():.4f}"
                )

        train_loss = epoch_loss / len(train_loader)

        # ---------------- VALIDATION ---------------- #
        model.encoder.eval()
        model.decoder.eval()
        val_epoch_loss = 0
        progress_bar = tqdm(test_loader, desc="Validating", leave=False)

        with torch.no_grad():
            for src, trg in progress_bar:
                src, trg = src.to(device), trg.to(device)
                outputs = model(src, trg)
                loss = criterion(
                        outputs[:, 1:].reshape(-1, model.vocab_size),
                        trg[:, 1:].reshape(-1),
                    )
                val_epoch_loss += loss.item()

                progress_bar.set_postfix(
                        loss=f"{loss.item():.4f}"
                    )

        val_loss = val_epoch_loss / len(test_loader)
        scheduler.step(val_loss)

        # Log metrics to TensorBoard
        writer.add_scalar("Loss/Train", train_loss, epoch)
        writer.add_scalar("Loss/Validation", val_loss, epoch)
        writer.add_scalar("Learning Rate", scheduler.get_last_lr()[0], epoch)

        print(
            f"Train loss : {train_loss}, Val_loss : {val_loss}, lr: {scheduler.get_last_lr()[0]}"
        )

    writer.close()
    return train_loss, val_loss

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(vocab)
embedding_dim = 256
hidden_size = 64
encoder = Encoder(vocab_size, embedding_dim, hidden_size).to(device)
decoder = Decoder(vocab_size, embedding_dim, hidden_size).to(device)
seq2seq = Seq2Seq(encoder, decoder, vocab_size).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(seq2seq.parameters(), lr=0.005, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=2
)

In [None]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

train(
    seq2seq,
    train_loader,
    test_loader,
    optimizer,
    criterion,
    device,
    scheduler,
    60,
)


Epoch 1/60


                                                                          

Train loss : 5.460591206183801, Val_loss : 4.749622340750905, lr: 0.005

Epoch 2/60


                                                                        

KeyboardInterrupt: 

In [None]:
torch.save(seq2seq.state_dict(), "seq2seq-bidir-attention.pth")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(vocab)
embedding_dim = 256
hidden_size = 64
encoder = Encoder(vocab_size, embedding_dim, hidden_size).to(device)
decoder = Decoder(vocab_size, embedding_dim, hidden_size).to(device)
seq2seq = Seq2Seq(encoder, decoder, vocab_size).to(device)
seq2seq.load_state_dict(torch.load("seq2seq-bidir-attention.pth", weights_only=True))

<All keys matched successfully>

In [None]:
seq2seq.eval()
# with torch.no_grad():
a, b = next(iter(train_loader))
a = a.to(device)
b = b.to(device)
with torch.no_grad():
    x = seq2seq.predict(a)

x = x.argmax(dim=2)
x.shape

torch.Size([512, 18])

In [None]:
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer(
    f"EnglishOrSpanish/vocab.json", f"EnglishOrSpanish/merges.txt"
)

In [None]:
print(tokenizer.decode(b.tolist()[8]))
print(tokenizer.decode(x.tolist()[8]))

<SOS> qué tipo de chica es usted <EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<PAD> qué tipo de chica es <EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS>
