In [1]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("shusrith/machine-trainslation")

# print("Path to dataset files:", path)

In [1]:
import pandas as pd
from sklearn.utils import shuffle

# Fixed seed: the train/test split further down is purely positional, so the
# shuffled row order must be identical on every run for the held-out rows to
# stay held out (e.g. when reloading saved weights in a fresh kernel).
SEED = 42

df = pd.read_csv("EnglishOrSpanish/output_joint1.csv")
df = shuffle(df, random_state=SEED)
df

Unnamed: 0,English,Spanish
114623,"[2, 22056, 14224, 449, 7920, 674, 930, 13054, ...","[2, 664, 22056, 14225, 17303, 294, 1850, 2761,..."
57225,"[2, 270, 3279, 673, 3968, 7959, 224, 3, 0, 0, ...","[2, 566, 294, 1986, 355, 4578, 387, 8624, 224,..."
35491,"[2, 270, 384, 684, 272, 341, 370, 224, 3, 0, 0...","[2, 535, 2668, 265, 550, 563, 224, 3, 0, 0, 0,..."
78889,"[2, 270, 431, 574, 270, 411, 496, 3540, 224, 3...","[2, 302, 850, 316, 4208, 7143, 224, 3, 0, 0, 0..."
102889,"[2, 270, 807, 3159, 272, 718, 328, 3452, 300, ...","[2, 372, 3576, 265, 1234, 316, 328, 6715, 265,..."
...,...,...
83704,"[2, 826, 270, 1364, 391, 448, 7489, 2209, 224,...","[2, 665, 2053, 556, 1135, 322, 474, 5449, 224,..."
106144,"[2, 290, 1578, 462, 4486, 24099, 2349, 300, 21...","[2, 290, 3346, 801, 4929, 294, 5116, 1941, 294..."
58936,"[2, 442, 1354, 521, 1083, 1251, 1847, 224, 3, ...","[2, 463, 372, 7591, 986, 1093, 1081, 224, 3, 0..."
79731,"[2, 361, 384, 402, 1344, 272, 2320, 300, 1024,...","[2, 767, 403, 7029, 470, 4485, 321, 1432, 224,..."


In [2]:
import ast

def _parse_token_ids(value):
    """Convert a stringified list read from the CSV into a real Python list.

    Values that are not strings (i.e. already parsed) are returned untouched,
    which keeps this cell safe to re-run: ``ast.literal_eval`` raises on a
    value that is already a list.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


df["English"] = df["English"].apply(_parse_token_ids)
df["Spanish"] = df["Spanish"].apply(_parse_token_ids)
df

Unnamed: 0,English,Spanish
114623,"[2, 22056, 14224, 449, 7920, 674, 930, 13054, ...","[2, 664, 22056, 14225, 17303, 294, 1850, 2761,..."
57225,"[2, 270, 3279, 673, 3968, 7959, 224, 3, 0, 0, ...","[2, 566, 294, 1986, 355, 4578, 387, 8624, 224,..."
35491,"[2, 270, 384, 684, 272, 341, 370, 224, 3, 0, 0...","[2, 535, 2668, 265, 550, 563, 224, 3, 0, 0, 0,..."
78889,"[2, 270, 431, 574, 270, 411, 496, 3540, 224, 3...","[2, 302, 850, 316, 4208, 7143, 224, 3, 0, 0, 0..."
102889,"[2, 270, 807, 3159, 272, 718, 328, 3452, 300, ...","[2, 372, 3576, 265, 1234, 316, 328, 6715, 265,..."
...,...,...
83704,"[2, 826, 270, 1364, 391, 448, 7489, 2209, 224,...","[2, 665, 2053, 556, 1135, 322, 474, 5449, 224,..."
106144,"[2, 290, 1578, 462, 4486, 24099, 2349, 300, 21...","[2, 290, 3346, 801, 4929, 294, 5116, 1941, 294..."
58936,"[2, 442, 1354, 521, 1083, 1251, 1847, 224, 3, ...","[2, 463, 372, 7591, 986, 1093, 1081, 224, 3, 0..."
79731,"[2, 361, 384, 402, 1344, 272, 2320, 300, 1024,...","[2, 767, 403, 7029, 470, 4485, 321, 1432, 224,..."


In [3]:
import torch
from torch.utils.data import DataLoader, TensorDataset

TRAIN_SIZE = 100_000  # leading rows (after shuffling) used for training; the rest is held out
BATCH_SIZE = 256


def _make_dataset(frame):
    """Pack the English/Spanish token-id columns of ``frame`` into a TensorDataset.

    Each column is expected to hold equal-length lists of ints (the notebook's
    data is already padded), so ``tolist()`` yields a rectangular nested list.
    """
    return TensorDataset(
        torch.tensor(frame["English"].tolist(), dtype=torch.long),
        torch.tensor(frame["Spanish"].tolist(), dtype=torch.long),
    )


# ``.iloc`` makes the split explicitly positional: the frame has a shuffled
# integer index, so label-based slicing would select different rows.
train = _make_dataset(df.iloc[:TRAIN_SIZE])
test = _make_dataset(df.iloc[TRAIN_SIZE:])

train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
# Keep the final partial batch for evaluation so every held-out pair is scored.
test_loader = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [4]:
import json

# Read the vocabulary as UTF-8 explicitly instead of relying on the platform's
# default encoding, which may not decode non-ASCII tokens correctly.
with open("EnglishOrSpanish/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

In [5]:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    """Embeds a batch of source token ids and encodes it with an LSTM.

    Args:
        vocab_size: Number of rows in the embedding table.
        hidden_size: Size of both the embeddings and the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, num_layers=1, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Index 0 is the padding id; its embedding is held at zero.
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout *between* stacked layers; passing a
            # non-zero value with a single layer does nothing except emit a
            # UserWarning, so it is disabled in that case.
            dropout=dropout if num_layers > 1 else 0.0,
        )

        self.layer_norm = nn.LayerNorm(hidden_size)
        self.hidden_norm = nn.LayerNorm(hidden_size)
        self.cell_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise LSTM weight matrices and zero the LSTM biases."""
        if isinstance(module, nn.LSTM):
            for name, param in module.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.zeros_(param)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: Long tensor of token ids, shape ``(batch, src_len)``.

        Returns:
            Tuple ``(output, hidden, cell)``: the per-step LSTM outputs and the
            final hidden / cell states, each passed through its own LayerNorm.
        """
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.lstm(embedded)
        output = self.layer_norm(output)
        hidden = self.hidden_norm(hidden)
        cell = self.cell_norm(cell)
        return output, hidden, cell

In [6]:
class Attention(nn.Module):
    """Additive-style attention over the encoder outputs.

    The encoder outputs are linearly projected, the decoder state is added to
    every source position, and the ``tanh`` of that sum is reduced to a single
    score per position, normalised with a softmax.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Projection of the encoder outputs, then a bias-free scorer.
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_output):
        """Return attention weights over the source positions.

        Args:
            hidden: Decoder state; a dimension is inserted at axis 1 so it
                broadcasts across source positions (the decoder passes
                ``hidden[-1]``, i.e. ``(batch, hidden_size)``).
            encoder_output: Encoder outputs, ``(batch, src_len, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, src_len)`` whose rows sum to 1.
        """
        query = hidden.unsqueeze(1)
        projected = self.attn(encoder_output)
        energy = self.v(torch.tanh(projected + query)).squeeze(2)
        return torch.softmax(energy, dim=1)

In [7]:
import torch
import torch.nn as nn


class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and logits).
        hidden_size: Size of the embeddings and the LSTM hidden state.
        attention: Module called as ``attention(hidden[-1], encoder_output)``
            that returns one weight per source position.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, attention, num_layers=1, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Index 0 is the padding id; its embedding is held at zero.
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)

        # The LSTM input is the token embedding concatenated with the context
        # vector, hence twice the hidden size.
        self.lstm = nn.LSTM(
            2 * hidden_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout *between* stacked layers; passing a
            # non-zero value with a single layer does nothing except emit a
            # UserWarning, so it is disabled in that case.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.attention = attention

        self.layer_norm = nn.LayerNorm(hidden_size)
        self.hidden_norm = nn.LayerNorm(hidden_size)
        self.cell_norm = nn.LayerNorm(hidden_size)

        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise LSTM weight matrices and zero the LSTM biases."""
        if isinstance(module, nn.LSTM):
            for name, param in module.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.zeros_(param)

    def forward(self, x, hidden, cell, encoder_output):
        """Run one decoding step.

        Args:
            x: Current input token ids, shape ``(batch,)``.
            hidden: LSTM hidden state, ``(num_layers, batch, hidden_size)``.
            cell: LSTM cell state, same shape as ``hidden``.
            encoder_output: Encoder outputs, ``(batch, src_len, hidden_size)``.

        Returns:
            Tuple ``(logits, hidden, cell)`` where ``logits`` has shape
            ``(batch, 1, vocab_size)`` and the states are layer-normalised.
        """
        # Attention is driven by the top layer's hidden state.
        attention_scores = self.attention(hidden[-1], encoder_output)
        # Weighted sum of encoder outputs -> (batch, 1, hidden_size).
        context = torch.bmm(attention_scores.unsqueeze(1), encoder_output)
        embedded = self.dropout(self.embedding(x)).unsqueeze(1)
        embedded = torch.cat([embedded, context], dim=2)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        output = self.layer_norm(output)
        hidden = self.hidden_norm(hidden)
        cell = self.cell_norm(cell)
        output = self.fc(output)
        return output, hidden, cell

In [8]:
class Seq2Seq:
    """Training / inference driver pairing an encoder with an attention decoder.

    This is a plain helper class, not an ``nn.Module``; it holds references to
    the two networks and runs the step-by-step decoding loop.

    Args:
        encoder: Module called as ``encoder(src)`` returning
            ``(encoder_out, hidden, cell)``.
        decoder: Module called as ``decoder(x, hidden, cell, encoder_out)``
            returning ``(logits, hidden, cell)``; must expose ``vocab_size``.
        device: Device used by :meth:`predict` for its inputs.
        teacher_forcing_ratio: Probability, per decoding step during training,
            of feeding the ground-truth token instead of the model's own
            prediction.
        pad_idx: Token id treated as padding and excluded from accuracy.
    """

    def __init__(self, encoder, decoder, device, teacher_forcing_ratio, pad_idx=0):
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.vocab_size = self.decoder.vocab_size
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.pad_idx = pad_idx

    def _decode(self, src, trg, teacher_forcing_ratio=0.0):
        """Encode ``src`` and decode step by step for ``trg.shape[1] - 1`` steps.

        ``trg[:, 0]`` seeds the decoder. Row ``t`` of the result holds the
        logits produced at step ``t``; row 0 is never written and stays zero.
        """
        batch_size, trg_len = trg.shape
        encoder_out, hidden, cell = self.encoder(src)
        # Allocate on the same device as the batch so the loss can be computed
        # against ``trg`` without a device mismatch.
        outputs = torch.zeros(
            batch_size, trg_len, self.vocab_size, device=trg.device
        )
        x = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell, encoder_out)
            output = output.squeeze(1)
            outputs[:, t, :] = output
            teacher_force = (
                teacher_forcing_ratio > 0
                and torch.rand(1).item() < teacher_forcing_ratio
            )
            x = trg[:, t] if teacher_force else output.argmax(dim=1)
        return outputs

    def _count_correct(self, outputs, trg):
        """Return ``(correct, total)`` over predicted, non-padding positions.

        Position 0 is skipped because it is never predicted, and padding
        targets are skipped so they neither inflate nor deflate accuracy.
        """
        target = trg[:, 1:]
        preds = outputs[:, 1:].argmax(dim=2)
        mask = target != self.pad_idx
        correct = ((preds == target) & mask).sum().item()
        return correct, mask.sum().item()

    def _loss(self, criterion, outputs, trg):
        """Apply ``criterion`` to the flattened logits/targets, skipping position 0."""
        return criterion(
            outputs[:, 1:].reshape(-1, self.vocab_size), trg[:, 1:].reshape(-1)
        )

    def train(
        self,
        train_loader,
        test_loader,
        enc_optimizer,
        dec_optimizer,
        criterion,
        device,
        encoder_scheduler,
        decoder_scheduler,
        num_epochs,
    ):
        """Train for ``num_epochs`` epochs, validating after each one.

        Progress is shown with ``tqdm``, which must already be imported in the
        namespace where this class is defined when this method is called.

        Args:
            train_loader: Iterable of ``(src, trg)`` batches for training.
            test_loader: Iterable of ``(src, trg)`` batches for validation.
            enc_optimizer: Optimizer for the encoder parameters.
            dec_optimizer: Optimizer for the decoder parameters.
            criterion: Loss called as ``criterion(logits, targets)``.
            device: Device each batch is moved to.
            encoder_scheduler: Stepped with the validation loss each epoch.
            decoder_scheduler: Stepped with the validation loss each epoch.
            num_epochs: Number of epochs to run.

        Returns:
            ``(train_loss, train_acc, val_loss, val_acc)`` of the last epoch,
            or four ``None`` values when ``num_epochs`` is 0.
        """
        prev_lr_enc = enc_optimizer.param_groups[0]["lr"]
        prev_lr_dec = dec_optimizer.param_groups[0]["lr"]
        train_loss = train_acc = val_loss = val_acc = None

        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch+1}/{num_epochs}")

            # ---------------- TRAINING ---------------- #
            self.encoder.train()
            self.decoder.train()
            epoch_loss = 0.0
            epoch_correct = 0
            epoch_tokens = 0
            progress_bar = tqdm(train_loader, desc="Training", leave=False)

            for src, trg in progress_bar:
                src, trg = src.to(device), trg.to(device)

                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                outputs = self._decode(src, trg, self.teacher_forcing_ratio)
                loss = self._loss(criterion, outputs, trg)
                loss.backward()
                enc_optimizer.step()
                dec_optimizer.step()

                epoch_loss += loss.item()
                correct, total = self._count_correct(outputs, trg)
                epoch_correct += correct
                epoch_tokens += total
                batch_acc = correct / total if total else 0.0

                progress_bar.set_postfix(
                    loss=f"{loss.item():.4f}", acc=f"{batch_acc:.4f}"
                )

            train_loss = epoch_loss / max(len(train_loader), 1)
            # Token-weighted accuracy: batches differ in how many real
            # (non-padding) tokens they contain.
            train_acc = epoch_correct / epoch_tokens if epoch_tokens else 0.0

            # ---------------- VALIDATION ---------------- #
            self.encoder.eval()
            self.decoder.eval()
            val_epoch_loss = 0.0
            val_correct = 0
            val_tokens = 0
            progress_bar = tqdm(test_loader, desc="Validating", leave=False)

            with torch.no_grad():
                for src, trg in progress_bar:
                    src, trg = src.to(device), trg.to(device)
                    # No teacher forcing: always feed back the model's own prediction.
                    outputs = self._decode(src, trg)
                    loss = self._loss(criterion, outputs, trg)
                    val_epoch_loss += loss.item()

                    correct, total = self._count_correct(outputs, trg)
                    val_correct += correct
                    val_tokens += total
                    batch_acc = correct / total if total else 0.0

                    progress_bar.set_postfix(
                        loss=f"{loss.item():.4f}", acc=f"{batch_acc:.4f}"
                    )

            val_loss = val_epoch_loss / max(len(test_loader), 1)
            val_acc = val_correct / val_tokens if val_tokens else 0.0
            print(
                f"Train loss : {train_loss}, Train accuracy : {train_acc}, Val_loss : {val_loss}, val accuracy : {val_acc}"
            )

            encoder_scheduler.step(val_loss)
            decoder_scheduler.step(val_loss)

            # Read the learning rate back from the optimizers, the same place
            # the initial values above were taken from.
            lr_enc = enc_optimizer.param_groups[0]["lr"]
            if lr_enc < prev_lr_enc:
                prev_lr_enc = lr_enc
                print(f"Encoder learning rate decreased to {prev_lr_enc}")
            lr_dec = dec_optimizer.param_groups[0]["lr"]
            if lr_dec < prev_lr_dec:
                prev_lr_dec = lr_dec
                print(f"Decoder learning rate decreased to {prev_lr_dec}")

        return train_loss, train_acc, val_loss, val_acc

    def predict(self, src, trg):
        """Greedily decode ``src`` without teacher forcing or gradients.

        Args:
            src: Source token ids, ``(src_len,)`` or ``(batch, src_len)``.
            trg: Target-side tensor, ``(trg_len,)`` or ``(batch, trg_len)``.
                Only its first token (decoder start input) and its length
                (number of decoding steps) are used.

        Returns:
            Logits of shape ``(batch, trg_len, vocab_size)``; ``batch`` is 1
            for unbatched input and position 0 is all zeros.
        """
        self.encoder.eval()
        self.decoder.eval()
        with torch.no_grad():
            src, trg = src.to(self.device), trg.to(self.device)
            # Accept a single sequence by adding the batch dimension the
            # encoder and decoder expect.
            if src.dim() == 1:
                src = src.unsqueeze(0)
            if trg.dim() == 1:
                trg = trg.unsqueeze(0)
            return self._decode(src, trg)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

vocab_size, hidden_size = len(vocab), 256

# The attention module is handed to the decoder, which stores it as a
# submodule; `decoder.to(device)` therefore moves it as well.
encoder = Encoder(vocab_size, hidden_size).to(device)
attention = Attention(hidden_size)
decoder = Decoder(vocab_size, hidden_size, attention).to(device)
seq2seq = Seq2Seq(encoder, decoder, device, 0.5)

# Targets equal to 0 (padding) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Encoder and decoder get separate but identically configured optimizers and
# plateau schedulers.
adam_kwargs = dict(lr=0.005, weight_decay=1e-5)
plateau_kwargs = dict(mode="min", factor=0.6, patience=2, min_lr=1e-6)

encoder_optimizer = optim.Adam(encoder.parameters(), **adam_kwargs)
decoder_optimizer = optim.Adam(decoder.parameters(), **adam_kwargs)
encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    encoder_optimizer, **plateau_kwargs
)
decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    decoder_optimizer, **plateau_kwargs
)



In [10]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

# Run training for 100 epochs; every argument is passed by keyword so the
# long parameter list cannot be silently mis-ordered.
seq2seq.train(
    train_loader=train_loader,
    test_loader=test_loader,
    enc_optimizer=encoder_optimizer,
    dec_optimizer=decoder_optimizer,
    criterion=criterion,
    device=device,
    encoder_scheduler=encoder_scheduler,
    decoder_scheduler=decoder_scheduler,
    num_epochs=100,
)


Epoch 1/100


                                                                                   

KeyboardInterrupt: 

In [12]:
# Persist only the parameters (state dicts) of each network, encoder first.
for module, filename in (
    (seq2seq.encoder, "encoder.pth"),
    (seq2seq.decoder, "decoder.pth"),
):
    torch.save(module.state_dict(), filename)

In [None]:
import time
# Ten-step countdown, one tick per second, printing the tick number each time.
ticks = range(10)
for i in ticks:
    print(i)
    time.sleep(1)

0
1
2
3


In [None]:
# Shell escape: asks the host operating system to shut down immediately.
# NOTE(review): presumably meant to power off a remote/rented machine once
# training and checkpointing are done; confirm before running this cell locally
# or via "Run All".
!shutdown now