In [1]:
import pandas as pd
from sklearn.utils import shuffle
# Load the pre-tokenised English/Spanish sentence pairs. At this point every
# cell of the "English" and "Spanish" columns is still text (the printed form
# of a list of token ids); it is parsed into real lists in the next cell.
df = pd.read_csv("EnglishOrSpanish/output_joint1.csv")

# Shuffle the rows before the positional train/test split made further down.
# A fixed seed keeps that split identical across kernel restarts, so losses
# from different runs stay comparable.
df = shuffle(df, random_state=42)
df

Unnamed: 0,English,Spanish
6524,"[2, 333, 2251, 878, 224, 3, 0, 0, 0, 0, 0, 0, ...","[2, 372, 2982, 895, 224, 3, 0, 0, 0, 0, 0, 0, ..."
48263,"[2, 333, 1294, 304, 272, 1034, 606, 224, 3, 0,...","[2, 401, 981, 316, 375, 7588, 627, 224, 3, 0, ..."
105549,"[2, 7319, 5646, 334, 300, 12898, 3942, 332, 14...","[2, 331, 6643, 5646, 318, 321, 3936, 500, 3975..."
37443,"[2, 290, 13989, 465, 5108, 224, 3, 0, 0, 0, 0,...","[2, 290, 21515, 352, 9074, 224, 3, 0, 0, 0, 0,..."
60512,"[2, 426, 580, 290, 663, 404, 398, 224, 3, 0, 0...","[2, 470, 444, 2160, 927, 690, 224, 3, 0, 0, 0,..."
...,...,...
100692,"[2, 270, 982, 290, 657, 803, 706, 304, 1042, 2...","[2, 1363, 316, 290, 2984, 375, 19476, 1400, 78..."
39722,"[2, 270, 1581, 361, 391, 300, 2356, 224, 3, 0,...","[2, 1325, 563, 387, 321, 2356, 224, 3, 0, 0, 0..."
34845,"[2, 270, 3154, 304, 576, 1044, 224, 3, 0, 0, 0...","[2, 3990, 316, 5530, 2164, 224, 3, 0, 0, 0, 0,..."
62465,"[2, 270, 4715, 402, 3395, 272, 479, 718, 224, ...","[2, 535, 372, 2498, 403, 3354, 265, 352, 1535,..."


In [2]:
import ast

def _parse_token_ids(value):
    """Turn the text form of a token-id list into a real Python list.

    Values that are not strings are returned untouched, so re-running this
    cell after the columns were already parsed is a no-op instead of a
    ``ValueError`` raised by ``ast.literal_eval``.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# read_csv leaves every list as text such as "[2, 333, ...]"; convert both
# columns to Python lists so they can be stacked into tensors later.
df["English"] = df["English"].apply(_parse_token_ids)
df["Spanish"] = df["Spanish"].apply(_parse_token_ids)
df

Unnamed: 0,English,Spanish
6524,"[2, 333, 2251, 878, 224, 3, 0, 0, 0, 0, 0, 0, ...","[2, 372, 2982, 895, 224, 3, 0, 0, 0, 0, 0, 0, ..."
48263,"[2, 333, 1294, 304, 272, 1034, 606, 224, 3, 0,...","[2, 401, 981, 316, 375, 7588, 627, 224, 3, 0, ..."
105549,"[2, 7319, 5646, 334, 300, 12898, 3942, 332, 14...","[2, 331, 6643, 5646, 318, 321, 3936, 500, 3975..."
37443,"[2, 290, 13989, 465, 5108, 224, 3, 0, 0, 0, 0,...","[2, 290, 21515, 352, 9074, 224, 3, 0, 0, 0, 0,..."
60512,"[2, 426, 580, 290, 663, 404, 398, 224, 3, 0, 0...","[2, 470, 444, 2160, 927, 690, 224, 3, 0, 0, 0,..."
...,...,...
100692,"[2, 270, 982, 290, 657, 803, 706, 304, 1042, 2...","[2, 1363, 316, 290, 2984, 375, 19476, 1400, 78..."
39722,"[2, 270, 1581, 361, 391, 300, 2356, 224, 3, 0,...","[2, 1325, 563, 387, 321, 2356, 224, 3, 0, 0, 0..."
34845,"[2, 270, 3154, 304, 576, 1044, 224, 3, 0, 0, 0...","[2, 3990, 316, 5530, 2164, 224, 3, 0, 0, 0, 0,..."
62465,"[2, 270, 4715, 402, 3395, 272, 479, 718, 224, ...","[2, 535, 372, 2498, 403, 3354, 265, 352, 1535,..."


In [3]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Positional split of the shuffled frame: the first TRAIN_SIZE rows train the
# model, every row after them is held out for validation.
TRAIN_SIZE = 100_000
BATCH_SIZE = 384


def _to_dataset(frame):
    """Stack the English/Spanish token-id lists of `frame` into a TensorDataset.

    Each item of the returned dataset is an (english_ids, spanish_ids) pair of
    int64 tensors. All lists in a column must have the same length, otherwise
    ``torch.tensor`` raises.
    """
    return TensorDataset(
        torch.tensor(frame["English"].tolist(), dtype=torch.long),
        torch.tensor(frame["Spanish"].tolist(), dtype=torch.long),
    )


# Named *_dataset rather than train/test: a later cell defines a function
# called `train`, and sharing that name means re-running this cell would
# silently replace the function with a TensorDataset.
# .iloc makes the positional (not label-based) slicing explicit, which matters
# because the index is no longer ordered after shuffling.
train_dataset = _to_dataset(df.iloc[:TRAIN_SIZE])
test_dataset = _to_dataset(df.iloc[TRAIN_SIZE:])

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True
)
# drop_last=True also discards the final partial validation batch, so up to
# BATCH_SIZE - 1 held-out pairs are never scored.
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True
)

In [4]:
import json
# Only len(vocab) is used later on (as the model's vocabulary size).
# Read explicitly as UTF-8: JSON is UTF-8 by specification, and relying on the
# platform's default encoding can fail or mis-decode non-ASCII entries.
with open("EnglishOrSpanish/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

In [5]:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    """Embed source token ids and run them through a stacked LSTM.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector (the LSTM input size).
        hidden_size: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability handed to ``nn.LSTM``.
    """

    def __init__(
        self, vocab_size, embedding_dim, hidden_size, num_layers=2, dropout=0.3
    ):
        super().__init__()
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Index 0 is declared as the padding id, so its embedding row is zero
        # and receives no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Visits every sub-module; only the LSTM is actually re-initialised.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise LSTM weight matrices and zero the LSTM biases."""
        if not isinstance(module, nn.LSTM):
            return
        for param_name, tensor in module.named_parameters():
            if "weight" in param_name:
                nn.init.xavier_uniform_(tensor)
            elif "bias" in param_name:
                nn.init.zeros_(tensor)

    def forward(self, x):
        """Encode a batch of token ids.

        Returns:
            A ``(output, hidden, cell)`` tuple: the LSTM's per-step outputs
            followed by its final hidden and cell states.
        """
        lstm_out, (h_n, c_n) = self.lstm(self.embedding(x))
        return lstm_out, h_n, c_n

In [6]:
import torch
import torch.nn as nn


class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per sequence to vocabulary logits.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embedding_dim: Width of each embedding vector (the LSTM input size).
        hidden_size: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used both inside ``nn.LSTM`` and on the
            LSTM output before the final linear layer.
    """

    def __init__(
        self, vocab_size, embedding_dim, hidden_size, num_layers=2, dropout=0.3
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Index 0 is declared as the padding id, so its embedding row is zero
        # and receives no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        # Visits every sub-module registered so far; only the LSTM is
        # re-initialised, the linear layer keeps PyTorch's default init.
        self.apply(self._init_weights)
        self.dropout = nn.Dropout(p=dropout)

    def _init_weights(self, module):
        """Xavier-initialise LSTM weight matrices and zero the LSTM biases."""
        if not isinstance(module, nn.LSTM):
            return
        for param_name, tensor in module.named_parameters():
            if "weight" in param_name:
                nn.init.xavier_uniform_(tensor)
            elif "bias" in param_name:
                nn.init.zeros_(tensor)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            x: Token ids for the current step; a sequence axis of length 1 is
                inserted at position 1 after embedding.
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            A ``(logits, hidden, cell)`` tuple, where ``logits`` still carries
            the length-1 sequence axis.
        """
        step_input = self.embedding(x).unsqueeze(1)
        lstm_out, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        logits = self.fc(self.dropout(lstm_out))
        return logits, hidden, cell

In [7]:
import torch.nn as nn
import torch
import random

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one position at a time.

    The encoder's final hidden/cell states seed the decoder, which is then
    stepped token by token.

    Args:
        encoder: Module whose forward returns ``(output, hidden, cell)``.
        decoder: Module exposing ``vocab_size`` and whose forward takes
            ``(token_ids, hidden, cell)`` and returns ``(logits, hidden, cell)``
            with a length-1 sequence axis on ``logits``.
        vocab_size: Stored on the instance for callers (e.g. loss reshaping).
        teacher_forcing_ratio: Probability, drawn once per time step for the
            whole batch, of feeding the ground-truth token instead of the
            model's own prediction.
    """

    def __init__(self, encoder, decoder, vocab_size, teacher_forcing_ratio=0.5):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = vocab_size
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, src, trg):
        """Decode `trg` with scheduled teacher forcing.

        Returns:
            Float tensor of shape ``(batch, trg_len, decoder.vocab_size)``.
            Position 0 is never written and stays all zeros; positions
            ``1..trg_len-1`` hold the decoder logits.

        Note:
            The teacher-forcing draw happens regardless of train/eval mode.
        """
        _, hidden, cell = self.encoder(src)
        # Allocate on the input's device instead of a hard-coded "cuda", so
        # the model also runs on CPU-only machines.
        outputs = torch.zeros(
            trg.shape[0], trg.shape[1], self.decoder.vocab_size, device=trg.device
        )
        x = trg[:, 0]
        for t in range(1, trg.shape[1]):
            output, hidden, cell = self.decoder(x, hidden, cell)
            output = output.squeeze(1)
            outputs[:, t, :] = output
            use_teacher_forcing = random.random() < self.teacher_forcing_ratio
            x = trg[:, t] if use_teacher_forcing else output.argmax(dim=1)

        return outputs

    def predict(self, src):
        """Greedy decoding without any target sequence.

        Decoding starts from ``src[:, 0]`` and runs for ``src.shape[1] - 1``
        steps, so the result has the same length as the source.
        NOTE(review): this assumes source and target sequences share the same
        start-of-sequence id - confirm against the tokenisation step.

        Returns:
            Float tensor of shape ``(batch, src_len, decoder.vocab_size)``.
            Position 0 is never written and stays all zeros, so an ``argmax``
            over the last axis yields id 0 at that position.
        """
        _, hidden, cell = self.encoder(src)
        # Same device handling as in forward(): follow the input tensor.
        outputs = torch.zeros(
            src.shape[0], src.shape[1], self.decoder.vocab_size, device=src.device
        )
        x = src[:, 0]
        for i in range(1, src.shape[1]):
            output, hidden, cell = self.decoder(x, hidden, cell)
            output = output.squeeze(1)
            outputs[:, i, :] = output
            x = output.argmax(dim=1)

        return outputs

In [8]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def _sequence_loss(model, criterion, src, trg):
    """Run `model` on one batch and score every target position except the first."""
    outputs = model(src, trg)
    return criterion(
        outputs[:, 1:].reshape(-1, model.vocab_size), trg[:, 1:].reshape(-1)
    )


def train(
    model,
    train_loader,
    test_loader,
    optimizer,
    criterion,
    device,
    scheduler,
    num_epochs,
):
    """Train `model` for `num_epochs`, validating and logging after every epoch.

    Args:
        model: Callable as ``model(src, trg)`` and exposing ``vocab_size``,
            ``train()`` and ``eval()``.
        train_loader: Sized iterable of ``(src, trg)`` training batches.
        test_loader: Sized iterable of ``(src, trg)`` validation batches.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        device: Device every batch is moved to before the forward pass.
        scheduler: Stepped once per epoch with the mean validation loss; must
            provide ``get_last_lr()``.
        num_epochs: Number of passes over `train_loader`.

    Returns:
        ``(train_loss, val_loss)`` of the last epoch, each the mean of the
        per-batch losses. Both are ``nan`` when `num_epochs` is 0.
    """
    writer = SummaryWriter()
    # Defined up front so a zero-epoch call returns instead of failing on
    # unbound locals.
    train_loss = val_loss = float("nan")

    # try/finally: flush and close the TensorBoard writer even when training
    # raises or is interrupted from the notebook.
    try:
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch+1}/{num_epochs}")

            # ---------------- TRAINING ---------------- #
            model.train()
            epoch_loss = 0
            progress_bar = tqdm(train_loader, desc="Training", leave=False)

            for src, trg in progress_bar:
                src, trg = src.to(device), trg.to(device)
                optimizer.zero_grad()
                loss = _sequence_loss(model, criterion, src, trg)
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                epoch_loss += batch_loss
                progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

            train_loss = epoch_loss / len(train_loader)

            # ---------------- VALIDATION ---------------- #
            # NOTE(review): validation calls the same model(src, trg) path as
            # training, so any teacher forcing done inside the model's forward
            # still applies here.
            model.eval()
            val_epoch_loss = 0
            progress_bar = tqdm(test_loader, desc="Validating", leave=False)

            with torch.no_grad():
                for src, trg in progress_bar:
                    src, trg = src.to(device), trg.to(device)
                    batch_loss = _sequence_loss(model, criterion, src, trg).item()
                    val_epoch_loss += batch_loss
                    progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

            val_loss = val_epoch_loss / len(test_loader)
            scheduler.step(val_loss)

            current_lr = scheduler.get_last_lr()[0]
            writer.add_scalar("Loss/Train", train_loss, epoch)
            writer.add_scalar("Loss/Validation", val_loss, epoch)
            writer.add_scalar("Learning Rate", current_lr, epoch)

            print(
                f"Train loss : {train_loss}, Val_loss : {val_loss}, lr: {current_lr}"
            )
    finally:
        writer.close()

    return train_loss, val_loss

2025-03-22 14:27:46.550663: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-22 14:27:46.558984: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742633866.569395   27402 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742633866.572386   27402 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-22 14:27:46.583086: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model dimensions; the vocabulary size comes from the loaded vocab file.
vocab_size = len(vocab)
embedding_dim = 256
hidden_size = 32

# Encoder and decoder are built with identical sizes and combined into one model.
encoder = Encoder(
    vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_size=hidden_size
).to(device)
decoder = Decoder(
    vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_size=hidden_size
).to(device)
seq2seq = Seq2Seq(encoder=encoder, decoder=decoder, vocab_size=vocab_size).to(device)

# Targets equal to id 0 do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(seq2seq.parameters(), lr=1e-3, weight_decay=1e-5)
# Halve the learning rate once the monitored value has stopped improving for
# more than 2 consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=2,
)

In [10]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

# Train for 30 epochs; the returned (train_loss, val_loss) pair of the last
# epoch is displayed as this cell's output.
train(
    model=seq2seq,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    scheduler=scheduler,
    num_epochs=30,
)

                                                                        

Train loss : 6.962053581384512, Val_loss : 5.8326847394307455, lr: 0.001

Epoch 2/30


                                                                        

Train loss : 5.8681341849840605, Val_loss : 5.643837833404541, lr: 0.001

Epoch 3/30


                                                                        

Train loss : 5.674800544518691, Val_loss : 5.487871360778809, lr: 0.001

Epoch 4/30


                                                                        

Train loss : 5.536694702735314, Val_loss : 5.344801680246989, lr: 0.001

Epoch 5/30


                                                                        

Train loss : 5.403368581258333, Val_loss : 5.217676436106364, lr: 0.001

Epoch 6/30


                                                                        

Train loss : 5.310723365270174, Val_loss : 5.122572816212972, lr: 0.001

Epoch 7/30


                                                                        

Train loss : 5.23641071502979, Val_loss : 5.054786281585693, lr: 0.001

Epoch 8/30


                                                                        

Train loss : 5.173215878926791, Val_loss : 4.986257368723551, lr: 0.001

Epoch 9/30


                                                                        

Train loss : 5.12018260038816, Val_loss : 4.940228068033854, lr: 0.001

Epoch 10/30


                                                                        

Train loss : 5.055117221978994, Val_loss : 4.87892864227295, lr: 0.001

Epoch 11/30


                                                                        

Train loss : 5.0029106635313765, Val_loss : 4.814685509999593, lr: 0.001

Epoch 12/30


                                                                        

Train loss : 4.957641696929931, Val_loss : 4.776050332387288, lr: 0.001

Epoch 13/30


                                                                        

Train loss : 4.919378086236807, Val_loss : 4.729970893859863, lr: 0.001

Epoch 14/30


                                                                        

Train loss : 4.860567234112666, Val_loss : 4.702651405334473, lr: 0.001

Epoch 15/30


                                                                        

Train loss : 4.829343605041504, Val_loss : 4.673542900085449, lr: 0.001

Epoch 16/30


                                                                        

Train loss : 4.79398246545058, Val_loss : 4.616695721944173, lr: 0.001

Epoch 17/30


                                                                        

Train loss : 4.7584477186203005, Val_loss : 4.581541442871094, lr: 0.001

Epoch 18/30


                                                                        

Train loss : 4.719582620033851, Val_loss : 4.54927973429362, lr: 0.001

Epoch 19/30


                                                                        

Train loss : 4.691386985778808, Val_loss : 4.527570660909017, lr: 0.001

Epoch 20/30


                                                                        

Train loss : 4.647373337012071, Val_loss : 4.47168207804362, lr: 0.001

Epoch 21/30


                                                                        

Train loss : 4.619019486353948, Val_loss : 4.452079957326253, lr: 0.001

Epoch 22/30


                                                                        

Train loss : 4.585276961326599, Val_loss : 4.440540059407552, lr: 0.001

Epoch 23/30


                                                                        

Train loss : 4.54457033597506, Val_loss : 4.407544441223145, lr: 0.001

Epoch 24/30


                                                                        

Train loss : 4.511867902829096, Val_loss : 4.373706188201904, lr: 0.001

Epoch 25/30


                                                                        

Train loss : 4.485565504660973, Val_loss : 4.348975187937419, lr: 0.001

Epoch 26/30


                                                                        

Train loss : 4.4672951734983, Val_loss : 4.300933996836345, lr: 0.001

Epoch 27/30


                                                                        

Train loss : 4.428729394766001, Val_loss : 4.265014810562134, lr: 0.001

Epoch 28/30


                                                                        

Train loss : 4.413180483304537, Val_loss : 4.265251617431641, lr: 0.001

Epoch 29/30


                                                                        

Train loss : 4.383234728299654, Val_loss : 4.239299430847168, lr: 0.001

Epoch 30/30


                                                                        

Train loss : 4.351670787884639, Val_loss : 4.217301654815674, lr: 0.001




(4.351670787884639, 4.217301654815674)

In [11]:
# Persist only the model's state_dict (not the whole module object) to a file
# in the current working directory.
torch.save(seq2seq.state_dict(), "seq2seq-noattention.pth")

In [13]:
# Rebuild the exact architecture used for training, then restore its weights.
# The sizes must match the ones the checkpoint was trained with, otherwise
# load_state_dict reports shape mismatches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(vocab)
embedding_dim = 256
hidden_size = 32
encoder = Encoder(vocab_size, embedding_dim, hidden_size).to(device)
decoder = Decoder(vocab_size, embedding_dim, hidden_size).to(device)
seq2seq = Seq2Seq(encoder, decoder, vocab_size).to(device)
# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
# weights_only=True restricts unpickling to tensors and plain containers.
seq2seq.load_state_dict(
    torch.load("seq2seq-noattention.pth", map_location=device, weights_only=True)
)

<All keys matched successfully>

In [14]:
# Switch to evaluation mode before running inference.
seq2seq.eval()

# Take one batch from the training loader (a = English ids, b = Spanish ids)
# and move both tensors to the model's device.
a, b = (batch_part.to(device) for batch_part in next(iter(train_loader)))

# Greedy decoding without gradient tracking; argmax over the vocabulary axis
# turns the logits into predicted token ids.
with torch.no_grad():
    x = seq2seq.predict(a).argmax(dim=2)
x.shape

torch.Size([384, 18])

In [15]:
from tokenizers import ByteLevelBPETokenizer

# Build a byte-level BPE tokenizer from the vocabulary and merges files in the
# EnglishOrSpanish directory; it is used below to decode token ids to text.
# NOTE(review): presumably these are the same files that produced the token
# ids in the CSV - confirm against the preprocessing step.
tokenizer = ByteLevelBPETokenizer(
    "EnglishOrSpanish/vocab.json", "EnglishOrSpanish/merges.txt"
)

In [18]:
# Decode the third sequence of the batch: the reference Spanish ids first,
# the model's predicted ids second. Only that row is converted to a list.
print(tokenizer.decode(b[2].tolist()))
print(tokenizer.decode(x[2].tolist()))

<SOS> mira hacia atrás <EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<PAD> estamos  <EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS>
