In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import re
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-03-23 19:14:46.818953: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-23 19:14:46.827176: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742737486.837719   15724 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742737486.840707   15724 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-23 19:14:46.851232: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
def preproc(x):
    x = re.sub(r"[^a-zA-Z0-9\s]", "", x)
    return x.lower()


def preproc_spanish(text):
    text = re.sub(r"[^a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ\s]", "", text)
    return text.lower()


df = pd.read_csv("spa.txt.csv")
df["English"] = df["English"].apply(preproc)
df["Translated"] = df["Translated"].apply(preproc_spanish)

eng_sentences = df["English"].values
spa_sentences = df["Translated"].values

In [3]:
eng_tokenizer = Tokenizer(filters="")
spa_tokenizer = Tokenizer(filters="")
eng_tokenizer.fit_on_texts(eng_sentences)
spa_tokenizer.fit_on_texts(spa_sentences)

eng_vocab_size = len(eng_tokenizer.word_index) + 1
spa_vocab_size = len(spa_tokenizer.word_index) + 1

eng_sequences = eng_tokenizer.texts_to_sequences(eng_sentences)
spa_sequences = spa_tokenizer.texts_to_sequences(spa_sentences)

max_eng_len = max(len(seq) for seq in eng_sequences)
max_spa_len = max(len(seq) for seq in spa_sequences)

eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding="post")
spa_padded = pad_sequences(spa_sequences, maxlen=max_spa_len, padding="post")

# Convert to Torch tensors
eng_tensor = torch.tensor(eng_padded, dtype=torch.long)
spa_tensor = torch.tensor(spa_padded, dtype=torch.long)

dataset = TensorDataset(eng_tensor, spa_tensor[:, :-1], spa_tensor[:, 1:])

In [4]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size]
)

train_loader = DataLoader(train_dataset, batch_size=160, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=160, shuffle=False)

In [5]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        output, (h, c) = self.lstm(x)
        return output, h, c

In [6]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(hidden_dim, hidden_dim)
        self.W2 = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, query, values):
        query = query.squeeze(0).unsqueeze(1)
        score = self.V(torch.tanh(self.W1(query) + self.W2(values)))
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * values, dim=1)
        return context_vector

In [7]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.attention = BahdanauAttention(hidden_dim)

    def forward(self, x, hidden, enc_output):
        context_vector = self.attention(hidden[0], enc_output)
        x = self.embedding(x)
        x = torch.cat([context_vector.unsqueeze(1), x], dim=-1)
        output, (h, c) = self.lstm(x, hidden)
        return self.fc(output), (h, c)

In [8]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inputs, targets):
        enc_output, h, c = self.encoder(inputs)
        dec_hidden = (h, c)
        dec_input = targets[:, 0].unsqueeze(1)
        all_predictions = []

        for t in range(targets.shape[1]):
            predictions, dec_hidden = self.decoder(dec_input, dec_hidden, enc_output)
            all_predictions.append(predictions)
            dec_input = targets[:, t].unsqueeze(1)

        return torch.cat(all_predictions, dim=1)
    
    def predict(self, inputs):
        enc_output, h, c = self.encoder(inputs)
        dec_hidden = (h, c)
        dec_input = inputs[:, 0].unsqueeze(1)
        all_predictions = []

        for t in range(inputs.shape[1]):
            predictions, dec_hidden = self.decoder(dec_input, dec_hidden, enc_output)
            all_predictions.append(predictions)
            dec_input = predictions.argmax(dim=2)

        return torch.cat(all_predictions, dim=1)

In [None]:
embedding_dim = 256
hidden_dim = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

encoder = Encoder(eng_vocab_size, embedding_dim, hidden_dim).to(device)
decoder = Decoder(spa_vocab_size, embedding_dim, hidden_dim).to(device)
model = Seq2Seq(encoder, decoder).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())
scheduler = ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=3, min_lr=1e-6
)

Using device: cuda


In [14]:
def validate(model, val_loader, criterion):
    """Evaluates the model on the validation set."""
    model.eval()
    total_loss = 0

    with torch.no_grad():
        val_tqdm = tqdm(val_loader, desc="Validation", leave=False)
        for inputs, targets_in, targets_out in val_tqdm:
            inputs, targets_in, targets_out = (
                inputs.to(device),
                targets_in.to(device),
                targets_out.to(device),
            )

            outputs = model(inputs, targets_in)
            loss = criterion(outputs.view(-1, spa_vocab_size), targets_out.view(-1))
            total_loss += loss.item()

            val_tqdm.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(val_loader)
    print(f"Validation Loss: {avg_loss:.4f}")

    return avg_loss

In [15]:
from torch.utils.tensorboard import SummaryWriter

def train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=50):
    
    writer = SummaryWriter()
    """Trains the model with validation."""
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for inputs, targets_in, targets_out in progress_bar:
            inputs, targets_in, targets_out = (
                inputs.to(device),
                targets_in.to(device),
                targets_out.to(device),
            )

            optimizer.zero_grad()
            outputs = model(inputs, targets_in)

            loss = criterion(outputs.view(-1, spa_vocab_size), targets_out.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            progress_bar.set_postfix(
                loss=loss.item()
            )

        avg_loss = total_loss / len(train_loader)

        val_loss = validate(model, val_loader, criterion)
        scheduler.step(val_loss)
        writer.add_scalar("Loss/train", avg_loss, epoch)
        writer.add_scalar("Loss/val", val_loss, epoch)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_loss:.4f} | Val Loss: {val_loss:.4f} | Learning Rate: {current_lr:.6f}")


In [16]:
train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=50)

torch.save(model.state_dict(), "seq2seq-wortok.pth")

                                                                        

Validation Loss: 6.6479
Epoch 1/50 | Train Loss: 6.7451 | Val Loss: 6.6479 | Learning Rate: 0.001000


                                                                        

Validation Loss: 6.4017
Epoch 2/50 | Train Loss: 6.4441 | Val Loss: 6.4017 | Learning Rate: 0.001000


                                                                        

Validation Loss: 6.1805
Epoch 3/50 | Train Loss: 6.1731 | Val Loss: 6.1805 | Learning Rate: 0.001000


                                                                        

Validation Loss: 5.9825
Epoch 4/50 | Train Loss: 5.9124 | Val Loss: 5.9825 | Learning Rate: 0.001000


                                                                        

Validation Loss: 5.8206
Epoch 5/50 | Train Loss: 5.6850 | Val Loss: 5.8206 | Learning Rate: 0.001000


                                                                        

Validation Loss: 5.6776
Epoch 6/50 | Train Loss: 5.4801 | Val Loss: 5.6776 | Learning Rate: 0.001000


                                                                        

Validation Loss: 5.5515
Epoch 7/50 | Train Loss: 5.2913 | Val Loss: 5.5515 | Learning Rate: 0.001000


                                                                        

Validation Loss: 5.4374
Epoch 8/50 | Train Loss: 5.1154 | Val Loss: 5.4374 | Learning Rate: 0.001000


                                                                        

Validation Loss: 5.3370
Epoch 9/50 | Train Loss: 4.9509 | Val Loss: 5.3370 | Learning Rate: 0.001000


                                                                         

Validation Loss: 5.2423
Epoch 10/50 | Train Loss: 4.7966 | Val Loss: 5.2423 | Learning Rate: 0.001000


                                                                         

Validation Loss: 5.1643
Epoch 11/50 | Train Loss: 4.6518 | Val Loss: 5.1643 | Learning Rate: 0.001000


                                                                         

Validation Loss: 5.0867
Epoch 12/50 | Train Loss: 4.5165 | Val Loss: 5.0867 | Learning Rate: 0.001000


                                                                         

Validation Loss: 5.0215
Epoch 13/50 | Train Loss: 4.3881 | Val Loss: 5.0215 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.9635
Epoch 14/50 | Train Loss: 4.2702 | Val Loss: 4.9635 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.9097
Epoch 15/50 | Train Loss: 4.1558 | Val Loss: 4.9097 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.8579
Epoch 16/50 | Train Loss: 4.0488 | Val Loss: 4.8579 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.8161
Epoch 17/50 | Train Loss: 3.9501 | Val Loss: 4.8161 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.7714
Epoch 18/50 | Train Loss: 3.8554 | Val Loss: 4.7714 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.7391
Epoch 19/50 | Train Loss: 3.7666 | Val Loss: 4.7391 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.7089
Epoch 20/50 | Train Loss: 3.6814 | Val Loss: 4.7089 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.6830
Epoch 21/50 | Train Loss: 3.6030 | Val Loss: 4.6830 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.6581
Epoch 22/50 | Train Loss: 3.5277 | Val Loss: 4.6581 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.6338
Epoch 23/50 | Train Loss: 3.4570 | Val Loss: 4.6338 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.6095
Epoch 24/50 | Train Loss: 3.3887 | Val Loss: 4.6095 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5992
Epoch 25/50 | Train Loss: 3.3254 | Val Loss: 4.5992 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5819
Epoch 26/50 | Train Loss: 3.2660 | Val Loss: 4.5819 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5622
Epoch 27/50 | Train Loss: 3.2074 | Val Loss: 4.5622 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5622
Epoch 28/50 | Train Loss: 3.1533 | Val Loss: 4.5622 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5435
Epoch 29/50 | Train Loss: 3.1014 | Val Loss: 4.5435 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5328
Epoch 30/50 | Train Loss: 3.0531 | Val Loss: 4.5328 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5244
Epoch 31/50 | Train Loss: 3.0055 | Val Loss: 4.5244 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5201
Epoch 32/50 | Train Loss: 2.9607 | Val Loss: 4.5201 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5145
Epoch 33/50 | Train Loss: 2.9184 | Val Loss: 4.5145 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5102
Epoch 34/50 | Train Loss: 2.8772 | Val Loss: 4.5102 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5074
Epoch 35/50 | Train Loss: 2.8393 | Val Loss: 4.5074 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5008
Epoch 36/50 | Train Loss: 2.8037 | Val Loss: 4.5008 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5009
Epoch 37/50 | Train Loss: 2.7670 | Val Loss: 4.5009 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5050
Epoch 38/50 | Train Loss: 2.7319 | Val Loss: 4.5050 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5035
Epoch 39/50 | Train Loss: 2.6998 | Val Loss: 4.5035 | Learning Rate: 0.001000


                                                                         

Validation Loss: 4.5048
Epoch 40/50 | Train Loss: 2.6696 | Val Loss: 4.5048 | Learning Rate: 0.000500


                                                                         

Validation Loss: 4.4828
Epoch 41/50 | Train Loss: 2.5835 | Val Loss: 4.4828 | Learning Rate: 0.000500


                                                                         

Validation Loss: 4.4890
Epoch 42/50 | Train Loss: 2.5512 | Val Loss: 4.4890 | Learning Rate: 0.000500


                                                                         

Validation Loss: 4.4903
Epoch 43/50 | Train Loss: 2.5312 | Val Loss: 4.4903 | Learning Rate: 0.000500


                                                                         

Validation Loss: 4.4970
Epoch 44/50 | Train Loss: 2.5152 | Val Loss: 4.4970 | Learning Rate: 0.000500


                                                                         

Validation Loss: 4.5037
Epoch 45/50 | Train Loss: 2.4993 | Val Loss: 4.5037 | Learning Rate: 0.000250


                                                                         

Validation Loss: 4.4987
Epoch 46/50 | Train Loss: 2.4542 | Val Loss: 4.4987 | Learning Rate: 0.000250


                                                                         

Validation Loss: 4.5037
Epoch 47/50 | Train Loss: 2.4381 | Val Loss: 4.5037 | Learning Rate: 0.000250


                                                                         

Validation Loss: 4.5045
Epoch 48/50 | Train Loss: 2.4285 | Val Loss: 4.5045 | Learning Rate: 0.000250


                                                                         

Validation Loss: 4.5086
Epoch 49/50 | Train Loss: 2.4200 | Val Loss: 4.5086 | Learning Rate: 0.000125


                                                                         

Validation Loss: 4.5097
Epoch 50/50 | Train Loss: 2.3960 | Val Loss: 4.5097 | Learning Rate: 0.000125


# Evaluation

In [9]:
embedding_dim = 256
hidden_dim = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(eng_vocab_size, embedding_dim, hidden_dim).to(device)
decoder = Decoder(spa_vocab_size, embedding_dim, hidden_dim).to(device)
model = Seq2Seq(encoder, decoder).to(device)

model.load_state_dict(
    torch.load("seq2seq-wortok.pth", weights_only=True)
)

<All keys matched successfully>

In [10]:
model.eval()
a, b, c = next(iter(val_loader))
a = a.to(device)
b = b.to(device)
c = c.to(device)
with torch.no_grad():
    x = model.predict(a)
x = x.argmax(dim=2)
x.shape

torch.Size([160, 47])

In [11]:
x = x.cpu().numpy()
b = b.cpu().numpy()
a = a.cpu().numpy()
preds = []
targets = []
original = []
for i in range(x.shape[0]):
    preds.append(spa_tokenizer.sequences_to_texts([x[i]])[0])
    targets.append([spa_tokenizer.sequences_to_texts([b[i]])[0]])
    original.append([eng_tokenizer.sequences_to_texts([a[i]])[0]])

In [12]:
import pandas as pd

df = pd.DataFrame(
    {"Original": original[:10], "Prediction": preds[:10], "Target": targets[:10]}
)
df.to_csv("attention-wordlevel.csv", index=False)
df

Unnamed: 0,Original,Prediction,Target
0,[good to see you tom],si me ver tom tom tom tom a a a a a a a a a a ...,[qué bueno verte tom]
1,[i thought tom would get here ahead of us],nos que me aquí de nosotros nosotros nosotros ...,[pensé que tom llegaría aquí antes que nosotros]
2,[he asked his friends for help],la que a ayuda ayuda sus a ayuda su ayuda la a...,[él le pidió ayuda a sus amigos]
3,[tom was a tank commander],fue un de con guerra abril guerra trigo guerra...,[tom fue un comandante de tanques]
4,[she doesnt understand you],no te lo tú lo tú lo ti usted entiende verdad ...,[ella no les entiende]
5,[this box is not as big as that one],caja no tan como grande mi es grande mi es que...,[esta caja no es tan grande como esa]
6,[we should go],ir ir ir ayudar ir caza ladrillo grafitis trig...,[deberíamos irnos]
7,[thats not my wife],que es mi esposa mi padre mi padre su idea a p...,[esa no es mi esposa]
8,[who taught you that],se que eso eso eso eso eso eso eso eso eso eso...,[quién te ha enseñado eso]
9,[tom says he doesnt think its possible to do t...,dice que no que eso hacer hacer hacer hacerlo ...,[tom dice que no cree que sea posible hacer eso]


In [None]:
import sacrebleu


def calculate_bleu_score(preds, targets):
    bleu = sacrebleu.corpus_bleu(preds, targets, force=True)  
    return bleu.score

bleu_score = calculate_bleu_score(preds, targets)
print(f"BLEU Score: {bleu_score}")

BLEU Score: 2.264164155209104


In [None]:
import rouge

rouge = rouge.Rouge()
flat_targets = [t[0] for t in targets]  
scores = rouge.get_scores(preds, flat_targets, avg=True)
scores

{'rouge-1': {'r': 0.409396884773723,
  'p': 0.19576793228415262,
  'f': 0.2399486631711552},
 'rouge-2': {'r': 0.06920765866078366,
  'p': 0.021683212182092855,
  'f': 0.029892009074832405},
 'rouge-l': {'r': 0.37724361626935154,
  'p': 0.1793810192524139,
  'f': 0.22004146595394855}}

In [17]:
def ter(preds, targets):
    return sacrebleu.corpus_ter(preds, [targets]).score


ter(preds, flat_targets)

690.0191938579654