In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import re
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-03-22 12:40:55.444376: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-22 12:40:55.452463: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742627455.462418   12757 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742627455.465336   12757 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-22 12:40:55.475687: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
def preproc(x):
    x = re.sub(r"[^a-zA-Z0-9\s]", "", x)
    return x.lower()


def preproc_spanish(text):
    text = re.sub(r"[^a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ\s]", "", text)
    return text.lower()


# Load dataset
df = pd.read_csv("spa.txt.csv")
df["English"] = df["English"].apply(preproc)
df["Translated"] = df["Translated"].apply(preproc_spanish)

eng_sentences = df["English"].values
spa_sentences = df["Translated"].values

In [3]:
eng_tokenizer = Tokenizer(filters="")
spa_tokenizer = Tokenizer(filters="")
eng_tokenizer.fit_on_texts(eng_sentences)
spa_tokenizer.fit_on_texts(spa_sentences)

eng_vocab_size = len(eng_tokenizer.word_index) + 1
spa_vocab_size = len(spa_tokenizer.word_index) + 1

eng_sequences = eng_tokenizer.texts_to_sequences(eng_sentences)
spa_sequences = spa_tokenizer.texts_to_sequences(spa_sentences)

max_eng_len = max(len(seq) for seq in eng_sequences)
max_spa_len = max(len(seq) for seq in spa_sequences)

eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding="post")
spa_padded = pad_sequences(spa_sequences, maxlen=max_spa_len, padding="post")

# Convert to Torch tensors
eng_tensor = torch.tensor(eng_padded, dtype=torch.long)
spa_tensor = torch.tensor(spa_padded, dtype=torch.long)

dataset = TensorDataset(eng_tensor, spa_tensor[:, :-1], spa_tensor[:, 1:])

In [4]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size]
)

train_loader = DataLoader(train_dataset, batch_size=160, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=160, shuffle=False)

In [5]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        output, (h, c) = self.lstm(x)
        return output, h, c

In [6]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(hidden_dim, hidden_dim)
        self.W2 = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, query, values):
        query = query.squeeze(0).unsqueeze(1)
        score = self.V(torch.tanh(self.W1(query) + self.W2(values)))
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * values, dim=1)
        return context_vector

In [35]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.attention = BahdanauAttention(hidden_dim)

    def forward(self, x, hidden, enc_output):
        context_vector = self.attention(hidden[0], enc_output)
        x = self.embedding(x)
        x = torch.cat([context_vector.unsqueeze(1), x], dim=-1)
        output, (h, c) = self.lstm(x, hidden)
        return self.fc(output), (h, c)

In [32]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inputs, targets):
        enc_output, h, c = self.encoder(inputs)
        dec_hidden = (h, c)
        dec_input = targets[:, 0].unsqueeze(1)
        all_predictions = []

        for t in range(targets.shape[1]):
            predictions, dec_hidden = self.decoder(dec_input, dec_hidden, enc_output)
            all_predictions.append(predictions)
            dec_input = targets[:, t].unsqueeze(1)

        return torch.cat(all_predictions, dim=1)
    
    def predict(self, inputs):
        enc_output, h, c = self.encoder(inputs)
        dec_hidden = (h, c)
        dec_input = inputs[:, 0].unsqueeze(1)
        all_predictions = []

        for t in range(inputs.shape[1]):
            predictions, dec_hidden = self.decoder(dec_input, dec_hidden, enc_output)
            all_predictions.append(predictions)
            dec_input = predictions.argmax(dim=2)

        return torch.cat(all_predictions, dim=1)

In [9]:
embedding_dim = 256
hidden_dim = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to GPU
encoder = Encoder(eng_vocab_size, embedding_dim, hidden_dim).to(device)
decoder = Decoder(spa_vocab_size, embedding_dim, hidden_dim).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())
scheduler = ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=3, min_lr=1e-6
)

Using device: cuda


In [None]:
def validate(model, val_loader, criterion):
    """Evaluates the model on the validation set."""
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        val_tqdm = tqdm(val_loader, desc="Validation", leave=False)
        for inputs, targets_in, targets_out in val_tqdm:
            inputs, targets_in, targets_out = (
                inputs.to(device),
                targets_in.to(device),
                targets_out.to(device),
            )

            outputs = model(inputs, targets_in)
            loss = criterion(outputs.view(-1, spa_vocab_size), targets_out.view(-1))
            total_loss += loss.item()

            # Compute accuracy
            predictions = outputs.argmax(dim=-1)
            mask = targets_out != 0
            correct += (predictions == targets_out).masked_select(mask).sum().item()
            total += mask.sum().item()

            # Update tqdm bar with current loss
            val_tqdm.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(val_loader)
    accuracy = correct / total if total > 0 else 0
    print(f"Validation Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

    return avg_loss, accuracy

In [None]:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=50):
    """Trains the model with validation."""
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for inputs, targets_in, targets_out in progress_bar:
            inputs, targets_in, targets_out = (
                inputs.to(device),
                targets_in.to(device),
                targets_out.to(device),
            )

            optimizer.zero_grad()
            outputs = model(inputs, targets_in)

            loss = criterion(outputs.view(-1, spa_vocab_size), targets_out.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Compute accuracy
            predictions = outputs.argmax(dim=-1)
            mask = targets_out != 0
            correct += (predictions == targets_out).masked_select(mask).sum().item()
            total += mask.sum().item()

            progress_bar.set_postfix(
                loss=loss.item(), accuracy=correct / total if total > 0 else 0
            )

        avg_loss = total_loss / len(train_loader)

        # Run validation step
        val_loss, val_acc = validate(model, val_loader, criterion)
        scheduler.step(val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_loss:.4f} | Train Acc: {correct / total} Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.4f} | Learning Rate: {current_lr:.6f}")


In [None]:
train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=50)

# Save model
torch.save(model.state_dict(), "seq2seq-chatgpt.pth")

In [None]:
encoder = Encoder(eng_vocab_size, embedding_dim, hidden_dim).to(device)
decoder = Decoder(spa_vocab_size, embedding_dim, hidden_dim).to(device)
model = Seq2Seq(encoder, decoder).to(device)

model.load_state_dict(
    torch.load("seq2seq-chatgpt.pth", weights_only=True)
)

<All keys matched successfully>

In [39]:
model.eval()
# with torch.no_grad():
a, b, c = next(iter(train_loader))
a = a.to(device)
b = b.to(device)
c = c.to(device)
with torch.no_grad():
    x = model.predict(a)
x = x.argmax(dim=2)
x.shape

torch.Size([160, 47])

In [55]:
pd.DataFrame([c[0].tolist(), x[0].tolist()]).T

Unnamed: 0,0,1
0,8.0,301.0
1,48.0,8.0
2,1.0,382.0
3,228.0,99.0
4,621.0,621.0
5,0.0,650.0
6,0.0,621.0
7,0.0,650.0
8,0.0,621.0
9,0.0,621.0


In [64]:
print(spa_tokenizer.sequences_to_texts([b[6].tolist()]))
print(spa_tokenizer.sequences_to_texts([x[6].tolist()]))

['estoy tan devastado']
['tan como vikingos pelo kilogramos castaño hidrógeno agachaba y cristianismo caminó hueso las por rojo numerosas violentas caminó 19 perdedora ensució enjugó mojadas ayunando las débil imperceptible golf ladrón cuerda incendio despertaras tumbarte regresamos regresamos despedirnos la rota arena la premio rock la rota club la con']
