## **1. Instalación de librerías**

In [1]:
!pip install sentencepiece sacrebleu datasets

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


# **2. Descargar dataset español–francés (HuggingFace)**


In [2]:
from datasets import load_dataset
import pandas as pd

print("Descargando dataset español-francés...")
data = load_dataset("opus_books", "es-fr")

src = [x["translation"]["es"] for x in data["train"]]
tgt = [x["translation"]["fr"] for x in data["train"]]

df = pd.DataFrame({"src": src, "tgt": tgt})

# Normalizar
df["src"] = df["src"].str.lower().str.strip()
df["tgt"] = df["tgt"].str.lower().str.strip()

# Filtrar oraciones muy largas
df = df[df["src"].str.len() < 120]
df = df[df["tgt"].str.len() < 120]

# Tomar máximo 50k
df = df.sample(min(50000, len(df)), random_state=42)

df.to_csv("dataset_es_fr.csv", index=False)
print("Dataset listo. Total pares:", len(df))

Descargando dataset español-francés...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

es-fr/train-00000-of-00001.parquet:   0%|          | 0.00/9.16M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56319 [00:00<?, ? examples/s]

Dataset listo. Total pares: 32556


# **3. Imports + Configuración**

In [3]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import sacrebleu
import sentencepiece as spm
from sklearn.model_selection import train_test_split

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Usando:", DEVICE)

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

Usando: cuda


# **4. Cargar dataset y dividir**

In [4]:
df = pd.read_csv("dataset_es_fr.csv")

train_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df  = train_test_split(train_df, test_size=0.1, random_state=SEED)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

Train: 26370 Val: 2930 Test: 3256


# **5. Entrenar SentencePiece**

In [5]:
os.makedirs("spm_models", exist_ok=True)

SRC_VOCAB = 4000
TGT_VOCAB = 4000

with open("spm_src.txt", "w", encoding="utf8") as f:
    f.write("\n".join(train_df["src"].tolist()))

with open("spm_tgt.txt", "w", encoding="utf8") as f:
    f.write("\n".join(train_df["tgt"].tolist()))

spm.SentencePieceTrainer.Train(
    f"--input=spm_src.txt --model_prefix=spm_models/spm_src "
    f"--vocab_size={SRC_VOCAB} --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3"
)

spm.SentencePieceTrainer.Train(
    f"--input=spm_tgt.txt --model_prefix=spm_models/spm_tgt "
    f"--vocab_size={TGT_VOCAB} --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3"
)

print("SentencePiece listo.")

SentencePiece listo.


# **6. Cargar tokenizers**

In [6]:
sp_src = spm.SentencePieceProcessor()
sp_tgt = spm.SentencePieceProcessor()

sp_src.load("spm_models/spm_src.model")
sp_tgt.load("spm_models/spm_tgt.model")

PAD = 0
BOS = 1
EOS = 2

def enc_src(t): return [BOS] + sp_src.encode(t, out_type=int) + [EOS]
def enc_tgt(t): return [BOS] + sp_tgt.encode(t, out_type=int) + [EOS]

def dec_tgt(ids):
    if EOS in ids: ids = ids[:ids.index(EOS)]
    if ids and ids[0] == BOS: ids = ids[1:]
    return sp_tgt.decode(ids)


# **7. Dataset + DataLoader**

In [9]:
class NMTDataset(Dataset):
    def __init__(self, df):
        self.src = df["src"].tolist()
        self.tgt = df["tgt"].tolist()

    def __len__(self): return len(self.src)

    def __getitem__(self, idx):
        s = enc_src(self.src[idx])
        t = enc_tgt(self.tgt[idx])
        return torch.tensor(s), torch.tensor(t[:-1]), torch.tensor(t[1:])

In [10]:
def pad_batch(seqs):
    max_len = max(len(s) for s in seqs)
    out = torch.full((len(seqs), max_len), PAD)
    for i, s in enumerate(seqs):
        out[i, :len(s)] = s
    return out

def collate(batch):
    src, tin, tout = zip(*batch)
    src_pad = pad_batch(src)
    tin_pad = pad_batch(tin)
    tout_pad = pad_batch(tout)

    lengths = torch.tensor([len(s) for s in src])
    lengths, idx = lengths.sort(descending=True)

    return src_pad[idx], lengths, tin_pad[idx], tout_pad[idx]

In [11]:
train_loader = DataLoader(NMTDataset(train_df), 64, True, collate_fn=collate)
val_loader   = DataLoader(NMTDataset(val_df),   64, False, collate_fn=collate)
test_loader  = DataLoader(NMTDataset(test_df),  64, False, collate_fn=collate)

# **8. Atención LUONG (dot)**

In [12]:
class LuongAttention(nn.Module):
    def __init__(self, hidden):
        super().__init__()
        self.hidden = hidden

    def forward(self, hidden, encoder_outputs):
        # hidden: (1, B, H)
        # encoder_outputs: (B, T, H)
        hidden = hidden.permute(1, 0, 2)   # (B, 1, H)

        # dot score
        scores = torch.bmm(encoder_outputs, hidden.transpose(1, 2))  # (B, T, 1)

        attn_weights = torch.softmax(scores, dim=1)  # (B, T, 1)
        context = torch.sum(attn_weights * encoder_outputs, dim=1)  # (B, H)

        return context, attn_weights

# **9. Encoder GRU**

In [13]:
class Encoder(nn.Module):
    def __init__(self, vocab, emb, hid):
        super().__init__()
        self.emb = nn.Embedding(vocab, emb, padding_idx=PAD)
        self.gru = nn.GRU(emb, hid, batch_first=True)

    def forward(self, src, lengths):
        x = self.emb(src)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True)
        outputs, h = self.gru(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        return outputs, h

# **10. Decoder GRU + Atención Luong**

In [14]:
class Decoder(nn.Module):
    def __init__(self, vocab, emb, hid):
        super().__init__()
        self.emb = nn.Embedding(vocab, emb, padding_idx=PAD)
        self.att = LuongAttention(hid)
        self.gru = nn.GRU(emb + hid, hid, batch_first=True)
        self.fc  = nn.Linear(hid, vocab)

    def forward(self, tgt_in, h, encoder_outputs):
        x = self.emb(tgt_in)

        outputs = []
        for t in range(x.size(1)):
            context, _ = self.att(h, encoder_outputs)
            gru_in = torch.cat([x[:, t:t+1, :], context.unsqueeze(1)], dim=2)
            out, h = self.gru(gru_in, h)
            outputs.append(self.fc(out))
        return torch.cat(outputs, dim=1)

# **11. Seq2Seq completo**

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, enc, dec):
        super().__init__()
        self.enc = enc
        self.dec = dec

    def forward(self, src, lengths, tgt_in):
        enc_out, h = self.enc(src, lengths)
        return self.dec(tgt_in, h, enc_out)

    def translate(self, text, max_len=40):
        self.eval()
        with torch.no_grad():
            ids = torch.tensor([enc_src(text)], device=DEVICE)
            lengths = torch.tensor([ids.size(1)], device=DEVICE)

            enc_out, h = self.enc(ids, lengths)
            cur = torch.tensor([[BOS]], device=DEVICE)
            gen = []

            for _ in range(max_len):
                out = self.dec(cur, h, enc_out)
                nxt = out[0, -1].argmax().item()
                if nxt == EOS: break
                gen.append(nxt)
                cur = torch.tensor([[nxt]], device=DEVICE)

            return dec_tgt(gen)

# **12. Inicializar modelo + optimizer**

In [16]:
EMB = 256
HID = 512

enc = Encoder(sp_src.get_piece_size(), EMB, HID)
dec = Decoder(sp_tgt.get_piece_size(), EMB, HID)
model = Seq2Seq(enc, dec).to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=PAD)
opt = torch.optim.Adam(model.parameters(), lr=0.0005)

# **13. Entrenamiento**

In [17]:
def train_epoch():
    model.train()
    total = 0
    for src, lengths, tin, tout in train_loader:
        src, lengths = src.to(DEVICE), lengths.to(DEVICE)
        tin, tout = tin.to(DEVICE), tout.to(DEVICE)

        opt.zero_grad()
        pred = model(src, lengths, tin)
        loss = criterion(pred.reshape(-1, pred.size(-1)), tout.reshape(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        total += loss.item()
    return total / len(train_loader)

def val_epoch():
    model.eval()
    total = 0
    with torch.no_grad():
        for src, lengths, tin, tout in val_loader:
            src, lengths = src.to(DEVICE), lengths.to(DEVICE)
            tin, tout = tin.to(DEVICE), tout.to(DEVICE)
            pred = model(src, lengths, tin)
            loss = criterion(pred.reshape(-1, pred.size(-1)), tout.reshape(-1))
            total += loss.item()
    return total / len(val_loader)

In [18]:
EPOCHS = 12
for ep in range(1, EPOCHS+1):
    tr = train_epoch()
    vl = val_epoch()
    print(f"Epoch {ep} | Train {tr:.4f} | Val {vl:.4f}")

Epoch 1 | Train 5.0470 | Val 4.3749
Epoch 2 | Train 4.0418 | Val 3.8852
Epoch 3 | Train 3.6006 | Val 3.6448
Epoch 4 | Train 3.2839 | Val 3.5058
Epoch 5 | Train 3.0173 | Val 3.4133
Epoch 6 | Train 2.7749 | Val 3.3708
Epoch 7 | Train 2.5490 | Val 3.3630
Epoch 8 | Train 2.3360 | Val 3.3761
Epoch 9 | Train 2.1308 | Val 3.4121
Epoch 10 | Train 1.9346 | Val 3.4632
Epoch 11 | Train 1.7552 | Val 3.5206
Epoch 12 | Train 1.5809 | Val 3.5905


# **14. Traducciones de prueba**

In [None]:
tests = [
    "hola, ¿cómo estás?",
    "mañana voy a estudiar en la biblioteca",
    "me gusta la comida francesa",
    "estoy aprendiendo modelos de traducción automática"
]

for t in tests:
    print("ES:", t)
    print("FR:", model.translate(t))
    print("-"*40)

# **15. BLEU**

In [None]:
hyps, refs = [], []

with torch.no_grad():
    for src, lengths, tin, tout in test_loader:
        src, lengths = src.to(DEVICE), lengths.to(DEVICE)
        for i in range(src.size(0)):
            ids = src[i][:lengths[i]].tolist()
            if EOS in ids:
                ids = ids[1:ids.index(EOS)]
            else:
                ids = ids[1:]

            src_txt = sp_src.decode(ids)
            hyp = model.translate(src_txt)
            gold = sp_tgt.decode([x for x in tout[i].tolist() if x not in [PAD,BOS,EOS]])

            hyps.append(hyp)
            refs.append([gold])

bleu = sacrebleu.corpus_bleu(hyps, list(zip(*refs)))
print("BLEU:", bleu.score)

Epoch 1 | Train 3.2282 | Val 3.7071
Epoch 2 | Train 3.1328 | Val 3.6984
Epoch 3 | Train 3.0557 | Val 3.7040
Epoch 4 | Train 2.9767 | Val 3.7043
Epoch 5 | Train 2.9040 | Val 3.7220
Epoch 6 | Train 2.8325 | Val 3.7390
Epoch 7 | Train 2.7641 | Val 3.7420
Epoch 8 | Train 2.7034 | Val 3.7612
Epoch 9 | Train 2.6391 | Val 3.7844
Epoch 10 | Train 2.5817 | Val 3.8194
Epoch 11 | Train 2.5234 | Val 3.8351
Epoch 12 | Train 2.4699 | Val 3.8565
Epoch 13 | Train 2.4163 | Val 3.8799
Epoch 14 | Train 2.3650 | Val 3.9152
Epoch 15 | Train 2.3162 | Val 3.9475
