# **1. Instalación de librerías**

In [2]:
!pip install sentencepiece sacrebleu datasets

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


# **2. Descargar dataset español–francés (HuggingFace)**


In [2]:
from datasets import load_dataset
import pandas as pd

# Build the Spanish->French sentence-pair CSV used by the rest of the notebook.
MAX_CHARS = 120     # pairs with a side of this many characters or more are dropped
MAX_PAIRS = 50000   # upper bound on the number of pairs kept

print("Descargando dataset español-francés...")
data = load_dataset("opus_books", "es-fr")

# Collect both sides of every training pair in a single pass over the split
# (each row is read once instead of twice).
src, tgt = [], []
for row in data["train"]:
    pair = row["translation"]
    src.append(pair["es"])
    tgt.append(pair["fr"])

df = pd.DataFrame({"src": src, "tgt": tgt})

# Normalise: lowercase and trim surrounding whitespace.
df["src"] = df["src"].str.lower().str.strip()
df["tgt"] = df["tgt"].str.lower().str.strip()

# Keep only pairs whose two sides are non-empty and shorter than MAX_CHARS.
# Empty sides must be dropped: an empty string is written to CSV as an empty
# field, which pandas.read_csv loads back as NaN (a float) by default.
src_ok = df["src"].str.len().between(1, MAX_CHARS - 1)
tgt_ok = df["tgt"].str.len().between(1, MAX_CHARS - 1)
df = df[src_ok & tgt_ok]

# Shuffle reproducibly and cap the corpus at MAX_PAIRS pairs.
df = df.sample(min(MAX_PAIRS, len(df)), random_state=42)

df.to_csv("dataset_es_fr.csv", index=False)
print("Dataset listo. Total pares:", len(df))

Descargando dataset español-francés...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

es-fr/train-00000-of-00001.parquet:   0%|          | 0.00/9.16M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56319 [00:00<?, ? examples/s]

Dataset listo. Total pares: 32556


# **3. Imports + Configuración**

In [3]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import sacrebleu
import sentencepiece as spm
from sklearn.model_selection import train_test_split

SEED = 42

# Seed every random number generator the notebook relies on so that data
# splits, weight initialisation and batch shuffling are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Usando:", DEVICE)

Usando: cuda


# **4. Cargar dataset y dividir**

In [4]:
# Read every cell back as the literal text that was written.
# With the defaults, read_csv converts cells such as "", "null", "nan" or "n/a"
# into NaN (a float); a single such sentence would later break the steps that
# join or tokenize the columns as strings.
df = pd.read_csv("dataset_es_fr.csv", keep_default_na=False, dtype=str)

# Hold out 10% for test, then 10% of the remainder for validation.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df  = train_test_split(train_df, test_size=0.1, random_state=SEED)

# The three splits must partition the corpus exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

Train: 26370 Val: 2930 Test: 3256


# **5. Entrenar SentencePiece**

In [5]:
os.makedirs("spm_models", exist_ok=True)

SRC_VOCAB = 4000
TGT_VOCAB = 4000

# One (column, corpus file, model prefix, vocabulary size) entry per language:
# "src" is the Spanish side, "tgt" the French side.
for column, corpus_path, model_prefix, vocab_size in (
    ("src", "spm_src.txt", "spm_models/spm_src", SRC_VOCAB),
    ("tgt", "spm_tgt.txt", "spm_models/spm_tgt", TGT_VOCAB),
):
    # Dump the training sentences of this side, one per line.
    with open(corpus_path, "w", encoding="utf8") as f:
        f.write("\n".join(train_df[column].tolist()))

    # Train the tokenizer with fixed ids for the special tokens
    # (pad=0, bos=1, eos=2, unk=3).
    spm.SentencePieceTrainer.Train(
        f"--input={corpus_path} --model_prefix={model_prefix} "
        f"--vocab_size={vocab_size} --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3"
    )

print("Tokenizers SentencePiece entrenados.")

Tokenizers SentencePiece entrenados.


# **6. Cargar tokenizers**

In [6]:
# Special-token ids; these are the values passed as --pad_id/--bos_id/--eos_id
# when the SentencePiece models were trained.
PAD, BOS, EOS = 0, 1, 2

# Source-side tokenizer.
sp_src = spm.SentencePieceProcessor()
sp_src.load("spm_models/spm_src.model")

# Target-side tokenizer.
sp_tgt = spm.SentencePieceProcessor()
sp_tgt.load("spm_models/spm_tgt.model")

def enc_src(text):
    """Tokenize a source sentence with `sp_src` and wrap the ids in BOS ... EOS."""
    piece_ids = sp_src.encode(text, out_type=int)
    return [BOS, *piece_ids, EOS]

def enc_tgt(text):
    """Tokenize a target sentence with `sp_tgt` and wrap the ids in BOS ... EOS."""
    piece_ids = sp_tgt.encode(text, out_type=int)
    return [BOS, *piece_ids, EOS]

def dec_tgt(ids):
    """Turn a list of target token ids back into text.

    Everything from the first EOS onwards is discarded, a leading BOS is
    removed, and the remaining ids are decoded with `sp_tgt`.
    """
    # Cut at the first EOS when there is one, otherwise keep the whole list.
    end = ids.index(EOS) if EOS in ids else len(ids)
    trimmed = ids[:end]
    # Skip the BOS marker if the sequence starts with it.
    start = 1 if trimmed and trimmed[0] == BOS else 0
    return sp_tgt.decode(trimmed[start:])

# **7. Dataset + DataLoader**

In [7]:
class NMTDataset(Dataset):
    """Sentence-pair dataset backed by the "src" and "tgt" columns of a DataFrame.

    Each item is a triple of 1-D tensors: the encoded source sentence, the
    decoder input (target ids without the last token) and the decoder target
    (target ids without the first token).
    """

    def __init__(self, df):
        # Plain Python lists so items are looked up by position, not by index label.
        self.src, self.tgt = (df[column].tolist() for column in ("src", "tgt"))

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        src_ids = enc_src(self.src[idx])
        tgt_ids = enc_tgt(self.tgt[idx])
        # Teacher forcing: the decoder reads tgt_ids[:-1] and predicts tgt_ids[1:].
        decoder_input = torch.tensor(tgt_ids[:-1])
        decoder_target = torch.tensor(tgt_ids[1:])
        return torch.tensor(src_ids), decoder_input, decoder_target

In [8]:
def pad_batch(seqs):
    """Stack variable-length 1-D tensors into one (len(seqs), longest) tensor.

    Positions past the end of each sequence are filled with PAD.
    """
    longest = max(map(len, seqs))
    padded = torch.full((len(seqs), longest), PAD)
    # Iterating a tensor yields row views, so writing to `row` fills `padded`.
    for row, seq in zip(padded, seqs):
        row[: len(seq)] = seq
    return padded

def collate_fn(batch):
    """Collate (src, tgt_in, tgt_out) items into three PAD-padded batch tensors.

    Returns a tuple (src_pad, tgt_in_pad, tgt_out_pad); each element is the
    result of `pad_batch` on the corresponding column of the batch.
    """
    src_seqs, tgt_in_seqs, tgt_out_seqs = zip(*batch)
    return tuple(map(pad_batch, (src_seqs, tgt_in_seqs, tgt_out_seqs)))

# One loader per split, all with batches of 64; only the training split is shuffled.
train_loader, val_loader, test_loader = (
    DataLoader(NMTDataset(split_df), batch_size=64, shuffle=is_train, collate_fn=collate_fn)
    for split_df, is_train in ((train_df, True), (val_df, False), (test_df, False))
)

print("Batches train:", len(train_loader), "Batches val:", len(val_loader))

Batches train: 413 Batches val: 46


# **8. Positional Encoding (estilo Transformer original)**

In [9]:
import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to sequence-first embeddings.

    Even feature indices hold sin(position * freq) and odd indices hold
    cos(position * freq). The table is precomputed for `max_len` positions and
    stored as a non-trainable buffer named "pe".

    Args:
        d_model: embedding size (even or odd).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so drop the last frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (max_len, 1, d_model): broadcasts over the batch dimension in forward().
        self.register_buffer("pe", pe.unsqueeze(1))

    def forward(self, x):
        """Return `x` plus the encodings of its first x.size(0) positions.

        x: (seq_len, batch, d_model)
        """
        seq_len = x.size(0)
        return x + self.pe[:seq_len]

# **9. Transformer NMT (Encoder–Decoder con auto-atención y atención cruzada)**

In [10]:
class TransformerNMT(nn.Module):
    """Encoder-decoder Transformer for translation, built on `nn.Transformer`.

    Token ids are embedded, scaled by sqrt(d_model), given positional
    encodings and passed through `nn.Transformer` in sequence-first layout;
    a final linear layer projects decoder states to target-vocabulary logits.

    Args:
        src_vocab_size: number of source token ids.
        tgt_vocab_size: number of target token ids.
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_encoder_layers: encoder depth.
        num_decoder_layers: decoder depth.
        dim_feedforward: hidden size of the feed-forward sublayers.
        dropout: dropout probability used inside the Transformer.
    """

    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        d_model=256,
        nhead=4,
        num_encoder_layers=2,
        num_decoder_layers=2,
        dim_feedforward=512,
        dropout=0.1,
    ):
        super().__init__()
        self.d_model = d_model

        self.src_embed = nn.Embedding(src_vocab_size, d_model, padding_idx=PAD)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model, padding_idx=PAD)

        self.pos_encoder = PositionalEncoding(d_model)
        self.pos_decoder = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=False  # tensors are handled as (S, B, E)
        )

        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def make_src_key_padding_mask(self, src):
        """Return a (B, S) bool mask that is True where `src` holds PAD."""
        return (src == PAD)

    def make_tgt_key_padding_mask(self, tgt):
        """Return a (B, T) bool mask that is True where `tgt` holds PAD."""
        return (tgt == PAD)

    def make_tgt_subsequent_mask(self, size, device=None):
        """Return a (size, size) bool causal mask.

        Entries strictly above the diagonal are True (masked), so position i
        cannot attend to positions after i.

        Args:
            size: target sequence length.
            device: device to create the mask on; None uses torch's default.
        """
        # Built directly on the requested device: no CPU allocation + copy.
        return torch.triu(
            torch.ones(size, size, dtype=torch.bool, device=device), diagonal=1
        )

    def forward(self, src, tgt_in):
        """Compute target-vocabulary logits under teacher forcing.

        Args:
            src: (B, S) source token ids, PAD-padded.
            tgt_in: (B, T) decoder input token ids, PAD-padded.

        Returns:
            (B, T, tgt_vocab_size) logits.
        """
        src_key_padding_mask = self.make_src_key_padding_mask(src)  # (B, S)
        tgt_key_padding_mask = self.make_tgt_key_padding_mask(tgt_in)  # (B, T)
        tgt_mask = self.make_tgt_subsequent_mask(tgt_in.size(1), device=src.device)  # (T, T)

        # Embedding scaled by sqrt(d_model)
        src_emb = self.src_embed(src) * math.sqrt(self.d_model)  # (B, S, E)
        tgt_emb = self.tgt_embed(tgt_in) * math.sqrt(self.d_model)  # (B, T, E)

        # Switch to sequence-first layout, then add positional encodings
        src_emb = self.pos_encoder(src_emb.transpose(0, 1))  # (S, B, E)
        tgt_emb = self.pos_decoder(tgt_emb.transpose(0, 1))  # (T, B, E)

        output = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask,
        )  # (T, B, E)

        output = output.transpose(0, 1)  # (B, T, E)
        logits = self.fc_out(output)     # (B, T, vocab_tgt)

        return logits

    def translate(self, text, max_len=40):
        """Greedily translate one source sentence.

        The sentence is encoded once; the decoder is then re-run on the growing
        prefix, appending the arg-max token each step until EOS is produced or
        `max_len` tokens have been generated.

        Note: this puts the module in eval mode and does not restore the
        previous mode.

        Args:
            text: source sentence (string).
            max_len: maximum number of generated tokens.

        Returns:
            The decoded target string.
        """
        self.eval()
        # Use the device the model's weights live on rather than a global, so
        # translation keeps working if the model is moved to another device.
        device = self.fc_out.weight.device
        with torch.no_grad():
            src_ids = torch.tensor([enc_src(text)], device=device)  # (1, S)
            src_key_padding_mask = self.make_src_key_padding_mask(src_ids)
            src_emb = self.src_embed(src_ids) * math.sqrt(self.d_model)
            src_emb = self.pos_encoder(src_emb.transpose(0, 1))  # (S, 1, E)

            memory = self.transformer.encoder(
                src_emb,
                src_key_padding_mask=src_key_padding_mask
            )  # (S, 1, E)

            # Autoregressive greedy decoding
            generated = [BOS]
            for _ in range(max_len):
                tgt_in = torch.tensor([generated], device=device)  # (1, len)
                tgt_emb = self.tgt_embed(tgt_in) * math.sqrt(self.d_model)
                tgt_emb = self.pos_decoder(tgt_emb.transpose(0, 1))  # (T, 1, E)

                tgt_mask = self.make_tgt_subsequent_mask(tgt_in.size(1), device=device)
                tgt_key_padding_mask = self.make_tgt_key_padding_mask(tgt_in)

                out = self.transformer.decoder(
                    tgt_emb,
                    memory,
                    tgt_mask=tgt_mask,
                    tgt_key_padding_mask=tgt_key_padding_mask,
                    memory_key_padding_mask=src_key_padding_mask,
                )  # (T, 1, E)

                # Only the last position predicts the next token.
                logits = self.fc_out(out[-1])  # (1, vocab)
                next_token = logits.argmax(-1).item()

                if next_token == EOS:
                    break
                generated.append(next_token)

            return dec_tgt(generated)

# **10. Inicializar modelo + criterio + optimizador**

In [11]:
# Model hyperparameters.
d_model, nhead = 256, 4
num_enc_layers = num_dec_layers = 2
ff_dim, dropout = 512, 0.1

# Vocabulary sizes come from the trained SentencePiece models.
model = TransformerNMT(
    src_vocab_size=sp_src.get_piece_size(),
    tgt_vocab_size=sp_tgt.get_piece_size(),
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_enc_layers,
    num_decoder_layers=num_dec_layers,
    dim_feedforward=ff_dim,
    dropout=dropout,
)
model = model.to(DEVICE)

# PAD positions in the targets do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Modelo listo. Parámetros entrenables:", n_trainable)



Modelo listo. Parámetros entrenables: 5712800


# **11. Funciones de entrenamiento y validación**

In [12]:
def train_epoch():
    """Run one optimisation pass over `train_loader`.

    Uses the module-level `model`, `criterion` and `optimizer`; gradients are
    clipped to a total norm of 1.0 before each step.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch in train_loader:
        src, tgt_in, tgt_out = (tensor.to(DEVICE) for tensor in batch)

        optimizer.zero_grad()
        logits = model(src, tgt_in)  # (B, T, vocab)

        # Flatten batch and time so every target position is one sample.
        vocab_size = logits.size(-1)
        loss = criterion(logits.reshape(-1, vocab_size), tgt_out.reshape(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_loader)

def eval_epoch():
    """Evaluate the module-level `model` on `val_loader` without gradients.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            src, tgt_in, tgt_out = (tensor.to(DEVICE) for tensor in batch)

            logits = model(src, tgt_in)
            # Flatten batch and time so every target position is one sample.
            vocab_size = logits.size(-1)
            loss = criterion(logits.reshape(-1, vocab_size), tgt_out.reshape(-1))
            running_loss += loss.item()
    return running_loss / len(val_loader)

# **13. Entrenamiento**

In [13]:
EPOCHS = 10  # can be raised to 12-15 if the time budget allows

# One training pass followed by one validation pass per epoch.
for ep in range(1, EPOCHS + 1):
    tr, vl = train_epoch(), eval_epoch()
    print(f"Epoch {ep:02d} | Train: {tr:.4f} | Val: {vl:.4f}")

Epoch 01 | Train: 5.0788 | Val: 4.4151
Epoch 02 | Train: 4.2246 | Val: 4.0164
Epoch 03 | Train: 3.8730 | Val: 3.7893
Epoch 04 | Train: 3.6228 | Val: 3.6312
Epoch 05 | Train: 3.4271 | Val: 3.5281
Epoch 06 | Train: 3.2554 | Val: 3.4472
Epoch 07 | Train: 3.1147 | Val: 3.3869
Epoch 08 | Train: 2.9865 | Val: 3.3457
Epoch 09 | Train: 2.8675 | Val: 3.3126
Epoch 10 | Train: 2.7599 | Val: 3.2883


# **14. Traducciones de prueba**

In [1]:
# A few hand-written Spanish sentences for a quick qualitative check.
# Requires `model` from the cells above (run them first on a fresh kernel).
tests = [
    "hola, ¿cómo estás?",
    "nos vemos mañana por la mañana",
    "me gusta la comida francesa",
    "estoy aprendiendo modelos de traducción automática",
    "el libro está sobre la mesa",
]
separator = "-" * 40

for sentence in tests:
    print("ES:", sentence)
    print("FR:", model.translate(sentence))
    print(separator)

ES: hola, ¿cómo estás?


NameError: name 'model' is not defined

# **15. BLEU**

In [None]:
# Corpus-level BLEU on the held-out test split.
#
# Sources and references are taken straight from test_df. Rebuilding them by
# decoding SentencePiece ids is lossy (pieces mapped to <unk> do not decode
# back to the original characters), which would score the model against
# altered references and feed it altered inputs.
model.eval()
hyps, refs = [], []

with torch.no_grad():
    for src_text, ref_text in zip(test_df["src"], test_df["tgt"]):
        hyps.append(model.translate(src_text))
        refs.append([ref_text])

# sacrebleu expects one list per reference stream; zip(*refs) turns the
# per-sentence [ref] lists into a single stream aligned with hyps.
bleu = sacrebleu.corpus_bleu(hyps, list(zip(*refs)))
print("BLEU:", bleu.score)

Epoch 1 | Train 3.2282 | Val 3.7071
Epoch 2 | Train 3.1328 | Val 3.6984
Epoch 3 | Train 3.0557 | Val 3.7040
Epoch 4 | Train 2.9767 | Val 3.7043
Epoch 5 | Train 2.9040 | Val 3.7220
Epoch 6 | Train 2.8325 | Val 3.7390
Epoch 7 | Train 2.7641 | Val 3.7420
Epoch 8 | Train 2.7034 | Val 3.7612
Epoch 9 | Train 2.6391 | Val 3.7844
Epoch 10 | Train 2.5817 | Val 3.8194
Epoch 11 | Train 2.5234 | Val 3.8351
Epoch 12 | Train 2.4699 | Val 3.8565
Epoch 13 | Train 2.4163 | Val 3.8799
Epoch 14 | Train 2.3650 | Val 3.9152
Epoch 15 | Train 2.3162 | Val 3.9475
