In [1]:
# Mini-Transformer: traducir números (es -> en) a nivel de caracteres
# ----------------------------------------------------------
# Ejemplo didáctico: dataset pequeño (0–99), modelo Transformer sencillo
# Requiere: torch
# ----------------------------------------------------------

import math, random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ----------------------------------------------------------
# 1. Dataset: números en texto (español -> inglés)
# ----------------------------------------------------------
UNITS_EN = ["zero","one","two","three","four","five","six","seven","eight","nine",
            "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen",
            "seventeen","eighteen","nineteen"]
TENS_EN = ["","","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]

UNITS_ES = ["cero","uno","dos","tres","cuatro","cinco","seis","siete","ocho","nueve",
            "diez","once","doce","trece","catorce","quince","dieciseis","diecisiete",
            "dieciocho","diecinueve"]
TENS_ES = ["","","veinte","treinta","cuarenta","cincuenta","sesenta","setenta","ochenta","noventa"]

def num_to_en(n:int) -> str:
    if n < 20: return UNITS_EN[n]
    tens, unit = n//10, n%10
    return TENS_EN[tens] if unit==0 else TENS_EN[tens]+" "+UNITS_EN[unit]

def num_to_es(n:int) -> str:
    if n < 20: return UNITS_ES[n]
    tens, unit = n//10, n%10
    if unit==0: return TENS_ES[tens]
    if tens==2: return "veinti"+UNITS_ES[unit]  # veintiuno, veintidos...
    return TENS_ES[tens]+" "+UNITS_ES[unit]

pairs = [(num_to_es(n), num_to_en(n)) for n in range(100)]

# pequeña data augmentation: variantes y duplicados
aug_pairs = []
for n in range(21,30):
    aug_pairs.append(("veinti "+UNITS_ES[n%10], num_to_en(n)))
pairs += aug_pairs
pairs += [(src,tgt) for (src,tgt) in pairs if random.random()<0.2]

random.shuffle(pairs)
print("Dataset size:", len(pairs))

# ----------------------------------------------------------
# 2. Tokenización carácter a carácter
# ----------------------------------------------------------
ALL_TEXT = " ".join([s for s,_ in pairs] + [t for _,t in pairs])
chars = sorted(set(ALL_TEXT))

PAD,BOS,EOS,UNK = "<pad>","<bos>","<eos>","<unk>"
itos = [PAD,BOS,EOS,UNK]+chars
stoi = {ch:i for i,ch in enumerate(itos)}

PAD_IDX, BOS_IDX, EOS_IDX, UNK_IDX = stoi[PAD], stoi[BOS], stoi[EOS], stoi[UNK]
vocab_size = len(itos)
print("Vocab size:", vocab_size)

def encode(s): return [stoi.get(ch,UNK_IDX) for ch in s.lower()]
def decode(idxs):
    out=[]
    for i in idxs:
        if i in (PAD_IDX,BOS_IDX): continue
        if i==EOS_IDX: break
        out.append(itos[i])
    return "".join(out)

# ----------------------------------------------------------
# 3. Dataset/Dataloader
# ----------------------------------------------------------
class NumDataset(Dataset):
    def __init__(self,pairs,max_len=12):
        self.data=[(encode(src),encode(tgt)) for src,tgt in pairs]
        self.max_len=max_len
    def __len__(self): return len(self.data)
    def __getitem__(self,idx):
        src,tgt=self.data[idx]
        src=[BOS_IDX]+src+[EOS_IDX]
        tgt=[BOS_IDX]+tgt+[EOS_IDX]
        return torch.tensor(src[:self.max_len]), torch.tensor(tgt[:self.max_len])

def collate(batch):
    srcs,tgts=zip(*batch)
    max_s,max_t=max(len(s) for s in srcs),max(len(t) for t in tgts)
    S=torch.full((len(batch),max_s),PAD_IDX)
    T=torch.full((len(batch),max_t),PAD_IDX)
    for i,(s,t) in enumerate(zip(srcs,tgts)):
        S[i,:len(s)],T[i,:len(t)] = s,t
    return S.long(),T.long()

split=int(0.8*len(pairs))
train_ds, val_ds = NumDataset(pairs[:split]), NumDataset(pairs[split:])
train_loader=DataLoader(train_ds,batch_size=16,shuffle=True,collate_fn=collate)
val_loader=DataLoader(val_ds,batch_size=32,collate_fn=collate)

# ----------------------------------------------------------
# 4. Modelo Transformer
# ----------------------------------------------------------
class PositionalEncoding(nn.Module):
    def __init__(self,d_model,max_len=5000):
        super().__init__()
        pe=torch.zeros(max_len,d_model)
        pos=torch.arange(0,max_len).unsqueeze(1).float()
        div=torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000)/d_model))
        pe[:,0::2]=torch.sin(pos*div); pe[:,1::2]=torch.cos(pos*div)
        self.register_buffer("pe",pe)
    def forward(self,x): return x+self.pe[:x.size(1)].unsqueeze(0)

class MiniTransformer(nn.Module):
    def __init__(self,vocab_size,d_model=128,nhead=4,num_layers=2):
        super().__init__()
        self.d_model=d_model
        self.src_emb=nn.Embedding(vocab_size,d_model,padding_idx=PAD_IDX)
        self.tgt_emb=nn.Embedding(vocab_size,d_model,padding_idx=PAD_IDX)
        self.pos=PositionalEncoding(d_model)
        self.transf=nn.Transformer(d_model,nhead,num_layers,num_layers,
                                   dim_feedforward=256,batch_first=True)
        self.fc=nn.Linear(d_model,vocab_size)
    def forward(self,src,tgt):
        src_mask=None
        tgt_mask=torch.triu(torch.full((tgt.size(1),tgt.size(1)),-float('inf')),1).to(device)
        src_key=(src==PAD_IDX); tgt_key=(tgt==PAD_IDX)
        src=self.pos(self.src_emb(src)*math.sqrt(self.d_model))
        tgt=self.pos(self.tgt_emb(tgt)*math.sqrt(self.d_model))
        out=self.transf(src,tgt,tgt_mask=tgt_mask,
                        src_key_padding_mask=src_key,
                        tgt_key_padding_mask=tgt_key,
                        memory_key_padding_mask=src_key)
        return self.fc(out)

model=MiniTransformer(vocab_size).to(device)
optimizer=torch.optim.Adam(model.parameters(),lr=1e-3)
criterion=nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# ----------------------------------------------------------
# 5. Entrenamiento
# ----------------------------------------------------------
def shift_right(t): return t[:,:-1]
n_epochs=100
for ep in range(1,n_epochs+1):
    model.train(); tot=0
    for S,T in train_loader:
        S,T=S.to(device),T.to(device)
        inp=shift_right(T); out=T[:,1:]
        logits=model(S,inp)
        loss=criterion(logits[:,:out.size(1)].reshape(-1,vocab_size),
                       out.reshape(-1))
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        tot+=loss.item()
    if ep%10==0:
        print(f"Epoch {ep} | Train loss {tot/len(train_loader):.3f}")

# ----------------------------------------------------------
# 6. Decodificación greedy
# ----------------------------------------------------------
def greedy_decode(model,src,max_len=15):
    model.eval()
    src=src.unsqueeze(0).to(device)
    tgt=[BOS_IDX]
    for _ in range(max_len):
        tgt_t=torch.tensor([tgt],device=device)
        logits=model(src,tgt_t)
        next=logits[0,-1].argmax().item()
        if next==EOS_IDX: break
        tgt.append(next)
    return decode(tgt)

# ----------------------------------------------------------
# 7. Ejemplos
# ----------------------------------------------------------
examples=["cero","quince","veintidos","treinta y cinco","noventa y nueve"]
for ex in examples:
    src=torch.tensor([BOS_IDX]+encode(ex)+[EOS_IDX])
    print(f"{ex:15s} -> {greedy_decode(model,src)}")


Device: cpu
Dataset size: 134
Vocab size: 26




Epoch 10 | Train loss 0.277
Epoch 20 | Train loss 0.131
Epoch 30 | Train loss 0.075
Epoch 40 | Train loss 0.068
Epoch 50 | Train loss 0.065
Epoch 60 | Train loss 0.059
Epoch 70 | Train loss 0.075
Epoch 80 | Train loss 0.055
Epoch 90 | Train loss 0.056
Epoch 100 | Train loss 0.044
cero            -> zero
quince          -> fiften
veintidos       -> twenty two
treinta y cinco -> thirty eighthth
noventa y nueve -> ninety nine
