In [None]:
# INSTALACIONES NECESARIAS
!pip install kagglehub --quiet
!pip install transformers[torch] accelerate -U --quiet # accelerate para mejor uso de GPU/TPU

In [None]:
import os
import re
import string
import random
import kagglehub
import pandas as pd # Usaremos pandas para manejar el dataset más fácilmente
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm # Para barras de progreso

# --- CONFIGURATION ---
# Checkpoint name passed to from_pretrained() for both the tokenizer and the model.
MODEL_NAME = "distilgpt2"
# Token length every training example is padded/truncated to; also the default
# generation budget of translate_text().
MAX_LENGTH = 64
# Batch size used by the training and validation DataLoaders.
BATCH_SIZE = 16
# Number of full passes over the training set.
EPOCHS = 3
# Learning rate handed to the AdamW optimizer.
LEARNING_RATE = 5e-5
# Fractions of the full pair list reserved for validation and test.
VAL_SPLIT_RATIO = 0.1
TEST_SPLIT_RATIO = 0.1
# Seed for the shuffle that precedes the train/val/test split.
SEED = 42

SEP_TOKEN = "<|sep|>" # Separator between the English and the Spanish text
START_SPA_TOKEN = "[start]" # Marks the start of the Spanish sentence
END_SPA_TOKEN = "[end]"   # Marks the end of the Spanish sentence

# --- STEP 1: DOWNLOAD AND PREPARE THE DATASET ---
print("Descargando dataset...")
path = kagglehub.dataset_download("tejasurya/eng-spanish")
print("Archivos disponibles:", os.listdir(path))

archivo = os.path.join(path, "spa.txt")

# Read the tab-separated file: column 0 is English, column 1 is Spanish.
text_pairs = []
with open(archivo, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

# Drop the trailing element only when it really is empty (file ends with a
# newline); a file without a final newline must not lose its last pair.
if lines and lines[-1] == "":
    lines.pop()

for line in lines:
    parts = line.split("\t")
    # Lines without at least two columns (e.g. blank lines) are skipped.
    if len(parts) >= 2:
        english_text = parts[0]
        # [start]/[end] markers are NOT added here; they are added when the
        # example is formatted for GPT-2.
        spanish_text = parts[1]
        text_pairs.append((english_text, spanish_text))

# Fail with a clear message instead of an IndexError on text_pairs[0] below.
if not text_pairs:
    raise ValueError(f"No se encontraron pares inglés-español en {archivo}")

print(f"Número total de pares: {len(text_pairs)}")
print("Ejemplo de par:", text_pairs[0])

# Shuffle and split the dataset. torch is seeded as well so that the random
# initialisation of new embeddings and DataLoader shuffling are reproducible.
random.seed(SEED)
torch.manual_seed(SEED)
random.shuffle(text_pairs)

num_val_samples = int(VAL_SPLIT_RATIO * len(text_pairs))
num_test_samples = int(TEST_SPLIT_RATIO * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples - num_test_samples

# Contiguous slices of the shuffled list: train | validation | test.
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"Muestras de entrenamiento: {len(train_pairs)}")
print(f"Muestras de validación: {len(val_pairs)}")
print(f"Muestras de prueba: {len(test_pairs)}")

In [None]:
# --- STEP 2: TOKENIZER AND MODEL ---
print(f"Cargando tokenizador y modelo {MODEL_NAME}...")
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Register the special tokens with the tokenizer: a padding token, the
# English/Spanish separator, and the Spanish start/end markers.
special_tokens_dict = {
    'pad_token': '<|pad|>', # GPT-2 has no pad_token by default
    'sep_token': SEP_TOKEN,
    'additional_special_tokens': [START_SPA_TOKEN, END_SPA_TOKEN]
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"Añadidos {num_added_toks} tokens nuevos.")

# Resize the model's embeddings so they cover the newly added tokens.
# This must run after add_special_tokens(), since it uses len(tokenizer).
model.resize_token_embeddings(len(tokenizer))
print(f"Tamaño del vocabulario del tokenizador: {len(tokenizer)}")
print(f"Embeddings del modelo redimensionados a: {model.get_input_embeddings().weight.shape[0]}")

# Sanity check: print the IDs assigned to the special tokens.
print(f"ID de PAD: {tokenizer.pad_token_id}")
print(f"ID de SEP: {tokenizer.sep_token_id}")
print(f"ID de START_SPA: {tokenizer.convert_tokens_to_ids(START_SPA_TOKEN)}")
print(f"ID de END_SPA: {tokenizer.convert_tokens_to_ids(END_SPA_TOKEN)}")

In [None]:
# --- STEP 3: PYTORCH Dataset CLASS ---
class TranslationDataset(Dataset):
    """Turns (english, spanish) string pairs into causal-LM training examples.

    Each example is the single sequence
    ``"<english> <sep> [start] <spanish> [end]"`` tokenized, truncated and
    padded to ``max_length``. The labels are a copy of the input IDs in which
    the prompt (everything up to and including the ``[start]`` marker) and
    all padding positions are set to -100.

    Args:
        pairs: Sequence of ``(english_text, spanish_text)`` tuples.
        tokenizer: Tokenizer providing ``sep_token``, ``pad_token_id`` and
            ``convert_tokens_to_ids``; called with ``return_tensors="pt"``.
        max_length: Fixed token length of every returned example.
    """

    def __init__(self, pairs, tokenizer, max_length):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels``.

        All three values are 1-D tensors of length ``max_length``.
        """
        eng_text, spa_text = self.pairs[idx]

        combined_text = f"{eng_text} {self.tokenizer.sep_token} {START_SPA_TOKEN} {spa_text} {END_SPA_TOKEN}"

        tokenized_output = self.tokenizer(
            combined_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = tokenized_output.input_ids.squeeze(0)
        attention_mask = tokenized_output.attention_mask.squeeze(0)

        labels = input_ids.clone()

        # Mask the prompt: everything up to and including the first [start]
        # marker. The prompt is located inside the already-tokenized sequence,
        # so no second tokenizer call is needed.
        start_spa_token_id = self.tokenizer.convert_tokens_to_ids(START_SPA_TOKEN)
        start_positions = (input_ids == start_spa_token_id).nonzero(as_tuple=True)[0]
        if start_positions.numel() > 0:
            labels[: int(start_positions[0]) + 1] = -100
        else:
            # [start] was lost to truncation (English text filled max_length):
            # nothing of the target is left, so the whole example is masked.
            labels[:] = -100

        # Padding positions are masked as well.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Wrap the train/validation splits and batch them; only the training loader
# reshuffles its examples.
train_dataset = TranslationDataset(train_pairs, tokenizer, MAX_LENGTH)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TranslationDataset(val_pairs, tokenizer, MAX_LENGTH)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Preview one processed example to check the formatting and the label mask.
sample = train_dataset[0]
print("\nEjemplo procesado del dataset:")
print(f"Input IDs: {sample['input_ids']}")
print(f"Decoded Input: {tokenizer.decode(sample['input_ids'], skip_special_tokens=False)}")
print(f"Labels: {sample['labels']}")

# Positions whose label is not -100 are the ones that contribute to the loss.
is_active = sample['labels'] != -100
active_labels = sample['labels'][is_active]
print(f"Decoded Active Labels: {tokenizer.decode(active_labels, skip_special_tokens=False)}")
print(f"Attention Mask: {sample['attention_mask']}")


In [None]:
# --- STEP 4: TRAINING ---
# Prefer CUDA, then Apple MPS, then fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"\nUsando dispositivo: {device}")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

print("\nIniciando entrenamiento...")
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    num_train_batches = 0  # batches that actually contributed to the loss
    progress_bar_train = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS} [Training]")

    for batch in progress_bar_train:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        # A batch whose labels are all -100 produces a NaN loss; stepping on
        # it would corrupt the weights, so such batches are skipped.
        if loss is not None and torch.isfinite(loss):
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
            num_train_batches += 1
            progress_bar_train.set_postfix({'loss': loss.item()})
        else:
            print("Advertencia: La pérdida es None o no finita; se omite el batch. Verifica la preparación de labels.")

    # Average over the batches that were used; max(..., 1) avoids a
    # ZeroDivisionError when the loader is empty or every batch was skipped.
    avg_train_loss = total_train_loss / max(num_train_batches, 1)
    print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}")

    # Validation: same forward pass, without gradients or weight updates.
    model.eval()
    total_val_loss = 0
    num_val_batches = 0
    progress_bar_val = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS} [Validation]")
    with torch.no_grad():
        for batch in progress_bar_val:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            if loss is not None and torch.isfinite(loss):
                total_val_loss += loss.item()
                num_val_batches += 1
                progress_bar_val.set_postfix({'val_loss': loss.item()})

    avg_val_loss = total_val_loss / max(num_val_batches, 1)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}")

print("Entrenamiento completado.")

In [None]:
# --- STEP 5: SAVE MODEL AND TOKENIZER ---
# Both artifacts go into the same directory.
output_dir = "./gpt2-english-spanish-translator"
os.makedirs(output_dir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Modelo y tokenizador guardados en {output_dir}")

In [None]:
# --- STEP 6: TRANSLATION FUNCTION AND TEST ---
def translate_text(text_to_translate, model, tokenizer, device, max_generation_length=MAX_LENGTH):
    """Translate an English sentence to Spanish with the fine-tuned model.

    The prompt has the same layout as the training examples,
    ``"<english> <sep> [start]"``, and the model generates the continuation
    with beam search.

    Args:
        text_to_translate: English source text.
        model: Causal LM exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer carrying the special tokens used in training.
        device: Device the prompt tensors are moved to; it should be the
            device the model is on.
        max_generation_length: Maximum number of tokens generated after the
            prompt.

    Returns:
        The generated text, cut at the first end/EOS/separator token and
        stripped of special tokens and surrounding whitespace.
    """
    model.eval()

    prompt = f"{text_to_translate} {tokenizer.sep_token} {START_SPA_TOKEN}"

    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    prompt_length = input_ids.shape[-1]

    # Generation stops at [end] or at the tokenizer's own EOS token.
    eos_token_ids_list = [
        token_id
        for token_id in (tokenizer.convert_tokens_to_ids(END_SPA_TOKEN), tokenizer.eos_token_id)
        if token_id is not None
    ]

    with torch.no_grad():
        generated_outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=prompt_length + max_generation_length,  # total length (prompt + generation)
            num_beams=5,  # beam search can improve quality
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=eos_token_ids_list  # may be a list of IDs
        )

    # Keep only the continuation, then cut it at the first end/EOS/separator
    # token. The cut is done on token IDs because decoding with
    # skip_special_tokens=True removes those markers from the text, so a
    # string split on them could never match.
    generated_token_ids = generated_outputs[0][prompt_length:].tolist()
    cut_token_ids = set(eos_token_ids_list)
    sep_token_id = tokenizer.convert_tokens_to_ids(SEP_TOKEN)
    if sep_token_id is not None:
        cut_token_ids.add(sep_token_id)
    for position, token_id in enumerate(generated_token_ids):
        if token_id in cut_token_ids:
            generated_token_ids = generated_token_ids[:position]
            break

    decoded_translation = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
    return decoded_translation.strip()

test_phrases_eng = [
    "Hello, how are you?",
    "My name is John.",
    "I love to learn deep learning.",
    "This is a test.",
    "What time is it?"
]

print("\n--- Probando traducciones ---")
for phrase in test_phrases_eng:
    translation = translate_text(phrase, model, tokenizer, device)
    print(f"Inglés: {phrase}")
    print(f"Español (modelo): {translation}\n")

# Try a sentence typed by the user.
user_input = input("Introduce una frase en inglés para traducir: ").strip()
if user_input:
    translation = translate_text(user_input, model, tokenizer, device)
    # Printed only when something was translated; otherwise the stale
    # translation of the last test phrase would be shown.
    print(f"Traducción: {translation}")