# ENFOQUE ABSTRACTIVO

**Instalación y configuración**

In [1]:
!pip install transformers datasets accelerate sentencepiece

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
!pip install --upgrade datasets fsspec

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are i

**Carga de MLSum en español y particionado**

In [4]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# 1. Cargar datasets y unirlos
dataset = load_dataset("mlsum", "es", cache_dir="/content/mlsum_cache", download_mode="force_redownload")
train_df = pd.DataFrame(dataset["train"])
val_df = pd.DataFrame(dataset["validation"])
noticias_df = pd.read_csv("noticias_10.csv")
tsv_df = pd.read_csv("wikipedia_resumenes.tsv", sep="\t")

# Estandarizar nombres de columna
for df in [train_df, val_df, noticias_df, tsv_df]:
    df.rename(columns={"text": "texto", "summary": "resumen"}, inplace=True)

# Unir todo
combined_df = pd.concat([train_df[["texto", "resumen"]],
                         val_df[["texto", "resumen"]],
                         noticias_df[["texto", "resumen"]],
                         tsv_df[["texto", "resumen"]]],
                        ignore_index=True)

# Limpieza básica
combined_df.dropna(subset=["texto"], inplace=True)
combined_df = combined_df[combined_df["texto"].str.strip() != ""]

print(f"Dataset combinado listo con {combined_df.shape[0]} registros.")
combined_df.head(3)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


mlsum.py:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/77.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/266367 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10358 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/13920 [00:00<?, ? examples/s]

Dataset combinado listo con 276745 registros.


Unnamed: 0,texto,resumen
0,"De momento, no podemos responder a la pregunta...",Sofres no ofrece datos por ser festivo.- Telec...
1,Los vuelos han venido registrando este viernes...,El aeropuerto ha estado hasta las 15.00 con só...
2,El Gobierno turco ha anunciado que emprenderá ...,"El origen de la leyenda, el san Nicolás histór..."


#Enfoque abstractivo  - Fine-tuning T5

Convertir a Dataset de Hugging Face y dividir en Train / Validation

In [None]:
from datasets import Dataset

dataset_hf = Dataset.from_pandas(combined_df)
split = dataset_hf.train_test_split(test_size=0.1, seed=42)
train_ds = split['train']
val_ds = split['test']


**Tokenización y formato para Seq2Seq**

In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

max_input_length = 1024
max_target_length = 256

def preprocess(batch):
    inputs = tokenizer(
        batch["texto"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["resumen"],
            max_length=max_target_length,
            truncation=True,
            padding="max_length"
        )
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = labels["input_ids"]
    return batch

train_ds = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_ds = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/249061 [00:00<?, ? examples/s]



Map:   0%|          | 0/27674 [00:00<?, ? examples/s]

**DataCollator y métricas de evaluación**

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9d958268598d54ee4849d49a37f2b862d2e3bd5729bc92da978fb4d7889f87ed
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from rouge_score import rouge_scorer
import numpy as np

data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=True)
def compute_metrics(pred):
    decoded_preds = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
    agg = {k: [] for k in ["rouge1","rouge2","rougeL"]}
    for p, l in zip(decoded_preds, decoded_labels):
        scores = scorer.score(l, p)
        for k,v in scores.items():
            agg[k].append(v.fmeasure)
    return {k: float(np.mean(v)) for k,v in agg.items()}




**Argumentos de entrenamiento y Trainer**

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM

# Argumentos de entrenamiento
training_args = Seq2SeqTrainingArguments(
    output_dir="./finetuned-model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=50,
    logging_first_step=True,
    save_total_limit=4,
    fp16=True,
    disable_tqdm=False,
    report_to=["none"],
    eval_steps=500,
    save_steps=500
)

# Cargar el modelo base
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Crear el trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


trainer.args.load_best_model_at_end = True
trainer.args.metric_for_best_model = "eval_loss"
trainer.args.greater_is_better = False


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,16.6071
50,5.6697
100,0.7573
150,0.5835
200,0.546
250,0.533
300,0.5084
350,0.5033
400,0.503
450,0.5038


TrainOutput(global_step=38915, training_loss=0.41030168549254487, metrics={'train_runtime': 28199.1042, 'train_samples_per_second': 44.161, 'train_steps_per_second': 1.38, 'total_flos': 3.370476432449864e+17, 'train_loss': 0.41030168549254487, 'epoch': 4.999453955609803})

Guardar, comprimir y descargar modelo

In [None]:
# Guarda el modelo y tokenizer en el directorio local de Colab
output_dir = "/content/finetuned_mlsum_es"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Comprime todo en un ZIP
!zip -r finetuned_mlsum_es.zip finetuned_mlsum_es

# Descarga el ZIP a tu máquina
from google.colab import files
files.download("finetuned_mlsum_es.zip")


  adding: finetuned_mlsum_es/ (stored 0%)
  adding: finetuned_mlsum_es/model.safetensors (deflated 8%)
  adding: finetuned_mlsum_es/spiece.model (deflated 48%)
  adding: finetuned_mlsum_es/generation_config.json (deflated 29%)
  adding: finetuned_mlsum_es/tokenizer_config.json (deflated 95%)
  adding: finetuned_mlsum_es/config.json (deflated 63%)
  adding: finetuned_mlsum_es/tokenizer.json (deflated 74%)
  adding: finetuned_mlsum_es/special_tokens_map.json (deflated 85%)
  adding: finetuned_mlsum_es/training_args.bin (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>