# COMPARATIVA DE MODELOS Y ENFOQUES

In [46]:
!pip install transformers gradio rouge-score bert-score




In [47]:
!pip install sumy



## Cargar modelos

In [48]:
import zipfile
import os

# Each archive "<name>.zip" is unpacked into a directory called "<name>".
nombres_modelos = ("modelo_abs_1", "modelo_abs_2")

# First pass: make sure every target directory exists.
for nombre_modelo in nombres_modelos:
    os.makedirs(nombre_modelo, exist_ok=True)

# Second pass: extract every archive into its directory.
for nombre_modelo in nombres_modelos:
    with zipfile.ZipFile(f"{nombre_modelo}.zip", 'r') as archivo_zip:
        archivo_zip.extractall(nombre_modelo)


In [49]:
import json

def corregir_config_modelo(ruta, tipo="t5"):
    """Ensure the model's ``config.json`` declares a ``model_type``.

    Reads ``<ruta>/config.json``. When the ``model_type`` key is absent it is
    added with the value ``tipo`` and the file is rewritten with a 2-space
    indent; otherwise the file is left untouched. A status line is printed in
    every case, including when the file does not exist.

    Args:
        ruta: Directory expected to contain ``config.json``.
        tipo: Value stored under ``model_type`` when the key is missing.

    Returns:
        None.
    """
    ruta_config = os.path.join(ruta, "config.json")
    try:
        # Read and write as UTF-8 explicitly instead of relying on the
        # platform's locale encoding.
        with open(ruta_config, "r", encoding="utf-8") as f:
            config = json.load(f)

        if "model_type" in config:
            print(f"Ya tenía model_type: {ruta}")
            return

        config["model_type"] = tipo
        with open(ruta_config, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        print(f"✅ Corregido: {ruta}")
    except FileNotFoundError:
        print(f"No se encontró config.json en {ruta}")


In [50]:
# Patch the config of both fine-tuned checkpoints.
for ruta_modelo in ("modelo_abs_1/finetuned_mlsum_es", "modelo_abs_2/finetuned_mlsum_es"):
    corregir_config_modelo(ruta_modelo, tipo="t5")



Ya tenía model_type: modelo_abs_1/finetuned_mlsum_es
Ya tenía model_type: modelo_abs_2/finetuned_mlsum_es


In [51]:
from transformers import pipeline

# Checkpoint directories inside the extracted model folders.
RUTA_MODELO_1 = "modelo_abs_1/finetuned_mlsum_es"
RUTA_MODELO_2 = "modelo_abs_2/finetuned_mlsum_es"
# Identifier of the pretrained model that is used without fine-tuning (XLSum).
ID_MODELO_3 = "csebuetnlp/mt5_multilingual_XLSum"

# Models 1 and 2: summarization pipelines whose model and tokenizer are both
# loaded from the same checkpoint directory.
abstractivo_modelo_1 = pipeline("summarization", model=RUTA_MODELO_1, tokenizer=RUTA_MODELO_1)
abstractivo_modelo_2 = pipeline("summarization", model=RUTA_MODELO_2, tokenizer=RUTA_MODELO_2)

# Model 3: pretrained, no fine-tuning (XLSum).
# NOTE(review): only this pipeline pins framework="pt" and device=0 — confirm
# that the difference from models 1 and 2 is intended.
abstractivo_modelo_3 = pipeline(
    "summarization",
    model=ID_MODELO_3,
    tokenizer=ID_MODELO_3,
    framework="pt",
    device=0,
)

Device set to use cpu
Device set to use cpu
Device set to use cpu


## Enfoque extractivo

## Métricas

In [52]:
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np
import re
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import nltk
import networkx as nx
from nltk.corpus import stopwords


# Download the NLTK resources used below (sentence tokenizer data and stop words).
for recurso_nltk in ('punkt', 'punkt_tab', "stopwords"):
    nltk.download(recurso_nltk)

# Language name passed to the sumy tokenizer, stemmer and stop-word lookup.
LANGUAGE = "spanish"


# Shared scorer for ROUGE-1, ROUGE-2 and ROUGE-L.
# NOTE(review): use_stemmer=True is kept as-is; confirm the stemmer it enables
# is appropriate for Spanish text.
TIPOS_ROUGE = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(TIPOS_ROUGE, use_stemmer=True)

def limpieza_basica(texto):
    """Remove simple markup from ``texto`` and flatten it to a single line.

    Applied in order: bracketed numeric markers such as ``[12]`` are removed,
    spans delimited by ``==`` are removed, and every run of newlines becomes
    one space. The result is stripped of leading and trailing whitespace.
    """
    sustituciones = (
        (r'\[\d+\]', ''),
        (r'==.*?==', ''),
        (r'\n+', ' '),
    )
    limpio = texto
    for patron, reemplazo in sustituciones:
        limpio = re.sub(patron, reemplazo, limpio)
    return limpio.strip()

def calcular_rouge(texto, resumen):
    """Format the ROUGE F-measures obtained from ``scorer.score(texto, resumen)``.

    Uses the module-level ``scorer``. Returns one ``"<metric>: <value>"`` line
    per metric the scorer reports, with the F-measure rounded to three
    decimals, joined by newlines.
    """
    lineas = []
    for nombre_metrica, resultado in scorer.score(texto, resumen).items():
        f_medida = round(resultado.fmeasure, 3)
        lineas.append(f"{nombre_metrica}: {f_medida}")
    return "\n".join(lineas)

def calcular_bertscore(texto, resumen):
    """Format BERTScore precision, recall and F1 for one summary.

    ``resumen`` is passed as the only item of the first list and ``texto`` as
    the only item of the second, with ``lang='es'`` and the
    ``xlm-roberta-large`` model. Returns three lines (``Precision``,
    ``Recall``, ``F1``), each formatted to three decimals.
    """
    precisiones, recalls, f1s = bert_score(
        [resumen],
        [texto],
        lang='es',
        model_type='xlm-roberta-large',
    )
    # A single pair was scored, so position 0 holds its values.
    precision, recall, f1 = precisiones[0], recalls[0], f1s[0]
    return "\n".join((
        f"Precision: {precision:.3f}",
        f"Recall: {recall:.3f}",
        f"F1: {f1:.3f}",
    ))

def calcular_compresion(texto, resumen):
    """Return the summary/text word-count ratio as a two-decimal string.

    Words are whitespace-separated tokens. When ``texto`` contains no words
    the result is ``"0.00"``.
    """
    n_texto, n_resumen = len(texto.split()), len(resumen.split())
    if n_texto <= 0:
        return "0.00"
    proporcion = n_resumen / n_texto
    return f"{proporcion:.2f}"

# Extractive summary based on TF-IDF
def resumen_extractivo_tfidf(texto, n_frases=3):
    """Keep the ``n_frases`` sentences with the largest summed TF-IDF weight.

    ``texto`` is split into Spanish sentences. If there are no more than
    ``n_frases`` of them, all are returned. Otherwise every sentence is scored
    by the sum of its TF-IDF row, and the top-scoring sentences are returned
    in their original order, joined by single spaces.
    """
    oraciones = sent_tokenize(texto, language='spanish')
    if len(oraciones) > n_frases:
        matriz = TfidfVectorizer().fit_transform(oraciones)
        pesos = np.sum(matriz.toarray(), axis=1)
        mejores = np.argsort(pesos)[-n_frases:]
        # Restore document order among the selected sentences.
        oraciones = [oraciones[i] for i in sorted(mejores)]
    return " ".join(oraciones)

# Extractive summary using the LEAD-3 approach
def resumen_lead(texto, n_frases=3):
    """Return the first ``n_frases`` Spanish sentences of ``texto``, space-joined."""
    primeras = sent_tokenize(texto, language="spanish")[:n_frases]
    return " ".join(primeras)

def resumen_luhn(texto, n_frases=3):
    """Extractive summary that scores sentences by runs of frequent words.

    ``texto`` is split into Spanish sentences; if there are no more than
    ``n_frases`` of them, all are returned. Otherwise the 20 most frequent
    terms (Spanish stop words excluded) are treated as key terms. Each
    sentence is scored by its best run of consecutive key terms, using
    ``length**2 / (length + 1)``, and the ``n_frases`` top-scoring sentences
    are returned in their original order, joined by single spaces.
    """
    oraciones = sent_tokenize(texto, language="spanish")
    if len(oraciones) <= n_frases:
        return " ".join(oraciones)

    # Term frequencies over all sentences, ignoring Spanish stop words.
    contador = CountVectorizer(stop_words=stopwords.words("spanish"))
    frecuencias = contador.fit_transform(oraciones).toarray().sum(axis=0)
    indices_top = np.argsort(frecuencias)[-20:]
    terminos_clave = set(np.array(contador.get_feature_names_out())[indices_top])

    def _puntuar(oracion):
        # Only the length of each run of consecutive key terms matters for
        # the score, so run lengths are tracked instead of the words.
        rachas = []
        racha = 0
        for palabra in re.findall(r"\b\w+\b", oracion.lower()):
            if palabra in terminos_clave:
                racha += 1
            elif racha:
                rachas.append(racha)
                racha = 0
        if racha:
            rachas.append(racha)
        return max(((largo ** 2) / (largo + 1) for largo in rachas), default=0)

    puntajes = [_puntuar(oracion) for oracion in oraciones]

    seleccion = np.argsort(puntajes)[-n_frases:]
    return " ".join(oraciones[i] for i in sorted(seleccion))


def resumen_textrank(texto, n_frases=3):
    """Summarise ``texto`` with sumy's TextRank summarizer.

    The text is parsed with a tokenizer for ``LANGUAGE``; the summarizer is
    built with a stemmer and stop words for the same language. The sentences
    it returns for ``n_frases`` are converted to strings and joined by spaces.
    """
    analizador = PlaintextParser.from_string(texto, Tokenizer(LANGUAGE))

    resumidor = TextRankSummarizer(Stemmer(LANGUAGE))
    resumidor.stop_words = get_stop_words(LANGUAGE)

    oraciones_elegidas = resumidor(analizador.document, n_frases)
    return " ".join(map(str, oraciones_elegidas))

def resumen_textrank_tfidf(texto, n_frases=3):
    """Extractive summary: PageRank over a TF-IDF sentence-similarity graph.

    ``texto`` is split into Spanish sentences; if there are no more than
    ``n_frases`` of them, all are returned. Otherwise the sentences are
    vectorised with TF-IDF, a weighted graph is built from their pairwise
    similarity matrix, and PageRank scores each sentence.

    Args:
        texto: Text to summarise.
        n_frases: Number of sentences to keep.

    Returns:
        The ``n_frases`` highest-ranked sentences in their original order,
        joined by single spaces.
    """
    frases = sent_tokenize(texto, language="spanish")
    if len(frases) <= n_frases:
        return " ".join(frases)

    # TF-IDF vector per sentence (rows are L2-normalised by default).
    tfidf_matrix = TfidfVectorizer().fit_transform(frases)

    # Pairwise similarity. `@` is always the matrix product, whereas `*` only
    # means that for the legacy scipy sparse-matrix classes.
    sim_matrix = (tfidf_matrix @ tfidf_matrix.T).toarray()

    # Weighted sentence graph scored with PageRank.
    grafo = nx.from_numpy_array(sim_matrix)
    puntuaciones = nx.pagerank(grafo)

    # Take the best-ranked sentences, then restore document order.
    frases_idx = sorted(puntuaciones, key=puntuaciones.get, reverse=True)[:n_frases]
    return " ".join(frases[i] for i in sorted(frases_idx))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
def resumen_comparado(texto):
    """Summarise ``texto`` with every method and compute metrics for each.

    The text is cleaned with ``limpieza_basica`` and then summarised by the
    three abstractive pipelines and the five extractive methods. ROUGE,
    BERTScore and the compression ratio are computed for each summary
    against the cleaned text.

    Args:
        texto: Raw input text.

    Returns:
        A flat tuple with four strings per method (summary, ROUGE, BERTScore,
        compression), in this method order: model 1, model 2, model 3,
        LEAD-3, Luhn, TF-IDF, TextRank, TextRank-TFIDF. Gradio assigns
        returned values to output boxes by position, and this is the order in
        which the interface declares its boxes.
    """
    texto_limpio = limpieza_basica(texto)

    # Same generation settings for the three abstractive models.
    opciones_generacion = dict(max_length=256, min_length=60, do_sample=False)
    resumenes_abstractivos = [
        modelo(texto_limpio, **opciones_generacion)[0]['summary_text']
        for modelo in (abstractivo_modelo_1, abstractivo_modelo_2, abstractivo_modelo_3)
    ]

    # Order matters: LEAD-3, Luhn, TF-IDF follows the interface's box layout.
    # The earlier order (TF-IDF, LEAD-3, Luhn) put these three summaries and
    # their metrics under the wrong labels.
    resumenes_extractivos = [
        resumen_lead(texto_limpio, n_frases=3),
        resumen_luhn(texto_limpio, n_frases=3),
        resumen_extractivo_tfidf(texto_limpio, n_frases=3),
        resumen_textrank(texto_limpio, n_frases=3),
        resumen_textrank_tfidf(texto_limpio, n_frases=3),
    ]

    resultados = []
    for resumen in resumenes_abstractivos + resumenes_extractivos:
        rouge = calcular_rouge(texto_limpio, resumen)
        bert = calcular_bertscore(texto_limpio, resumen)
        comp = calcular_compresion(texto_limpio, resumen)
        resultados.extend([resumen, rouge, bert, f"Tasa de compresión: {comp}"])

    return tuple(resultados)


## Interfaz Gradio

In [54]:
!pip install gradio




In [55]:
import gradio as gr
# One entry per summarisation method: (label of the summary box, short name
# used in the labels of its metric boxes). Gradio assigns the function's
# return values to the output boxes by position, so this order must match
# the order in which resumen_comparado returns its results.
METODOS_RESUMEN = [
    ("Resumen Modelo 1", "Modelo 1"),
    ("Resumen Modelo 2", "Modelo 2"),
    ("Resumen Modelo 3 (XLSum)", "Modelo 3"),
    ("Resumen Extractivo LEAD-3", "LEAD-3"),
    ("Resumen Extractivo Luhn", "Luhn"),
    ("Resumen Extractivo TF-IDF", "TF-IDF"),
    ("Resumen Extractivo TextRank", "TextRank"),
    ("Resumen Extractivo TextRank-TFIDF", "TextRank-TFIDF"),
]


def _cajas_de_salida(etiqueta_resumen, nombre):
    """Build the four output boxes (summary, ROUGE, BERTScore, compression) of one method."""
    return [
        gr.Textbox(label=etiqueta_resumen),
        gr.Textbox(label=f"ROUGE {nombre}"),
        gr.Textbox(label=f"BERTScore {nombre}"),
        gr.Textbox(label=f"Compresión {nombre}"),
    ]


iface = gr.Interface(
    fn=resumen_comparado,
    inputs=gr.Textbox(lines=15, placeholder="Introduce aquí el texto largo a resumir..."),
    outputs=[
        caja
        for etiqueta_resumen, nombre in METODOS_RESUMEN
        for caja in _cajas_de_salida(etiqueta_resumen, nombre)
    ],
    title="Comparador de Resúmenes Abstractivos y Extractivos",
    # The description lists every method shown; it previously named only two
    # of the five extractive approaches.
    description="Introduce un texto en español y compara cómo resumen tres modelos abstractivos y cinco enfoques extractivos (LEAD-3, Luhn, TF-IDF, TextRank y TextRank-TFIDF)."
)

# debug=True keeps the cell running so errors and logs stay visible.
iface.launch(debug=True)



It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7561b2fc1d72a21c2f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://7561b2fc1d72a21c2f.gradio.live


