# Pruebas de resumen

Ejecuta los resumidores definidos en `src/summarization/summarizer.py` contra los artículos de `data_example/` y guarda los textos resultantes dentro de cada archivo JSON para dejarlos listos para la etapa de recomendación.


In [1]:
from pathlib import Path
import json
from pprint import pprint

# Absolute location of the project checkout on the author's machine.
PROJECT_ROOT = Path("/home/ari/Collage/04-Forth_Year/Preimer_Semestre/PL/Final_Proj/Report_Generator")
# Folder holding the example article JSON files.
DATA_DIR = PROJECT_ROOT.joinpath("data_example")

# Gather every article_*.json file and keep them in sorted path order so the
# cells below always walk the articles in the same sequence.
article_paths = list(DATA_DIR.glob("article_*.json"))
article_paths.sort()
print(f"Artículos detectados: {len(article_paths)}")

def load_article(path: Path) -> dict:
    """Read the UTF-8 JSON file at *path* and return its decoded content."""
    with open(path, mode="r", encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)


def persist_article(path: Path, data: dict) -> None:
    """Write *data* to *path* as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Raises:
        TypeError: if *data* contains a value that is not JSON serializable.
            The existing file is left untouched in that case.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error raised afterwards would wipe the stored article.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)


def update_article(path: Path, updates: dict) -> dict:
    """Merge *updates* into the article stored at *path* and save it back.

    The merge is a top-level ``dict.update``: a key present in *updates*
    replaces the stored value for that key entirely.

    Returns:
        The article content after the merge, as written to disk.
    """
    merged = load_article(path)
    merged.update(updates)
    persist_article(path, merged)
    return merged


Artículos detectados: 0


In [3]:
from nltk.tokenize import sent_tokenize
from summarizer import TextRankSummarizer, PersonalizedSummarizer

# Summarizers used by this cell: a TextRankSummarizer created with
# language="spanish", and a PersonalizedSummarizer constructed from it.
base_summarizer = TextRankSummarizer(language="spanish")
personalized_summarizer = PersonalizedSummarizer(base_summarizer)

# File name -> summaries stored in that article file.
summary_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if not text:
        # Articles without text are skipped and left unmodified.
        continue

    extractive = base_summarizer.summarize(text, num_sentences=3)

    # The article's own tags act as the profile categories; spaces in a tag
    # are replaced by underscores before being passed on.
    tag_categories = [tag.replace(" ", "_") for tag in article.get("tags", [])]
    personalized = personalized_summarizer.summarize_for_profile(
        text, user_categories=tag_categories, num_sentences=3
    )

    # Number of sentences NLTK finds in the extractive summary.
    sentence_count = len(sent_tokenize(extractive, language="spanish"))

    updates = {}
    updates["summaries"] = {
        "extractive": extractive,
        "personalized": personalized,
        "sentence_count": sentence_count,
    }

    # Store the summaries inside the article's JSON file.
    update_article(path, updates)
    summary_results[path.name] = updates["summaries"]

# Show the result for article_1.json (prints None if it was not processed).
pprint(summary_results.get("article_1.json"))


None


In [4]:
# Fixed list of categories used in place of a real user profile.
mock_profile = [
    "crisis_humanitaria",
    "derechos_humanos",
    "operacion_militar",
    "ayuda_refugiados",
]

# File name -> the "mock_profile" entry stored in that article file.
mock_profile_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if not text:
        # Articles without text are skipped and left unmodified.
        continue

    mock_summary = personalized_summarizer.summarize_for_profile(
        text, user_categories=mock_profile, num_sentences=2
    )

    # Start from the summaries already stored in the article (if any) so
    # existing entries are kept and only "mock_profile" is added/replaced.
    summaries = article.get("summaries", {})
    summaries.update(
        {"mock_profile": {"categories": mock_profile, "summary": mock_summary}}
    )
    update_article(path, {"summaries": summaries})

    mock_profile_results[path.name] = summaries["mock_profile"]

# Show the result for article_1.json (prints None if it was not processed).
pprint(mock_profile_results.get("article_1.json"))


None


In [5]:
from nltk.tokenize import sent_tokenize
from summarizer import TextRankSummarizer, PersonalizedSummarizer

# Summarizers used by this cell: a TextRankSummarizer created with
# language="spanish", and a PersonalizedSummarizer constructed from it.
base_summarizer = TextRankSummarizer(language="spanish")
personalized_summarizer = PersonalizedSummarizer(base_summarizer)

# Fixed list of categories used in place of a real user profile.
mock_profile = [
    "crisis_humanitaria",
    "derechos_humanos",
    "operacion_militar",
    "ayuda_refugiados"
]

# File name -> summaries stored in that article file.
summary_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if not text:
        # Articles without text are skipped and left unmodified.
        continue

    extractive = base_summarizer.summarize(text, num_sentences=3)

    # The personalized summary is driven by the mock profile, not by the
    # article's tags, so no tag-derived category list is built here.
    personalized = personalized_summarizer.summarize_for_profile(
        text,
        user_categories=mock_profile,
        num_sentences=2
    )

    # Number of sentences NLTK finds in the extractive summary.
    sentence_count = len(sent_tokenize(extractive, language="spanish"))

    updates = {
        "summaries": {
            "extractive": extractive,
            "personalized": personalized,
            "sentence_count": sentence_count
        }
    }

    # NOTE(review): update_article replaces the whole "summaries" value, so
    # keys stored there earlier (e.g. "mock_profile") are dropped - confirm
    # this is intended.
    update_article(path, updates)
    summary_results[path.name] = updates["summaries"]

# Show the result for article_1.json (prints None if it was not processed).
pprint(summary_results.get("article_1.json"))


None
