In [None]:
import argparse
import hashlib
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Tuple

import pandas as pd
import sqlalchemy as sa
from newspaper import Article
from sentence_transformers import SentenceTransformer
from sqlalchemy import orm
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import declarative_base
from pgvector.sqlalchemy import Vector
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from bs4 import BeautifulSoup
import requests, re
import torch
from transformers import logging as hf_logging
import feedparser
hf_logging.set_verbosity_error()

# Created at module level (outside the functions) so the model is loaded only once.
_SUMMARIZER = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",   # alternatives: t5-small / pegasus
    device=0 if torch.cuda.is_available() else -1,  # first GPU when available, otherwise CPU
)
# Tokenizer of the same checkpoint; used to count tokens and to split very long articles.
_TOKENIZER = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Presumably the maximum number of input tokens the summarizer accepts — TODO confirm.
MAX_TOKENS = 1024 

# Sentence-embedding model; per the inline note it produces 768-dimensional vectors.
EMB_MODEL = "sentence-transformers/all-mpnet-base-v2"  # 768 d
embedder = SentenceTransformer(EMB_MODEL)

# Embedding dimensionality; must stay in sync with EMB_MODEL.
EMB_DIM = 768

In [None]:
# RSS feeds to ingest, as (source_id, feed_url) pairs.
# The source_id is attached to every item returned by fetch_rss_items() and is
# stored by ingest_news() under article_meta["source"].
# NOTE(review): "marca_seria_a" looks like a typo for "serie_a"; because the id is
# persisted, renaming it would presumably require migrating stored rows — confirm.
FEEDS: List[Tuple[str, str]] = [
    ("as_la_liga", "https://feeds.as.com/mrss-s/pages/as/site/as.com/section/futbol/subsection/primera/"),
    ("as_la_liga_hypermotion", "https://feeds.as.com/mrss-s/pages/as/site/as.com/section/futbol/subsection/segunda/"),
    ("as_champions_league", "https://feeds.as.com/mrss-s/pages/as/site/as.com/section/futbol/subsection/champions/"),
    ("marca_primera_division", "https://e00-marca.uecdn.es/rss/futbol/primera-division.xml"),
    ("marca_segunda_division", "https://e00-marca.uecdn.es/rss/futbol/segunda-division.xml"),
    ("marca_champions_league", "https://e00-marca.uecdn.es/rss/futbol/champions-league.xml"),
    ("marca_premier_league", "https://e00-marca.uecdn.es/rss/futbol/premier-league.xml"),
    ("marca_bundesliga", "https://e00-marca.uecdn.es/rss/futbol/bundesliga.xml"),
    ("marca_seria_a", "https://e00-marca.uecdn.es/rss/futbol/liga-italiana.xml"),
    ("marca_ligue_1", "https://e00-marca.uecdn.es/rss/futbol/liga-francesa.xml"),
    ("marca_america", "https://e00-marca.uecdn.es/rss/futbol/america.xml"),
    ("transfermarkt_es","https://www.transfermarkt.es/rss/news"),
    ("transfermarkt_uk","https://www.transfermarkt.co.uk/rss/news"),
    ("transfermarkt_it","https://www.transfermarkt.it/rss/news"),
    ("transfermarkt_de","https://www.transfermarkt.de/rss/news"),
    ("transfermarkt_pt","https://www.transfermarkt.pt/rss/news"),
]

In [None]:

def _entry_timestamp(entry, fallback: datetime) -> datetime:
    """Return the entry's publication time as an aware UTC datetime, or *fallback*.

    ``published_parsed`` is preferred over ``updated_parsed``. A missing, empty or
    invalid time tuple (for example a seconds field that ``datetime`` rejects) is
    skipped instead of raising.
    """
    for attr in ("published_parsed", "updated_parsed"):
        time_tuple = getattr(entry, attr, None)
        if not time_tuple:
            continue
        try:
            return datetime(*time_tuple[:6], tzinfo=timezone.utc)
        except (TypeError, ValueError):
            continue
    return fallback


def fetch_rss_items() -> List[dict]:
    """Return list of dicts with keys: source, title, url, published_at (UTC).

    Every feed in ``FEEDS`` is parsed with feedparser. A feed that cannot be
    fetched or parsed is reported with a ``[feed-error]`` line and skipped.
    Entries without a link are skipped because the URL is what downstream code
    uses to download and identify the article; a missing title becomes ``""``.
    Entries without a usable date get the time at which this function started.
    """
    items: List[dict] = []
    now = datetime.now(tz=timezone.utc)

    for source_id, feed_url in FEEDS:
        try:
            parsed = feedparser.parse(feed_url)
        except Exception as exc:
            print(f"[feed-error] {source_id}: {exc}")
            continue

        # feedparser normally signals download/parse problems through the `bozo`
        # flag instead of raising, so a broken feed would otherwise go unnoticed.
        if getattr(parsed, "bozo", False) and not parsed.entries:
            print(f"[feed-error] {source_id}: {getattr(parsed, 'bozo_exception', 'unparseable feed')}")
            continue

        for entry in parsed.entries:
            url = getattr(entry, "link", None)
            if not url:
                # One malformed entry must not abort the whole fetch.
                continue

            items.append(
                {
                    "source": source_id,
                    "title": getattr(entry, "title", "") or "",
                    "url": url,
                    "published_at": _entry_timestamp(entry, now),
                }
            )
    return items


# ----------------- Article parsing & embeddings ------------------

def safe_summarize(text: str) -> str:
    """Summarize *text*, adapting the summary length to the input size.

    The target length is 80 % of the input token count, clamped to [20, 128]
    tokens; the minimum length is 25 % of that target and never below 10.
    Inputs longer than the model limit are truncated by the tokenizer/pipeline
    instead of making the model fail.

    Args:
        text: Text to summarize.

    Returns:
        The generated summary. Blank input yields ``""``. If tokenization or
        generation raises, the error is logged and the first 400 characters of
        *text* are returned, followed by "…" only when text was actually cut.
    """
    if not text or not text.strip():
        return ""

    try:
        # Token count of this input (truncated to the model limit; the target
        # length is capped at 128 anyway, so truncation cannot change it).
        n_tokens = len(_TOKENIZER(text, truncation=True).input_ids)

        # Aim for something shorter than the original but above min_length.
        max_len = min(max(20, int(n_tokens * 0.8)), 128)
        min_len = max(10, int(max_len * 0.25))

        return _SUMMARIZER(
            text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,  # over-long inputs are cut instead of raising
        )[0]["summary_text"]

    except Exception as exc:
        # Best-effort fallback, but no longer silent.
        print(f"[summarize-error] falling back to truncated text: {exc}")
        if len(text) <= 400:
            return text
        return text[:400] + "…"

def parse_article(url: str) -> tuple[str, str] | None:
    """Download an article and return ``(full_text, summary)``.

    The page text is extracted with BeautifulSoup (script/style content removed),
    whitespace-normalized, split into token chunks that fit the summarizer, and
    summarized hierarchically: each chunk is summarized, then the chunk
    summaries are summarized together. A single-chunk article is summarized once.

    Args:
        url: Address of the article page.

    Returns:
        ``(text, summary)``, or ``None`` when the request fails, the server
        answers with an HTTP error status, or the page has fewer than 20 words.
    """
    try:
        response = requests.get(url, timeout=10)
        # Error pages (404/500 …) must not be stored as articles; HTTPError is
        # a RequestException, so it is handled by the same clause.
        response.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(response.text, "lxml")
    # get_text() would otherwise include JavaScript and CSS source code.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = re.sub(r"\s+", " ", soup.get_text(" ", strip=True))

    if len(text.split()) < 20:
        return None

    # Chunk on content tokens only and leave room for the two special tokens
    # (<s> … </s>) that are added again when each chunk is re-tokenized;
    # otherwise every full chunk would exceed the model limit.
    chunk_size = MAX_TOKENS - 2
    token_ids = _TOKENIZER(text, add_special_tokens=False).input_ids
    chunks = [
        _TOKENIZER.decode(token_ids[start:start + chunk_size], skip_special_tokens=True)
        for start in range(0, len(token_ids), chunk_size)
    ]

    # Hierarchical summary.
    summaries = [safe_summarize(chunk) for chunk in chunks]
    if len(summaries) == 1:
        # Re-summarizing a lone summary only costs time and loses detail.
        return text, summaries[0]
    return text, safe_summarize(" ".join(summaries))


def embed_texts(texts: list[str]) -> list[list[float]]:
    """Encode the non-empty entries of *texts* with the module-level ``embedder``.

    Falsy entries (``None``, ``""``) are dropped before encoding, so the result
    can be shorter than *texts*; callers that pair the output with other lists
    must pass only non-empty strings. Returns ``[]`` when nothing is left.
    """
    non_empty = list(filter(None, texts))
    if len(non_empty) == 0:
        return []

    vectors = embedder.encode(
        non_empty,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    return vectors.tolist()


def ingest_news(engine: sa.Engine):
    """Fetch the RSS feeds and store every not-yet-known article.

    Steps: fetch and sort the RSS items (newest first), drop URLs that are
    already stored or repeated across feeds, download + summarize the remaining
    articles, embed their full texts, and insert one ``FootballNews`` row each.

    The URL check runs *before* downloading and summarizing so that repeated
    runs do not redo the expensive model work for articles already stored.
    Despite the wording of the final log line, rows are only inserted; existing
    rows are never updated.

    Args:
        engine: SQLAlchemy engine connected to the target database.
    """
    items = sorted(fetch_rss_items(), key=lambda x: x["published_at"], reverse=True)
    print(f"Fetched {len(items)} RSS items → processing …")

    # ── skip known URLs up front ──────────────────────────────────────────
    candidate_urls = list({meta["url"] for meta in items})
    known_urls: set = set()
    if candidate_urls:
        with orm.Session(engine) as session:
            known_urls = {
                row[0]
                for row in session.query(FootballNews.url).filter(
                    FootballNews.url.in_(candidate_urls)
                )
            }

    pending: list[dict] = []
    seen_urls = set(known_urls)
    for meta in items:
        if meta["url"] in seen_urls:
            continue  # already stored, or listed by more than one feed
        seen_urls.add(meta["url"])
        pending.append(meta)

    if items and not pending:
        print("✅ News upserted: 0")
        return

    texts:      list[str]  = []   # full article text
    summaries:  list[str]  = []   # summary
    metas:      list[dict] = []   # metadata: URL, title, date …

    for meta in pending:
        try:
            print(f"Parsing article: {meta['url']}")
            parsed = parse_article(meta["url"])

            # ── discard articles that yield nothing ───────────────────────
            if parsed is None:
                continue

            text, summary = parsed
            texts.append(text)
            summaries.append(summary)
            metas.append(meta)

        except Exception as exc:
            print(f"[article-error] {meta['url']}: {exc}")

    if not summaries:
        print("No articles parsed, skipping embeddings.")
        return

    # Embeddings are computed from the full article texts (not the summaries).
    embeddings = embed_texts(texts)

    with orm.Session(engine) as session:
        inserted = 0
        # strict=True: a length mismatch would pair rows with the wrong
        # embedding, so fail loudly instead of silently truncating.
        for text, summary, emb, meta in zip(texts, summaries, embeddings, metas, strict=True):
            # Cheap re-check in case the row was added since the first query.
            if session.query(FootballNews).filter_by(url=meta["url"]).first():
                continue  # duplicate

            session.add(
                FootballNews(
                    url         = meta["url"],
                    title       = meta["title"],
                    published_at= meta["published_at"],
                    article_text= text,
                    summary     = summary,
                    embedding   = list(map(float, emb)),
                    article_meta= {"source": meta["source"]},
                )
            )
            inserted += 1
        session.commit()

    print(f"✅ News upserted: {inserted}")

In [None]:
# Fetch the entries of every configured feed for interactive inspection.
items =fetch_rss_items()

In [None]:
# Number of RSS entries collected across all feeds.
len(items)

In [None]:
# Keep only the most recently published entry so the following cells work on a
# single article. `items` is rebound here, so the count printed below is the
# size after slicing, not the number of entries originally fetched.
items = sorted(items, key=lambda x: x["published_at"], reverse=True)[:1]
print(f"Fetched {len(items)} RSS items → processing …")

In [None]:
# Accumulators filled by the parsing loop: one summary and one metadata dict per
# successfully parsed article. Re-run this cell before re-running the loop,
# otherwise the loop appends to the existing contents.
summaries: List[str] = []
metas: List[dict] = []

In [None]:
# Parse every selected item; each success adds one entry to `summaries` and `metas`.
for meta in items:
    try:
        print(f"Parsing article: {meta['url']}")
        parsed = parse_article(meta["url"])
        if parsed is not None:
            # parse_article yields (full_text, summary)
            text, summary = parsed
            summaries.append(summary)
            metas.append(meta)
    except Exception as exc:
        print(f"[article-error] {meta['url']}: {exc}")

In [None]:
# Show the most recent summary. Reads the accumulator instead of the loop
# variable `summary`, which is undefined when no article could be parsed.
summaries[-1] if summaries else None

In [None]:
# Metadata dicts (source, title, url, published_at) of the parsed articles.
metas