In [33]:
import time
import json
import hashlib
import re
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [34]:
# Topic listing pages to crawl, one per label.
TOPIC_FALSE = "https://www.bbc.com/mundo/topics/c95y3rnvxkwt"
TOPIC_TRUE = "https://www.bbc.com/mundo/topics/c7zp57yyz25t"

# Output directory (created when this cell runs) and one output file per label.
OUTDIR = Path("corpus_bbc_mundo")
OUTDIR.mkdir(exist_ok=True)
OUT_FALSE_FILE = OUTDIR.joinpath("Falsas_25.txt")
OUT_TRUE_FILE = OUTDIR.joinpath("Verdaderas_25.txt")

# Integer labels stored in each collected record.
LABEL_FALSE, LABEL_TRUE = 0, 1

TARGET_PER_LABEL = 25  # number of articles to collect for each label
RATE_SLEEP = 0.8       # seconds to sleep after each collected article

# Robust defaults (raise them if you need more coverage).
MAX_PAGES = 40         # how many ?page=N listing pages to try
SCROLL_PAUSES = 10     # how many scroll steps per listing page
SCROLL_INTERVAL = 0.6  # seconds between scroll steps
WAIT_TIMEOUT = 12      # seconds for WebDriverWait

# Recognize both /articles/ and /noticias- style links.
LINK_PATTERNS = [
    "/articles/",
    "/noticias-",
]

# Boilerplate handling: the body is cut at the earliest occurrence of any of
# these phrases (case-insensitive substring match).
# NOTE(review): "fin de" is matched as a plain substring, so it presumably also
# hits ordinary prose such as "fin de semana" - confirm this is intended.
TRUNCATE_PHRASES = [
    "haz clic",
    "suscríb",
    "recibe el mejor contenido",
    "también puedes seguirnos",
    "y recuerda",
    "descarga la última versión",
    "más leídas",
    "fin de",
    "newsletter",
]

# Whole lines matching any of these regexes (anchored, case-insensitive) are dropped.
REMOVE_LINE_PATTERNS = [
    r"^Fuente de la imagen[,;:]?.*$",
    r"^Imagen de .*$",
    r"^(También|Y) recuerda.*$",
    r"^(Haz clic|Haz clic para).*$",
    r"^(Suscríbete|Suscribete|Suscríbase).*$",
    r"^Recibe el mejor contenido.*$",
]

In [35]:
def create_driver(headless=True):
    """Start a Chrome WebDriver session configured for the crawl.

    Args:
        headless: when true, Chrome is started with ``--headless=new``.

    Returns:
        The started ``webdriver.Chrome`` instance, with a 60 second page-load
        timeout. The chromedriver binary is resolved through
        ``ChromeDriverManager().install()``.
    """
    chrome_options = Options()

    # Command-line switches, applied in this order.
    switches = []
    if headless:
        switches.append("--headless=new")
    switches += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--lang=es-ES",
        # make the session look more "human"
        "start-maximized",
        "disable-infobars",
    ]
    for switch in switches:
        chrome_options.add_argument(switch)

    # Avoids some basic automation blocking.
    experimental = (
        ("excludeSwitches", ["enable-automation"]),
        ("useAutomationExtension", False),
    )
    for option_name, option_value in experimental:
        chrome_options.add_experimental_option(option_name, option_value)

    chromedriver = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chromedriver, options=chrome_options)
    browser.set_page_load_timeout(60)
    return browser

def short_id(url: str) -> str:
    """Return the first 12 hex characters of the SHA-1 digest of ``url`` (UTF-8 encoded)."""
    hasher = hashlib.sha1()
    hasher.update(url.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:12]

def aggressive_scroll(driver):
    """Scroll the page down in SCROLL_PAUSES incremental steps to trigger lazy loading.

    Step k scrolls to k/SCROLL_PAUSES of the current document height and then
    sleeps SCROLL_INTERVAL seconds. If a step raises WebDriverException, a
    single scroll to the bottom of the page is attempted instead.
    """
    fractional_scroll = "window.scrollTo(0, document.body.scrollHeight * arguments[0]);"
    try:
        for step in range(1, SCROLL_PAUSES + 1):
            driver.execute_script(fractional_scroll, step / SCROLL_PAUSES)
            time.sleep(SCROLL_INTERVAL)
    except WebDriverException:
        # Simple fallback: jump straight to the bottom.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_INTERVAL)

def try_click_show_more(driver):
    """Best-effort click on every 'show more' / 'see more' style control on the page.

    Several XPath queries (Spanish and English texts, buttons and anchors) are
    tried in turn. Each matching element is scrolled into view and clicked via
    JavaScript. Any exception, per query or per element, is ignored.
    """
    # XPath 1.0 idiom for a case-insensitive text comparison.
    lowered_text = "translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')"
    english_labels = "contains(., 'Load more') or contains(., 'See more') or contains(., 'Show more')"
    queries = (
        f"//button[contains({lowered_text}, 'mostrar')]",
        f"//button[contains({lowered_text}, 'ver más')]",
        f"//a[contains({lowered_text}, 'ver más')]",
        f"//button[{english_labels}]",
        f"//a[{english_labels}]",
    )

    def _reveal_and_click(element):
        # Center the element first, then click through JS.
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", element)
        time.sleep(0.15)
        driver.execute_script("arguments[0].click();", element)
        time.sleep(0.4)

    for query in queries:
        try:
            for element in driver.find_elements(By.XPATH, query):
                try:
                    _reveal_and_click(element)
                except Exception:
                    pass  # this element could not be clicked; try the next one
        except Exception:
            pass  # this query failed; try the next one

def _article_url_or_none(href):
    """Return ``href`` without query string/fragment if it matches LINK_PATTERNS, else None."""
    if not isinstance(href, str) or not href:
        return None
    base = re.split(r'[#?]', href)[0]
    if any(pat in base for pat in LINK_PATTERNS):
        return base
    return None


def collect_links_on_page(driver):
    """Collect candidate article URLs from the page currently loaded in ``driver``.

    Anchors are read twice: once through a JavaScript query and once through
    Selenium's ``find_elements``. Each href is stripped of its query string and
    fragment and kept only if it contains one of LINK_PATTERNS.

    Returns:
        A list of unique URLs in first-seen (DOM) order. The order is
        deterministic for a given page, so repeated runs visit the same
        articles first; a plain ``set`` would iterate in an order that varies
        between interpreter runs because of string hash randomization.
    """
    # dict used as an insertion-ordered set (keys only).
    ordered = {}

    # 1) JS collection
    try:
        js = """
        return Array.from(document.querySelectorAll('a[href]')).map(a=>a.href);
        """
        for raw_href in driver.execute_script(js) or []:
            url = _article_url_or_none(raw_href)
            if url is not None:
                ordered.setdefault(url, None)
    except Exception as exc:
        # Best effort: the Selenium pass below may still succeed.
        print(f"  [WARN] fallo recolectando enlaces vía JS: {exc}")

    # 2) Selenium find_elements fallback
    try:
        for anchor in driver.find_elements(By.CSS_SELECTOR, 'a[href]'):
            try:
                raw_href = anchor.get_attribute("href")
            except Exception:
                # The element may have gone away since it was located; skip it.
                continue
            url = _article_url_or_none(raw_href)
            if url is not None:
                ordered.setdefault(url, None)
    except Exception as exc:
        print(f"  [WARN] fallo recolectando enlaces vía find_elements: {exc}")

    return list(ordered)

def extract_article_from_html(html, url):
    """Parse an article page and return its title, description and cleaned body.

    Paragraph text is taken from the first container that exists, in this order:
    ``div[data-component="text-block"]`` elements inside ``<article>``; every
    ``<p>`` inside ``<article>``; every ``<p>`` inside ``<main>`` (or
    ``div[role=main]``); every ``<p>`` in the document longer than 30 characters.

    Returns:
        ``None`` when no paragraph text is found; otherwise a dict with keys
        ``url``, ``title``, ``description`` and ``body``. The body has been
        passed through ``clean_and_truncate_body`` and ``remove_unwanted_lines``.
    """
    soup = BeautifulSoup(html, "lxml")

    def _paragraph_texts(container, longer_than=0):
        # Non-empty <p> texts under `container`, optionally filtered by length.
        texts = (p.get_text(" ", strip=True) for p in container.find_all("p"))
        return [t for t in texts if t and len(t) > longer_than]

    def _is_text_block(tag):
        return tag.name == "div" and tag.get("data-component") == "text-block"

    # Title: text of the first <h1>, if any.
    heading = soup.find("h1")
    title = heading.get_text(strip=True) if heading else ""

    # Description: standard meta description first, Open Graph as fallback.
    desc = ""
    for meta_attrs in ({"name": "description"}, {"property": "og:description"}):
        meta_tag = soup.find("meta", meta_attrs)
        if meta_tag and meta_tag.get("content"):
            desc = meta_tag["content"].strip()
            break

    # Body paragraphs, from the most specific container available.
    article_tag = soup.find("article")
    if article_tag:
        text_blocks = article_tag.find_all(_is_text_block)
        if text_blocks:
            body_parts = [t for block in text_blocks for t in _paragraph_texts(block)]
        else:
            body_parts = _paragraph_texts(article_tag)
    else:
        main_tag = soup.find("main") or soup.find("div", {"role": "main"})
        if main_tag:
            body_parts = _paragraph_texts(main_tag)
        else:
            body_parts = _paragraph_texts(soup, longer_than=30)

    body = "\n\n".join(body_parts).strip()
    if not body:
        return None

    # Strip boilerplate: truncate at marker phrases, then drop unwanted lines.
    body = remove_unwanted_lines(clean_and_truncate_body(body))
    body = re.sub(r'\n{3,}', '\n\n', body).strip()

    return {"url": url, "title": title, "description": desc, "body": body}

def extract_article(driver, url):
    """Load ``url`` in ``driver`` and extract the article with BeautifulSoup.

    Args:
        driver: a Selenium WebDriver (anything exposing ``get``,
            ``execute_script`` and ``page_source``).
        url: address of the article page.

    Returns:
        The dict produced by ``extract_article_from_html``, or ``None`` when
        the page cannot be loaded, its HTML cannot be read, or no body text is
        found. Load/read failures are logged as warnings rather than raised, so
        one bad page does not abort the whole crawl.
    """
    try:
        driver.get(url)
    except Exception as e:
        print(f"[WARN] fallo cargando {url}: {e}")
        return None
    time.sleep(0.5)
    try:
        # Small scroll to nudge lazily rendered content; failure is harmless.
        driver.execute_script("window.scrollTo(0, 350);")
    except Exception:
        pass
    time.sleep(0.5)
    try:
        html = driver.page_source
    except Exception as e:
        # Reading the DOM can fail just like loading; treat it the same way.
        print(f"[WARN] fallo leyendo HTML de {url}: {e}")
        return None
    return extract_article_from_html(html, url)

def clean_and_truncate_body(body_text: str, phrases=None) -> str:
    """Cut ``body_text`` at the earliest case-insensitive occurrence of any marker phrase.

    Args:
        body_text: article text to truncate.
        phrases: iterable of marker phrases; defaults to TRUNCATE_PHRASES.
            Empty phrases are ignored (they would match at position 0 and
            erase the whole text).

    Returns:
        The text before the earliest match, stripped of surrounding
        whitespace; ``body_text`` unchanged when no phrase occurs.
    """
    if phrases is None:
        phrases = TRUNCATE_PHRASES
    escaped = [re.escape(phrase) for phrase in phrases if phrase]
    if not escaped:
        return body_text
    # Search the ORIGINAL text so the match offset is valid for slicing it.
    # Looking the phrase up in body_text.lower() is unsafe because lower() can
    # change the string length (e.g. "İ" becomes two code points).
    # A leftmost regex match over the alternation is the earliest phrase start.
    match = re.search("|".join(escaped), body_text, flags=re.IGNORECASE)
    if match is None:
        return body_text
    return body_text[:match.start()].strip()

def remove_unwanted_lines(text: str) -> str:
    """Drop blank lines and lines matching any regex in REMOVE_LINE_PATTERNS.

    Each line is stripped before matching (case-insensitive, anchored at the
    start of the line). Surviving lines are returned stripped and joined with
    single newlines.
    """
    def _is_boilerplate(candidate: str) -> bool:
        return any(
            re.match(regex, candidate, flags=re.IGNORECASE)
            for regex in REMOVE_LINE_PATTERNS
        )

    stripped_lines = (raw.strip() for raw in text.splitlines())
    kept = [line for line in stripped_lines if line and not _is_boilerplate(line)]
    return "\n".join(kept)

In [36]:
def _harvest_links(driver, links, seen, collected, label_value, target_count):
    """Visit each not-yet-seen link and append extracted articles to ``collected``.

    ``seen`` and ``collected`` are mutated in place. Stops as soon as
    ``collected`` holds ``target_count`` records. Note that visiting a link
    navigates ``driver`` away from whatever page it was on.
    """
    for link in links:
        if len(collected) >= target_count:
            break
        if link in seen:
            continue
        seen.add(link)
        art = extract_article(driver, link)
        if not art:
            print(f"    [SKIP] sin cuerpo extraíble: {link}")
            continue
        collected.append({
            "id": short_id(link), "label": label_value, "title": art["title"],
            "description": art["description"], "body": art["body"], "url": link
        })
        print(f"    [ADD] {len(collected)}/{target_count}: {art['title'][:90]}")
        # Be polite to the server between article fetches.
        time.sleep(RATE_SLEEP)


def gather_label(driver, base_topic_url, label_value, target_count):
    """Collect up to ``target_count`` articles from a topic listing.

    Two strategies are tried in order:
      1. Load ``{base_topic_url}?page=N`` for N in 1..MAX_PAGES.
      2. If that was not enough, open the topic landing page and repeatedly
         click a pagination / "Siguiente" link.

    Args:
        driver: Selenium WebDriver used for all navigation.
        base_topic_url: topic listing URL without a query string.
        label_value: value stored under ``"label"`` in every record.
        target_count: maximum number of records to return.

    Returns:
        A list of at most ``target_count`` dicts with keys ``id``, ``label``,
        ``title``, ``description``, ``body`` and ``url``.
    """
    collected = []
    seen = set()

    # 1) try ?page=N (many pages possible)
    for p in range(1, MAX_PAGES + 1):
        if len(collected) >= target_count:
            break
        page_url = f"{base_topic_url}?page={p}"
        print(f"[INFO] cargando {page_url}")
        try:
            driver.get(page_url)
        except Exception as e:
            print(f"[WARN] fallo página {page_url}: {e}")
            continue

        # aggressive scroll and click on 'show more' buttons
        aggressive_scroll(driver)
        try_click_show_more(driver)

        # wait for rendered anchors (at least one)
        try:
            WebDriverWait(driver, WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a[href]"))
            )
        except TimeoutException:
            print("  [WARN] timeout esperando anchors en la página")

        links = collect_links_on_page(driver)
        print(f"  [FOUND] {len(links)} links candidatos")
        _harvest_links(driver, links, seen, collected, label_value, target_count)

    # 2) if still short, start from the landing page and click through the
    #    pagination ('siguiente') links
    if len(collected) < target_count:
        print("[INFO] intentando paginación por UI (botón 'Siguiente') como fallback")
        next_link_xpath = (
            "//a[contains(@href,'?page=') or contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'siguiente') "
            "or contains(., 'Next') or contains(., 'next')]"
        )
        try:
            driver.get(base_topic_url)
            aggressive_scroll(driver)
            try_click_show_more(driver)
            for _ in range(1, MAX_PAGES + 1):
                # find a link that looks like 'Siguiente' or has 'page=' in its href
                cand = driver.find_elements(By.XPATH, next_link_xpath)
                if not cand:
                    break
                el = cand[-1]
                try:
                    driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
                    time.sleep(0.2)
                    driver.execute_script("arguments[0].click();", el)
                except Exception:
                    break
                time.sleep(0.8)
                aggressive_scroll(driver)
                try_click_show_more(driver)
                # Remember where the listing is: harvesting navigates the
                # driver to article pages, and the next 'Siguiente' lookup must
                # run against the listing, not the last article visited.
                listing_url = driver.current_url
                links = collect_links_on_page(driver)
                print(f"  [FOUND after click] {len(links)} links")
                _harvest_links(driver, links, seen, collected, label_value, target_count)
                if len(collected) >= target_count:
                    break
                if driver.current_url != listing_url:
                    driver.get(listing_url)
                    aggressive_scroll(driver)
                    try_click_show_more(driver)
        except Exception as e:
            print("[WARN] paginación UI fallback falló:", e)

    return collected[:target_count]

def save_as_json_txt(path: Path, objects):
    """Write ``objects`` to ``path`` as UTF-8 JSON, indented by 2, keeping non-ASCII characters."""
    encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    with open(path, mode="w", encoding="utf-8") as handle:
        # Stream the encoder's chunks to the file one at a time.
        for chunk in encoder.iterencode(objects):
            handle.write(chunk)

In [None]:
def main():
    """Run the full crawl: collect both labels, report the counts and save one file per label.

    The browser is always closed, even when collection or saving fails.
    """
    # Runs with a visible browser window (headless=False), which helps debugging.
    browser = create_driver(headless=False)
    try:
        print("[RUN] Recolectando FALSAS...")
        fake_items = gather_label(browser, TOPIC_FALSE, LABEL_FALSE, TARGET_PER_LABEL)
        print("[RUN] Recolectando VERDADERAS...")
        true_items = gather_label(browser, TOPIC_TRUE, LABEL_TRUE, TARGET_PER_LABEL)

        n_fake, n_true = len(fake_items), len(true_items)
        print(f"[RESULT] Falsas reunidas: {n_fake}; Verdaderas reunidas: {n_true}")
        if min(n_fake, n_true) < TARGET_PER_LABEL:
            print("[WARN] No se alcanzó la cantidad objetivo. Ajusta MAX_PAGES/SCROLL_PAUSES o ejecuta con headless=False para investigar.")

        outputs = ((OUT_FALSE_FILE, fake_items), (OUT_TRUE_FILE, true_items))
        for destination, items in outputs:
            save_as_json_txt(destination, items)
        print(f"[OK] Guardados: {OUT_FALSE_FILE}  y  {OUT_TRUE_FILE}")
    finally:
        browser.quit()

# Entry point: run the crawl when this code executes as the main module.
if __name__ == "__main__":
    main()
# test for upload (NOTE(review): looks like a leftover marker from a trial commit - confirm it can be removed)

[RUN] Recolectando FALSAS...
[INFO] cargando https://www.bbc.com/mundo/topics/c95y3rnvxkwt?page=1
  [FOUND] 24 links candidatos
    [ADD] 1/25: Cómo el emoji de la zanahoria se convirtió en un código secreto en internet para camuflar 
    [ADD] 2/25: Cómo un viaje a Noruega me hizo ver que estaba equivocado y que la Tierra no es plana
    [ADD] 3/25: Quiénes son los "Ciudadanos del Reich", el grupo asociado con los golpistas arrestados en 
    [ADD] 4/25: "Mi foto fue usada para diseminar mentiras sobre la guerra": la verdadera historia de una 
    [ADD] 5/25: La matanza que los soviéticos atribuyeron con éxito a los nazis durante 50 años
    [ADD] 6/25: Las afirmaciones falsas y engañosas amplificadas por Elon Musk en Twitter
    [ADD] 7/25: Rusia invade Ucrania: cómo saber si lo que estás viendo sobre el conflicto es real o son "
    [ADD] 8/25: Donald Trump: cómo detectar imágenes creadas por inteligencia artificial como las fotos fa
    [ADD] 9/25: La princesa Kate pide disculpas p