## Imports

In [1]:
import re
import json
import time
import hashlib
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

## Setup

In [2]:
OUT_DIR = Path("data")
CACHE_DIR = OUT_DIR / "html_cache"
OUT_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; student-research; +https://example.com)",
    "Accept-Language": "ro-RO,ro;q=0.9,en-US;q=0.8,en;q=0.7",
}

REQUEST_TIMEOUT = 30
SLEEP_SECONDS = 1.0
MAX_RETRIES = 4

SEED_LISTING_URLS = [
    "https://www.factual.ro/declaratii/",
    "https://www.factual.ro/toate-declaratiile/",
    "https://www.factual.ro/dezinformari-retele-sociale/",
    "https://www.factual.ro/dezinformari-rs/",
]

## Label Normalization

In [3]:
CANONICAL_LABELS = [
    "ADEVĂRAT",
    "PARȚIAL ADEVĂRAT",
    "TRUNCHIAT",
    "S-A RĂZGÂNDIT",
    "IMPOSIBIL DE VERIFICAT",
    "FALS",
]

LIKERT_MAP = {
    "FALS": 0,
    "IMPOSIBIL DE VERIFICAT": 1,
    "S-A RĂZGÂNDIT": 2,
    "TRUNCHIAT": 3,
    "PARȚIAL ADEVĂRAT": 4,
    "ADEVĂRAT": 5,
}

def normalize_label(label: str) -> Optional[str]:
    if not label:
        return None
    s = label.strip().upper()
    s = s.replace("S-A RAZGANDIT", "S-A RĂZGÂNDIT").replace("S-A RĂZGANDIT", "S-A RĂZGÂNDIT")
    s = re.sub(r"\s+", " ", s)
    if s in CANONICAL_LABELS:
        return s
    s_ascii = (s.replace("Ă","A").replace("Â","A").replace("Î","I").replace("Ș","S").replace("Ş","S").replace("Ț","T").replace("Ţ","T"))
    for c in CANONICAL_LABELS:
        c_ascii = (c.replace("Ă","A").replace("Â","A").replace("Î","I").replace("Ș","S").replace("Ş","S").replace("Ț","T").replace("Ţ","T"))
        if s_ascii == c_ascii:
            return c
    return None

def split_title_into_label_and_claim(title: str) -> Tuple[Optional[str], str]:
    if not title:
        return None, ""
    t = " ".join(title.split())
    if "|" in t:
        left, right = [x.strip() for x in t.split("|", 1)]
        return normalize_label(left), right
    return None, t

## Utils

In [4]:
def sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def cache_path_for_url(url: str) -> Path:
    return CACHE_DIR / f"{sha1(url)}.html"

def fetch_html(url: str, force: bool = False) -> str:
    cpath = cache_path_for_url(url)
    if cpath.exists() and not force:
        return cpath.read_text(encoding="utf-8", errors="ignore")

    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            html = resp.text
            cpath.write_text(html, encoding="utf-8")
            time.sleep(SLEEP_SECONDS)
            return html
        except Exception as e:
            last_err = e
            time.sleep(min(10, 2 ** (attempt - 1)))
    raise RuntimeError(f"Failed to fetch {url} after {MAX_RETRIES} retries. Last error: {last_err}")

In [13]:
DATE_VERIFY_RE = re.compile(r"Data verificării\s*:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})", flags=re.IGNORECASE)
DATE_FALLBACK_RE = re.compile(r"\b([0-9]{2}\.[0-9]{2}\.[0-9]{4})\b")

FOOTER_MARKERS = [
    "ALĂTURĂ-TE LISTEI NOASTRE",
    "TOATE VERIFICĂRILE",
    "TOȚI POLITICIENII",
    "Acest website folosește",
    "Privacy & Cookies Policy",
]

def extract_date_verified(text: str) -> Optional[str]:
    if not text:
        return None
    m = DATE_VERIFY_RE.search(text) or DATE_FALLBACK_RE.search(text)
    if not m:
        return None
    dd, mm, yyyy = m.group(1).split(".")
    return f"{int(yyyy):04d}-{int(mm):02d}-{int(dd):02d}"

def guess_article_type(url: str) -> str:
    u = url.lower()
    if "/declaratii/" in u:
        return "declaratie"
    if "/dezinformari" in u:
        return "dezinformare"
    return "unknown"

def find_main_container(soup: BeautifulSoup):
    article = soup.find("article")
    if article:
        return article
    entry = soup.find(class_=re.compile(r"(entry-content|post-content|content-area|site-content)"))
    if entry:
        return entry
    main = soup.find("main")
    if main:
        return main
    return soup.body or soup

def clean_text(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def extract_text_until_footer(container) -> str:
    for tag in container.find_all(["script", "style", "noscript"]):
        tag.decompose()

    chunks = []
    for el in container.find_all(["h1", "h2", "h3", "p", "li", "blockquote"]):
        t = clean_text(el.get_text(" ", strip=True))
        if not t:
            continue
        if any(marker in t for marker in FOOTER_MARKERS):
            break
        chunks.append(t)
    return "\n".join(chunks).strip()

def extract_outbound_links(container) -> List[str]:
    links = []
    for a in container.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("http") and "factual.ro" not in href:
            links.append(href)
    seen, out = set(), []
    for x in links:
        if x not in seen:
            seen.add(x); out.append(x)
    return out

def extract_speaker(container) -> Tuple[Optional[str], Optional[str]]:
    for a in container.find_all("a", href=True):
        href = a["href"]
        if re.search(r"factual\.ro/persoane/[^/]+/?$", href):
            name = clean_text(a.get_text(" ", strip=True))
            return (name or None), href
    return None, None

def extract_conclusion(container) -> Optional[str]:
    concl = None
    for h in container.find_all(["h2", "h3"]):
        if clean_text(h.get_text(" ", strip=True)).lower() == "concluzie":
            concl = h
            break
    if not concl:
        return None

    parts = []
    for sib in concl.find_all_next():
        if sib == concl:
            continue
        if sib.name in ("h2", "h3"):
            break
        if sib.name in ("p", "li", "blockquote"):
            t = clean_text(sib.get_text(" ", strip=True))
            if t and not any(marker in t for marker in FOOTER_MARKERS):
                parts.append(t)
    out = "\n".join(parts).strip()
    return out or None

def parse_factual_article(html: str, url: str) -> Dict:
    soup = BeautifulSoup(html, "lxml")
    h1 = soup.find("h1")
    title = clean_text(h1.get_text(" ", strip=True)) if h1 else None
    label, claim_from_title = split_title_into_label_and_claim(title or "")

    container = find_main_container(soup)
    full_text = extract_text_until_footer(container)

    rec = {
        "id": sha1(url),
        "url": url,
        "source": "factual.ro",
        "type": guess_article_type(url),
        "title": title,
        "claim": claim_from_title if claim_from_title else title,
        "label": label,
        "label_likert": LIKERT_MAP.get(label) if label in LIKERT_MAP else None,
        "date_verified": extract_date_verified(full_text),
        "speaker": None,
        "speaker_url": None,
        "text": full_text,
        "conclusion": extract_conclusion(container),
        "outbound_links": extract_outbound_links(container),
    }

    if rec["type"] == "declaratie":
        rec["speaker"], rec["speaker_url"] = extract_speaker(container)

    return rec

## Sanity check

In [14]:
EXAMPLE_URLS = [
    "https://www.factual.ro/declaratii/partial-adevarat-nicusor-dan-sustine-ca-ar-fi-obtinut-pentru-campania-electorala-585-milioane-de-lei-din-imprumuturi-si-25-milioane-de-lei-din-donatii/",
    "https://www.factual.ro/dezinformari-rs/fals-onu-ar-spune-ca-ucraina-nu-exista-in-mod-legal/",
]

records = []
for url in EXAMPLE_URLS:
    html = fetch_html(url)
    records.append(parse_factual_article(html, url))

df_example = pd.DataFrame(records)
df_example[["url","type","label","date_verified","speaker","claim"]]

Unnamed: 0,url,type,label,date_verified,speaker,claim
0,https://www.factual.ro/declaratii/partial-adev...,declaratie,PARȚIAL ADEVĂRAT,,Nicușor Dan,Nicușor Dan susține că ar fi obținut pentru ca...
1,https://www.factual.ro/dezinformari-rs/fals-on...,dezinformare,FALS,1991-12-25,,ONU ar spune că Ucraina nu există în mod legal


## Discover urls

In [15]:
ARTICLE_URL_RE = re.compile(
    r"^https?://(www\.)?factual\.ro/(declaratii|dezinformari[^/]*|dezinformari-rs|dezinformari-retele-sociale)/",
    re.IGNORECASE,
)

def extract_article_links_from_listing(html: str) -> List[str]:
    soup = BeautifulSoup(html, "lxml")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("/"):
            href = "https://www.factual.ro" + href
        if ARTICLE_URL_RE.match(href):
            links.append(href.split("#")[0])
    return list(dict.fromkeys(links))

def discover_urls(base_listing_url: str, max_pages: int = 50, try_page_pagination: bool = True) -> List[str]:
    found, seen_listing = [], set()

    def process_listing(url: str):
        if url in seen_listing:
            return
        seen_listing.add(url)
        html = fetch_html(url)
        found.extend(extract_article_links_from_listing(html))

    process_listing(base_listing_url)

    if try_page_pagination:
        base = base_listing_url.rstrip("/") + "/"
        for i in range(2, max_pages + 1):
            page_url = f"{base}page/{i}/"
            try:
                process_listing(page_url)
            except Exception:
                break

    return list(dict.fromkeys(found))

discovered = []
for seed in SEED_LISTING_URLS:
    try:
        discovered.extend(discover_urls(seed, max_pages=10, try_page_pagination=True))
    except Exception as e:
        print(f"[WARN] seed failed: {seed} -> {e}")

discovered = list(dict.fromkeys(discovered))
len(discovered), discovered[:10]

[WARN] seed failed: https://www.factual.ro/dezinformari-rs/ -> Failed to fetch https://www.factual.ro/dezinformari-rs/ after 4 retries. Last error: 404 Client Error: Not Found for url: https://www.factual.ro/dezinformari-rs/


(180,
 ['https://www.factual.ro/dezinformari-retele-sociale/',
  'https://www.factual.ro/dezinformari-rs/fals-cometa-atlas-ar-fi-o-nava-extraterestra/',
  'https://www.factual.ro/dezinformari-rs/fals-onu-ar-spune-ca-ucraina-nu-exista-in-mod-legal/',
  'https://www.factual.ro/dezinformari-rs/fals-uniunea-europeana-ne-ar-interzice-sa-folosim-apa-din-fantanile-proprii/',
  'https://www.factual.ro/dezinformari-rs/context-lipsa-nicusor-dan-ar-fi-recunoscut-ca-o-elita-conducatoare-a-anulat-alegerile-prezidentiale-din-2024/',
  'https://www.factual.ro/dezinformari-rs/fals-la-intalnirea-dintre-nicusor-dan-si-emmanuel-macron-s-ar-fi-servit-apa-imbuteliata-de-la-vidraru/',
  'https://www.factual.ro/dezinformari-rs/fals-o-femeie-ar-fi-primit-gratuit-un-avion-privat-la-aeroportul-din-cluj-napoca/',
  'https://www.factual.ro/dezinformari-rs/fals-diana-buzoianu-ar-fi-demisionat-de-la-ministerul-mediului/',
  'https://www.factual.ro/dezinformari-rs/informatii-false-despre-motivele-pentru-care-soldati

## Build and save dataset

In [17]:
def build_dataset(urls: List[str]) -> pd.DataFrame:
    rows = []
    for url in tqdm(urls, desc="Scraping articles"):
        try:
            html = fetch_html(url)
            rec = parse_factual_article(html, url)
            rows.append(rec)
        except Exception as e:
            rows.append({"id": sha1(url), "url": url, "error": str(e)})
    return pd.DataFrame(rows)

URLS_TO_SCRAPE = discovered
df = build_dataset(URLS_TO_SCRAPE)
df.head(3)

Scraping articles:   0%|          | 0/180 [00:00<?, ?it/s]

Unnamed: 0,id,url,source,type,title,claim,label,label_likert,date_verified,speaker,speaker_url,text,conclusion,outbound_links
0,0c733673e6e9d65e34e46f519d57ad08ec06870d,https://www.factual.ro/dezinformari-retele-soc...,factual.ro,dezinformare,,,,,,,,FALS | Cometa Atlas ar fi o navă extraterestră,,[]
1,6aa458d6b266fe9982cc841573653da4a9ebc352,https://www.factual.ro/dezinformari-rs/fals-co...,factual.ro,dezinformare,FALS | Cometa Atlas ar fi o navă extraterestră,Cometa Atlas ar fi o navă extraterestră,FALS,0.0,,,,Un utilizator de pe Facebook a distribuit o po...,Afirmația este falsă.,[https://www.facebook.com/permalink.php?story_...
2,ea359aa0cf50aeff8d56223b3802437c6003df60,https://www.factual.ro/dezinformari-rs/fals-on...,factual.ro,dezinformare,FALS | ONU ar spune că Ucraina nu există în mo...,ONU ar spune că Ucraina nu există în mod legal,FALS,0.0,1991-12-25,,,Un utilizator Facebook a postat un videoclip c...,Afirmațiile prezentate în articolul distribuit...,[https://www.infobrasov.net/mare-surpiza-dinsp...


In [18]:
raw_path = OUT_DIR / "factual_ro_raw.jsonl"
with raw_path.open("w", encoding="utf-8") as f:
    for rec in df.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

df_out = df.copy()
if "outbound_links" in df_out.columns:
    df_out["outbound_links"] = df_out["outbound_links"].apply(
        lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else x
    )

csv_path = OUT_DIR / "factual_ro_dataset.csv"
df_out.to_csv(csv_path, index=False, encoding="utf-8")

raw_path, csv_path

(WindowsPath('data/factual_ro_raw.jsonl'),
 WindowsPath('data/factual_ro_dataset.csv'))

## Post processing

In [19]:
def label_group(label: Optional[str]) -> Optional[str]:
    if label in ("ADEVĂRAT", "PARȚIAL ADEVĂRAT"):
        return "TRUE"
    if label in ("FALS", "TRUNCHIAT"):
        return "FALSE"
    if label in ("IMPOSIBIL DE VERIFICAT", "S-A RĂZGÂNDIT"):
        return "OTHER"
    return None

def md5(s: str) -> str:
    return hashlib.md5((s or "").encode("utf-8")).hexdigest()

df_pp = df.copy()
df_pp["label_group"] = df_pp["label"].apply(label_group) if "label" in df_pp.columns else None
df_pp["text_len"] = df_pp["text"].fillna("").apply(len) if "text" in df_pp.columns else 0
df_pp["text_md5"] = df_pp["text"].fillna("").apply(md5) if "text" in df_pp.columns else None

df_pp_ok = df_pp[df_pp["error"].isna()] if "error" in df_pp.columns else df_pp.copy()
df_pp_ok = df_pp_ok[df_pp_ok["text_len"] >= 300].copy()
df_pp_ok = df_pp_ok.drop_duplicates(subset=["url"])
df_pp_ok = df_pp_ok.drop_duplicates(subset=["text_md5"])

df_pp_ok[["type","label","label_group","text_len"]].value_counts().head(20)

type          label     label_group  text_len
dezinformare  FALS      FALSE        2392        2
                                     3269        2
declaratie    ADEVĂRAT  TRUE         1234        1
dezinformare  FALS      FALSE        2054        1
                                     1736        1
                                     1750        1
                                     1779        1
                                     1893        1
                                     2003        1
                                     2016        1
                                     2123        1
                                     2417        1
                                     2133        1
                                     2159        1
                                     2176        1
                                     2210        1
                                     2222        1
                                     2322        1
                                    

## Drop unknowns and save final CSV

In [21]:
df_final = df_pp_ok[df_pp_ok["label"].notna()].copy()
pp_path = OUT_DIR / "factual_ro_dataset_postprocessed.csv"
df_final.drop(columns=["text_md5"], errors="ignore").to_csv(pp_path, index=False, encoding="utf-8")
pp_path

WindowsPath('data/factual_ro_dataset_postprocessed.csv')