## Imports

In [1]:
import re
import time
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

## Setup

In [2]:
pd.set_option("display.max_colwidth", 120)

BASE = "https://www.veridica.ro"
LISTING_URL = f"{BASE}/baza-de-date"

OUT_DIR = Path("data_veridica")
OUT_DIR.mkdir(exist_ok=True, parents=True)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; veridica-scraper/1.0; +https://example.com)",
    "Accept-Language": "ro-RO,ro;q=0.9,en;q=0.7",
}

REQUEST_DELAY_SEC = 0.8
TIMEOUT = 30
MAX_RETRIES = 4

## Utilities

In [3]:
def sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def norm_ws(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip())

def fetch_html(url: str, session: Optional[requests.Session] = None, cache_dir: Path = OUT_DIR / "cache") -> str:
    cache_dir.mkdir(exist_ok=True, parents=True)
    cache_path = cache_dir / f"{sha1(url)}.html"
    if cache_path.exists():
        return cache_path.read_text(encoding="utf-8", errors="ignore")

    sess = session or requests.Session()
    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = sess.get(url, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
            html = r.text
            cache_path.write_text(html, encoding="utf-8")
            time.sleep(REQUEST_DELAY_SEC)
            return html
        except Exception as e:
            last_err = e
            time.sleep(REQUEST_DELAY_SEC * attempt)
    raise RuntimeError(f"Failed to fetch {url} after {MAX_RETRIES} retries. Last error: {last_err}")

In [4]:
def get_last_page_from_listing(html: str) -> int:
    soup = BeautifulSoup(html, "lxml")
    nums = []
    for a in soup.find_all("a", href=True):
        txt = (a.get_text() or "").strip()
        if txt.isdigit():
            nums.append(int(txt))
    return max(nums) if nums else 1

def parse_listing_page(html: str, page_url: str) -> List[Dict[str, Any]]:
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table")
    trs = table.find_all("tr") if table else soup.find_all("tr")

    rows = []
    for tr in trs:
        tds = tr.find_all(["td", "th"])
        if len(tds) < 2:
            continue

        header_like = norm_ws(" ".join(td.get_text(" ", strip=True) for td in tds)).lower()
        if "data" in header_like and "titlu" in header_like:
            continue

        date_txt = norm_ws(tds[0].get_text(" ", strip=True))

        a_title = tds[1].find("a", href=True)
        if not a_title:
            continue
        href = a_title["href"].strip()
        if href.startswith("/"):
            href = BASE + href
        title = norm_ws(a_title.get_text(" ", strip=True))

        sources = []
        if len(tds) >= 3:
            for a in tds[2].find_all("a", href=True):
                sources.append({"title": norm_ws(a.get_text(" ", strip=True)), "url": a["href"].strip()})

        country = None
        if len(tds) >= 4:
            country = norm_ws(tds[3].get_text(" ", strip=True)) or None

        rows.append({
            "url": href,
            "listing_date": date_txt or None,
            "listing_country": country,
            "listing_sources": sources,
            "listing_page": page_url,
            "listing_title": title,
        })

    dedup, seen = [], set()
    for r in rows:
        if r["url"] in seen:
            continue
        seen.add(r["url"])
        dedup.append(r)
    return dedup

def discover_listing_index(max_pages: Optional[int] = None) -> pd.DataFrame:
    sess = requests.Session()
    html0 = fetch_html(LISTING_URL, sess)
    last_page = get_last_page_from_listing(html0)
    if max_pages is not None:
        last_page = min(last_page, max_pages)

    all_rows = []
    for p in tqdm(range(1, last_page + 1), desc="Listing pages"):
        url = LISTING_URL if p == 1 else f"{LISTING_URL}?page={p}"
        html = html0 if p == 1 else fetch_html(url, sess)
        all_rows.extend(parse_listing_page(html, url))

    df_index = pd.DataFrame(all_rows).drop_duplicates(subset=["url"]).reset_index(drop=True)
    return df_index

## List discoveries

In [5]:
df_index = discover_listing_index(max_pages=None)
df_index.head(3), len(df_index)

Listing pages:   0%|          | 0/210 [00:00<?, ?it/s]

(                                                                                                                       url  \
 0  https://www.veridica.ro/fake-news-dezinformare-propaganda/fake-news-europarlamentarii-romani-au-votat-pentru-inarmar...   
 1  https://www.veridica.ro/fake-news-dezinformare-propaganda/republica-moldova-2025-top-fake-news-dezinformari-demontat...   
 2  https://www.veridica.ro/fake-news-dezinformare-propaganda/propaganda-de-razboi-ucraina-e-o-dictatura-si-nu-poate-org...   
 
   listing_date listing_country  \
 0   22.12.2025         România   
 1   22.12.2025            None   
 2   21.12.2025         Ucraina   
 
                                                                                                            listing_sources  \
 0                                                                                                                       []   
 1                                                                                            

## Set labels from Veridica

In [7]:
LABEL_RE = re.compile(r"^\s*([A-ZĂÂÎȘȚ\- ]{3,}?)\s*:\s*(.+?)\s*$")

def split_label_and_claim(title: str) -> Tuple[Optional[str], str]:
    title = norm_ws(title)
    m = LABEL_RE.match(title)
    if not m:
        return None, title
    label = norm_ws(m.group(1))
    claim = norm_ws(m.group(2))
    return label, claim

def extract_meta_published_time(soup: BeautifulSoup) -> Optional[str]:
    for prop in ["article:published_time", "og:published_time"]:
        m = soup.find("meta", attrs={"property": prop})
        if m and m.get("content"):
            return m["content"][:10]
    t = soup.find("time", attrs={"datetime": True})
    if t:
        return t["datetime"][:10]
    return None

def extract_tags(soup: BeautifulSoup) -> List[str]:
    tags = []
    for a in soup.select("a[href*='/eticheta/'], a[href*='/tag/']"):
        txt = norm_ws(a.get_text(" ", strip=True))
        if txt and len(txt) <= 60:
            tags.append(txt)
    out, seen = [], set()
    for t in tags:
        key = t.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(t)
    return out

def extract_sources_block(soup: BeautifulSoup) -> List[Dict[str, str]]:
    links = []
    text = soup.get_text("\n", strip=True)
    if "Verifică sursele" not in text:
        return links
    heading = None
    for tag in soup.find_all(["h2", "h3", "h4", "strong", "p"]):
        if "Verifică sursele" in (tag.get_text(" ", strip=True) or ""):
            heading = tag
            break
    if not heading:
        return links
    ul = heading.find_next(["ul", "ol"])
    if not ul:
        return links
    for a in ul.find_all("a", href=True):
        links.append({"title": norm_ws(a.get_text(" ", strip=True)), "url": a["href"].strip()})
    return links

SECTION_MARKERS = [
    "ȘTIRE:",
    "NARAȚIUNE:",
    "NARAŢIUNE:",
    "NARAȚIUNI:",
    "OBIECTIVE:",
    "OBIECTIV:",
    "DE CE ESTE FALSĂ NARAȚIUNEA:",
    "DE CE SUNT FALSE NARAȚIUNILE:",
    "CONTEXT:",
    "CONTEXT/ETOS LOCAL:",
    "Realitate:",
    "SÂMBURE DE ADEVĂR:",
]

def extract_sections(text: str) -> Dict[str, str]:
    text = text.replace("\r", "")
    text = re.sub(r"[ \t]+", " ", text)
    positions = []
    for mk in SECTION_MARKERS:
        idx = text.find(mk)
        if idx != -1:
            positions.append((idx, mk))
    if not positions:
        return {}
    positions.sort()
    out = {}
    for i, (idx, mk) in enumerate(positions):
        end = positions[i + 1][0] if i + 1 < len(positions) else len(text)
        chunk = text[idx + len(mk):end].strip()
        if len(chunk) < 5:
            continue
        key = mk.strip(":").strip()
        out[key] = norm_ws(chunk)
    return out

def parse_veridica_article(html: str, url: str, listing_row: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    soup = BeautifulSoup(html, "lxml")

    h1 = soup.find("h1")
    title = norm_ws(h1.get_text(" ", strip=True)) if h1 else None
    label, claim = split_label_and_claim(title or "")

    article = soup.find("article")
    main_text = norm_ws(article.get_text("\n", strip=True)) if article else norm_ws(soup.get_text("\n", strip=True))
    sections = extract_sections(main_text)

    rec = {
        "id": sha1(url),
        "url": url,
        "title": title,
        "label": label,
        "claim": claim,
        "published_time": extract_meta_published_time(soup),
        "tags": extract_tags(soup),
        "sources_block": extract_sources_block(soup),
        "sections": sections,
        "text": main_text,
    }

    if listing_row:
        rec.update({
            "listing_date": listing_row.get("listing_date"),
            "listing_country": listing_row.get("listing_country"),
            "listing_sources": listing_row.get("listing_sources"),
        })
    return rec

def build_dataset(df_index: pd.DataFrame, max_articles: Optional[int] = None) -> pd.DataFrame:
    sess = requests.Session()
    rows = []
    urls = df_index["url"].tolist()
    if max_articles is not None:
        urls = urls[:max_articles]

    index_map = {r["url"]: r for r in df_index.to_dict(orient="records")}

    for url in tqdm(urls, desc="Scraping articles"):
        try:
            html = fetch_html(url, sess)
            rec = parse_veridica_article(html, url, listing_row=index_map.get(url))
            rows.append(rec)
        except Exception as e:
            rows.append({"id": sha1(url), "url": url, "error": str(e)})
    return pd.DataFrame(rows)

## Build dataset

In [8]:
df = build_dataset(df_index, max_articles=None)
df.head(3)

Scraping articles:   0%|          | 0/1049 [00:00<?, ?it/s]

Unnamed: 0,id,url,title,label,claim,published_time,tags,sources_block,sections,text,listing_date,listing_country,listing_sources
0,8dcd5ed764e0f9338e8f40c898cb1f2d0998c615,https://www.veridica.ro/fake-news-dezinformare-propaganda/fake-news-europarlamentarii-romani-au-votat-pentru-inarmar...,FAKE NEWS: Europarlamentarii români au votat pentru înarmare și război,FAKE NEWS,Europarlamentarii români au votat pentru înarmare și război,2025-12-22,"[Invadarea Ucrainei, Război în Ucraina, Uniunea Europeană, înarmare, România]",[],"{'ȘTIRE': 'Lăsați orice speranță. Toți europarlamentarii noștri, înalți (ne)demni-tari, bine plătiți, îmbuibați și p...",FAKE NEWS: Europarlamentarii români au votat pentru înarmare și război © EPA/HANNIBAL HANSCHKE | A Rheinmetall Army ...,22.12.2025,România,[]
1,0d1395bd4b26313909e9d3b651b5ed617fb58d41,https://www.veridica.ro/fake-news-dezinformare-propaganda/republica-moldova-2025-top-fake-news-dezinformari-demontat...,Republica Moldova 2025: Top FAKE NEWS & DEZINFORMĂRI demontate de Veridica,,Republica Moldova 2025: Top FAKE NEWS & DEZINFORMĂRI demontate de Veridica,2025-12-22,"[Invadarea Ucrainei, Republica Moldova, Ucraina, Maia Sandu, Transnistria, alegeri, Rusia, Război în Ucraina, Români...",[],{},Republica Moldova 2025: Top FAKE NEWS & DEZINFORMĂRI demontate de Veridica © EPA/ROBERT GHEMENT | Militari din Repub...,22.12.2025,,[]
2,e8030fe71760deed43939c1d92284ecc16c6b1e2,https://www.veridica.ro/fake-news-dezinformare-propaganda/propaganda-de-razboi-ucraina-e-o-dictatura-si-nu-poate-org...,PROPAGANDĂ DE RĂZBOI: Ucraina e o dictatură și nu poate organiza alegeri libere,PROPAGANDĂ DE RĂZBOI,Ucraina e o dictatură și nu poate organiza alegeri libere,2025-12-21,"[Invadarea Ucrainei, Război în Ucraina, alegeri, Volodimir Zelenski]","[{'title': 'Zelenski l-a mințit pe Trump în privința alegerilor din Ucraina', 'url': 'https://ukraina.ru/20251216/ze...",{'NARAȚIUNI': '1. Zelenski a mințit SUA cu privire la organizarea alegerilor. 2. Ucraina este condusă de un regim di...,"PROPAGANDĂ DE RĂZBOI: Ucraina e o dictatură și nu poate organiza alegeri libere © EPA/STRINGER | Volodimir Zelenski,...",21.12.2025,Ucraina,"[{'title': 'Zelenski l-a mințit pe Trump în privința alegerilor din Ucraina', 'url': 'https://ukraina.ru/20251216/ze..."


In [9]:
raw_path = OUT_DIR / "veridica_raw.jsonl"
with raw_path.open("w", encoding="utf-8") as f:
    for rec in df.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

df_out = df.copy()
for col in ["tags", "sources_block", "listing_sources", "sections"]:
    if col in df_out.columns:
        df_out[col] = df_out[col].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (list, dict)) else x)

csv_path = OUT_DIR / "veridica_dataset.csv"
df_out.to_csv(csv_path, index=False, encoding="utf-8")

raw_path, csv_path

(WindowsPath('data_veridica/veridica_raw.jsonl'),
 WindowsPath('data_veridica/veridica_dataset.csv'))

## Post processing

In [10]:
def normalize_label(label: Optional[str]) -> Optional[str]:
    if not label:
        return None
    label = label.upper().replace("Ţ", "Ț").replace("Ş", "Ș")
    label = norm_ws(label)
    if label.startswith("FAKE NEWS"):
        return "FAKE_NEWS"
    if label.startswith("DEZINFORMARE"):
        return "DEZINFORMARE"
    if label.startswith("PROPAGANDĂ DE RĂZBOI") or label.startswith("PROPAGANDA DE RAZBOI"):
        return "PROPAGANDA_DE_RAZBOI"
    return label

df_pp = df.copy()
df_pp["label_norm"] = df_pp["label"].apply(normalize_label) if "label" in df_pp.columns else None
df_pp["text_len"] = df_pp["text"].fillna("").apply(len) if "text" in df_pp.columns else 0

if "error" in df_pp.columns:
    df_pp = df_pp[df_pp["error"].isna()].copy()
df_pp = df_pp[df_pp["text_len"] >= 300].copy()
df_pp = df_pp.drop_duplicates(subset=["url"]).reset_index(drop=True)

df_pp[["label_norm", "listing_country"]].value_counts().head(20)

label_norm            listing_country           
FAKE_NEWS             România                       189
                      Moldova                       158
PROPAGANDA_DE_RAZBOI  Rusia                         136
                      Ucraina                        67
DEZINFORMARE          Moldova                        53
FAKE_NEWS             Ucraina                        44
DEZINFORMARE          România                        35
                      Ucraina                        20
                      Rusia                          12
FAKE_NEWS             Rusia                          11
PROPAGANDA_DE_RAZBOI  România                         4
FAKE-NEWS             România                         2
DEZINFORMARE          Europe Union                    1
PROPAGANDA_DE_RAZBOI  Polonia                         1
                      Germany                         1
                      France                          1
FAKE_NEWS             Marea Britanie                  1

In [11]:
print("Index rows:", len(df_index))
print("Scraped rows:", len(df))
print("Postprocessed rows:", len(df_pp))
df_pp["label_norm"].value_counts(dropna=False).head(20)

Index rows: 1049
Scraped rows: 1049
Postprocessed rows: 1049


label_norm
FAKE_NEWS               508
PROPAGANDA_DE_RAZBOI    307
DEZINFORMARE            206
None                     24
FAKE-NEWS                 2
PROPAGANDĂ                2
Name: count, dtype: int64