## Imports

In [7]:
import re
import json
import time
import hashlib
from pathlib import Path
from typing import List, Dict
from urllib.parse import urljoin, urlparse

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

## Setup

In [8]:
SECTIONS = {
    "it-stiinta": "https://www.timesnewroman.ro/it-stiinta/",
    "life-death": "https://www.timesnewroman.ro/life-death/",
    "monden": "https://www.timesnewroman.ro/monden/",
    "sport": "https://www.timesnewroman.ro/sport/",
    "politic": "https://www.timesnewroman.ro/politic/",
    "7lucruri": "https://www.timesnewroman.ro/7lucruri/",
}

N_PER_SECTION = 5
MAX_PAGES_PER_SECTION = 60
SLEEP_BETWEEN_REQUESTS = 0.5

OUT_DIR = Path("out_tnr")
OUT_DIR.mkdir(exist_ok=True)

BASE = "https://www.timesnewroman.ro"
IMG_EXT_RE = re.compile(r"\.(jpg|jpeg|png|webp|gif)(\?|#|$)", re.IGNORECASE)

print("Output folder:", OUT_DIR.resolve())

Output folder: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\dataset-creation\TNR\out_tnr


## Utils

In [9]:
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123 Safari/537.36",
    "Accept-Language": "ro-RO,ro;q=0.9,en-US;q=0.8,en;q=0.7",
})

def fetch(url: str, retries: int = 4, timeout: int = 30) -> str:
    last_err = None
    for i in range(retries):
        try:
            r = session.get(url, timeout=timeout)
            r.raise_for_status()
            return r.text
        except Exception as e:
            last_err = e
            time.sleep(0.8 * (i + 1))
    raise RuntimeError(f"Failed to fetch {url} after {retries} retries. Last error: {last_err}")

def extract_article_urls_from_section(html: str) -> List[str]:
    soup = BeautifulSoup(html, "lxml")
    urls: List[str] = []

    for a in soup.select("h2 a, h3 a, a[rel='bookmark']"):
        href = (a.get("href") or "").strip()
        if not href:
            continue
        href = urljoin(BASE, href)
        if not href.startswith(BASE):
            continue
        if IMG_EXT_RE.search(href):
            continue
        urls.append(href.split("#")[0])

    if not urls:
        for a in soup.find_all("a", href=True):
            href = urljoin(BASE, a["href"].strip())
            if not href.startswith(BASE):
                continue
            if IMG_EXT_RE.search(href):
                continue
            if any(f"/{sec}/" in href for sec in SECTIONS.keys()):
                urls.append(href.split("#")[0])

    return list(dict.fromkeys(urls))

def discover_section_urls(section_url: str, max_pages: int = 60) -> List[str]:
    found: List[str] = []
    for page in range(1, max_pages + 1):
        url = section_url if page == 1 else section_url.rstrip("/") + f"/page/{page}/"
        try:
            html = fetch(url)
        except Exception as e:
            print(f"[STOP] listing fetch failed: {url} -> {e}")
            break

        urls = extract_article_urls_from_section(html)
        if not urls:
            break

        before = len(found)
        found.extend(urls)
        found = list(dict.fromkeys(found))

        if len(found) == before:
            break

        time.sleep(SLEEP_BETWEEN_REQUESTS)

    return found

In [10]:
test_list = discover_section_urls(SECTIONS["it-stiinta"], max_pages=3)
len(test_list), test_list[:5]

(47,
 ['https://www.timesnewroman.ro/7lucruri/',
  'https://www.timesnewroman.ro/politic/',
  'https://www.timesnewroman.ro/sport/',
  'https://www.timesnewroman.ro/monden/',
  'https://www.timesnewroman.ro/life-death/'])

## Parse articles

In [11]:
def sha1(s: str) -> str:
    return hashlib.sha1((s or "").encode("utf-8")).hexdigest()

def md5(s: str) -> str:
    return hashlib.md5((s or "").encode("utf-8")).hexdigest()

def clean_text_blocks(container: BeautifulSoup) -> str:
    for tag in container.select("script, style, noscript, iframe"):
        tag.decompose()

    for tag in container.select("[id^='adocean'], .ads, .ad, .advert, .advertisement"):
        tag.decompose()

    paras = []
    for p in container.find_all(["p", "h2", "h3", "li"]):
        txt = p.get_text(" ", strip=True)
        if not txt:
            continue
        if len(txt) < 12 and not re.search(r"[A-Za-zĂÂÎȘȚăâîșț]", txt):
            continue
        if txt.strip() in {"", " "}:
            continue
        paras.append(txt)

    return "\n\n".join(paras).strip()

def parse_tnr_article(html: str, url: str, section: str) -> Dict:
    soup = BeautifulSoup(html, "lxml")

    title = ""
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(" ", strip=True)

    published_time = ""
    time_tag = soup.find("time")
    if time_tag and (time_tag.get("datetime") or time_tag.get_text(strip=True)):
        published_time = time_tag.get("datetime") or time_tag.get_text(" ", strip=True)

    if not published_time:
        meta_time = soup.find("meta", attrs={"property": "article:published_time"})
        if meta_time and meta_time.get("content"):
            published_time = meta_time["content"].strip()

    author = ""
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        author = meta_author["content"].strip()

    content = soup.select_one("div.content-container.page-editor-content")
    if not content:
        content = soup.find("article")

    if not content:
        raise ValueError("No content container found")

    premium_block = content.select_one("div.join-premium-container")
    is_premium = premium_block is not None

    text = clean_text_blocks(content)

    return {
        "id": sha1(url),
        "url": url,
        "source": "timesnewroman.ro",
        "section": section,
        "title": title,
        "author": author,
        "published_time": published_time,
        "is_premium": bool(is_premium),
        "text": text,
        "text_len": len(text),
        "label": "SATIRE",
        "label_group": "FALSE",
        "y": 0,
    }

def collect_free_articles_for_section(section_name: str, section_url: str,
                                      n_target: int = 5,
                                      max_pages: int = 60,
                                      min_words: int = 200,
                                      min_paras: int = 4) -> List[Dict]:
    listing_urls = discover_section_urls(section_url, max_pages=max_pages)

    kept: List[Dict] = []
    seen = set()

    for art_url in listing_urls:
        if len(kept) >= n_target:
            break
        if art_url in seen:
            continue
        seen.add(art_url)

        try:
            html = fetch(art_url)
            rec = parse_tnr_article(html, art_url, section_name)

            if rec["is_premium"]:
                continue

            wc = len(rec["text"].split())
            pc = rec["text"].count("\n\n") + 1 if rec["text"] else 0
            if wc < min_words or pc < min_paras:
                continue

            kept.append(rec)
        except Exception:
            pass

        time.sleep(SLEEP_BETWEEN_REQUESTS)

    return kept

In [12]:
all_rows = []
for sec_name, sec_url in SECTIONS.items():
    rows = collect_free_articles_for_section(sec_name, sec_url, n_target=N_PER_SECTION, max_pages=MAX_PAGES_PER_SECTION)
    print(f"[OK] section={sec_name} kept={len(rows)}/{N_PER_SECTION}")
    all_rows.extend(rows)

print("Total kept:", len(all_rows))
pd.DataFrame(all_rows)[["section","title","published_time","text_len","is_premium"]].head(10)

[OK] section=it-stiinta kept=5/5
[OK] section=life-death kept=5/5
[OK] section=monden kept=5/5
[OK] section=sport kept=5/5
[OK] section=politic kept=5/5
[OK] section=7lucruri kept=5/5
Total kept: 30


Unnamed: 0,section,title,published_time,text_len,is_premium
0,it-stiinta,28.000 de angajați TVR au mușcat-o și au dat b...,,1402,False
1,it-stiinta,Român verificat de ANAF. De unde are bani să r...,,1163,False
2,it-stiinta,ChatGPT își face de cap în redacția G4Media! A...,,2392,False
3,it-stiinta,(P) De ce tot mai mulți specialiști străini al...,,7841,False
4,it-stiinta,"Profesorul Abuzel Rohipnoleanu de la SNSPA, ac...",,1376,False
5,life-death,Armata anunţă că a doborât o dronă trasă de re...,,1312,False
6,life-death,MApN a trimis 100.000 de invitații la nuntă. C...,,1193,False
7,life-death,"O femeie poartă căciulă tot timpul, ca să nu m...",,1337,False
8,life-death,Erou! S-a aruncat în aparatul de reciclare să ...,,1301,False
9,life-death,Încep lucrările la şantierul de mutare a Casei...,,1238,False


## Save dataset

In [13]:
df = pd.DataFrame(all_rows)

if len(df) > 0:
    df = df.drop_duplicates(subset=["url"]).copy()
    df["text_hash"] = df["text"].fillna("").apply(lambda s: md5(s.lower()[:2000]))
    df = df.drop_duplicates(subset=["text_hash"]).drop(columns=["text_hash"])
    df = df.sort_values(["section", "published_time", "title"], na_position="last")

raw_path = OUT_DIR / "tnr_satire_raw.jsonl"
csv_path = OUT_DIR / "tnr_satire_dataset.csv"

with raw_path.open("w", encoding="utf-8") as f:
    for rec in df.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

df.to_csv(csv_path, index=False, encoding="utf-8")

print("Saved JSONL:", raw_path.resolve())
print("Saved CSV  :", csv_path.resolve())
display(df.head(20))

Saved JSONL: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\dataset-creation\TNR\out_tnr\tnr_satire_raw.jsonl
Saved CSV  : D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\dataset-creation\TNR\out_tnr\tnr_satire_dataset.csv


Unnamed: 0,id,url,source,section,title,author,published_time,is_premium,text,text_len,label,label_group,y
27,028458200cfc41db7928cc7f9e92fe5eac9a072f,https://www.timesnewroman.ro/7lucruri/7-lucrur...,timesnewroman.ro,7lucruri,7 lucruri foarte bune despre încălzirea globală,Arnia Software,,False,„Oamenii de știință” susțin că metanul de la b...,1203,SATIRE,False,0
25,633853aab3c3f7a5ca0164f3a3dcfbbf4c1e3e4b,https://www.timesnewroman.ro/7lucruri/7-meseri...,timesnewroman.ro,7lucruri,7 meserii care vor exista și în viitor,Arnia Software,,False,Instalatorul spațial: care va ști să pună câlț...,1196,SATIRE,False,0
28,a17da490db46b7d756e647df4e51eec0e230d6aa,https://www.timesnewroman.ro/7lucruri/cele-7-f...,timesnewroman.ro,7lucruri,Cele 7 faze prin care trece un român după ce i...,Arnia Software,,False,Șocul inițial: Primul impuls este unul de surp...,1726,SATIRE,False,0
26,4e81d0c03e20a7668aa39a18752f395eaa92ec67,https://www.timesnewroman.ro/7lucruri/top-7-zo...,timesnewroman.ro,7lucruri,Top 7 zodii care stau cel mai mult la cozi la ...,Arnia Software,,False,Berbecii sunt cunoscuți pentru răbdarea lor sc...,1749,SATIRE,False,0
29,c465f3ab6cc2521a8e720c9cc04b3b8678aa1389,https://www.timesnewroman.ro/7lucruri/urmatoar...,timesnewroman.ro,7lucruri,Următoarele 7 filme pe care le va lansa Mircea...,Arnia Software,,False,Botezul Focului. Mircea vrea să dea lovitura l...,2968,SATIRE,False,0
3,22f02a41907575127a10eba96c47574be4489a86,https://www.timesnewroman.ro/it-stiinta/p-de-c...,timesnewroman.ro,it-stiinta,(P) De ce tot mai mulți specialiști străini al...,Arnia Software,,False,"În 2024, Cluj-Napoca găzduiește peste 20.000 d...",7841,SATIRE,False,0
0,a582e3413b26fc157707c19e4a5ce5b02b48f1ad,https://www.timesnewroman.ro/it-stiinta/28-000...,timesnewroman.ro,it-stiinta,28.000 de angajați TVR au mușcat-o și au dat b...,Arnia Software,,False,TVR a fost zguduit de un atac cibernetic fără ...,1402,SATIRE,False,0
2,243d425efcdc13f35cd838ab278db1606cfb26db,https://www.timesnewroman.ro/it-stiinta/chatgp...,timesnewroman.ro,it-stiinta,ChatGPT își face de cap în redacția G4Media! A...,Arnia Software,,False,"Panică în redacția G4Media (pardon, G3Media, c...",2392,SATIRE,False,0
4,72ee5cb9f81519a22fe6aeb1bca18daa733e53ea,https://www.timesnewroman.ro/it-stiinta/profes...,timesnewroman.ro,it-stiinta,"Profesorul Abuzel Rohipnoleanu de la SNSPA, ac...",Arnia Software,,False,Noi acuzații grave la adresa unui cadru didact...,1376,SATIRE,False,0
1,9f6b3446aac854338c5ecbc2b8f420364e0e99b0,https://www.timesnewroman.ro/it-stiinta/roman-...,timesnewroman.ro,it-stiinta,Român verificat de ANAF. De unde are bani să r...,Arnia Software,,False,Descindere ANAF la domiciliul unui român bănui...,1163,SATIRE,False,0


In [14]:
if len(df) == 0:
    print("No rows collected. Increase MAX_PAGES_PER_SECTION or lower min_words/min_paras.")
else:
    print("Rows:", len(df))
    display(df["section"].value_counts())
    display(df[["text_len"]].describe())

    for sec in df["section"].unique():
        row = df[df["section"] == sec].iloc[0]
        print("\n" + "="*100)
        print("SECTION:", sec)
        print("TITLE  :", row.get("title","")[:120])
        print("URL    :", row.get("url",""))
        print("LEN    :", row.get("text_len",0))
        print(row.get("text","")[:600], "...")

Rows: 30


section
7lucruri      5
it-stiinta    5
life-death    5
monden        5
politic       5
sport         5
Name: count, dtype: int64

Unnamed: 0,text_len
count,30.0
mean,1683.033333
std,1224.632992
min,1087.0
25%,1238.25
50%,1385.5
75%,1555.5
max,7841.0



SECTION: 7lucruri
TITLE  : 7 lucruri foarte bune despre încălzirea globală
URL    : https://www.timesnewroman.ro/7lucruri/7-lucruri-foarte-bune-despre-incalzirea-globala/
LEN    : 1203
„Oamenii de știință” susțin că metanul de la bășinile vacilor generează o parte din această încălzire, vestea bună fiind că vaca acum nu ține doar de foame, ci și de cald;

Se topește gheața de la Polul Sud, ceea ce înseamnă că vom putea cultiva porumb și crește vaci acolo;

Cică gazele cu efect de seră sunt la cel mai înalt nivel din ultimii 4,5 milioane de ani, adică de pe vremea când existau mastodonți și mamuți, care și ăia sunt buni de mâncat;

Dacă se topește gheața și crește nivelul mării vor fi distruse doar zonele de coastă, unde nu sunt foarte multe turme de vaci foarte gustoase;

Pe ...

SECTION: it-stiinta
TITLE  : (P) De ce tot mai mulți specialiști străini aleg Clujul pentru a lucra în sectorul IT
URL    : https://www.timesnewroman.ro/it-stiinta/p-de-ce-tot-mai-multi-specialisti-straini-al