## Imports

In [20]:
import json
import math
import hashlib
import datetime
from pathlib import Path
from urllib.parse import urlparse
from typing import Dict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

## Setup

In [21]:
PROJECT_ROOT = Path(".")
DATASETS_ROOT = Path("../dataset-creation")

PATHS = {
    "afp": DATASETS_ROOT / "AFP" / "out_afp_verificat" / "afp_verificat_dataset.csv",
    "factual": DATASETS_ROOT / "Factual" / "data" / "factual_ro_dataset_postprocessed.csv",
    "veridica": DATASETS_ROOT / "Veridica" / "data_veridica" / "veridica_dataset.csv",
    "ocr": DATASETS_ROOT / "Pseudo-FakeRom" / "ocr_fake_news_dataset.csv",
    "tnr": DATASETS_ROOT / "TNR" / "out_tnr" / "tnr_satire_dataset.csv",
}

for k, p in PATHS.items():
    print(f"{k:8s} -> exists={p.exists()}  path={p}")

OUT_DIR = Path("out_source_prior")
OUT_DIR.mkdir(exist_ok=True)
print("OUT_DIR:", OUT_DIR.resolve())

afp      -> exists=True  path=..\dataset-creation\AFP\out_afp_verificat\afp_verificat_dataset.csv
factual  -> exists=True  path=..\dataset-creation\Factual\data\factual_ro_dataset_postprocessed.csv
veridica -> exists=True  path=..\dataset-creation\Veridica\data_veridica\veridica_dataset.csv
ocr      -> exists=True  path=..\dataset-creation\Pseudo-FakeRom\ocr_fake_news_dataset.csv
tnr      -> exists=True  path=..\dataset-creation\TNR\out_tnr\tnr_satire_dataset.csv
OUT_DIR: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\source_veracity\out_source_prior


## Utils

In [22]:
def normalize_ws(s: str) -> str:
    return " ".join((s or "").split())

def safe_str(x) -> str:
    import numpy as _np
    return "" if x is None or (isinstance(x, float) and _np.isnan(x)) else str(x)

def normalize_label(s: str) -> str:
    return normalize_ws(s).upper()

def md5(s: str) -> str:
    return hashlib.md5((s or "").encode("utf-8")).hexdigest()

def get_domain(u: str) -> str:
    u = safe_str(u).strip()
    if not u:
        return ""
    try:
        d = urlparse(u).netloc.lower()
        return d.replace("www.", "")
    except Exception:
        return ""

In [23]:
TRUE_SET_RO = {"ADEVĂRAT", "ADEVARAT", "PARȚIAL ADEVĂRAT", "PARTIAL ADEVARAT", "PARTIAL ADEVĂRAT", "REAL", "TRUE"}
FALSE_SET_RO = {
    "FALS", "TRUNCHIAT", "ÎNȘELĂTOR", "INȘELĂTOR", "INSELATOR", "CONTEXT LIPSĂ", "CONTEXT LIPSA",
    "LIPSA CONTEXTULUI", "FOTOGRAFIE ALTERATĂ", "FOTOGRAFIE ALTERATA",
    "VIDEOCLIP ALTERAT", "VIDEO ALTERAT", "DEEPFAKE", "SATIRĂ", "SATIRA", "SATIRE", "FARSĂ", "FARSA",
    "FAKE", "FALSE", "FAKE NEWS", "DEZINFORMARE", "FABRICATED", "PROPAGANDA", "PROPAGANDĂ", "PROPAGANDĂ DE RĂZBOI"
}
UNVERIFIABLE_SET_RO = {"IMPOSIBIL DE VERIFICAT", "S-A RĂZGÂNDIT", "S-A RAZGANDIT", "PLAUSIBLE"}

def map_label_binary(label_fine: str):
    L = normalize_label(label_fine)
    if L in TRUE_SET_RO:
        return 1
    if L in FALSE_SET_RO:
        return 0
    if L in UNVERIFIABLE_SET_RO:
        return None
    return None

In [24]:
def load_afp(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "afp"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    out["source_url"] = df.get("source_url", "").fillna("").apply(safe_str)
    out["source_domain"] = out["source_url"].apply(get_domain)
    out["label_fine"] = df.get("label_norm", df.get("label", "")).fillna("").apply(safe_str)
    out["y"] = out["label_fine"].apply(map_label_binary)
    out["text"] = (df.get("title","").fillna("") + " [SEP] " + df.get("claim","").fillna("")).apply(safe_str).apply(normalize_ws)
    return out

def load_factual(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "factual"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    out["source_url"] = df.get("source", df.get("speaker_url", "")).fillna("").apply(safe_str)
    out["source_domain"] = out["source_url"].apply(get_domain)
    out["label_fine"] = df.get("label", "").fillna("").apply(safe_str)
    out["y"] = out["label_fine"].apply(map_label_binary)
    out["text"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    return out

def load_veridica(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "veridica"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    # veridica dataset is itself a fact-check site, so publisher is veridica.ro; but we exclude factcheck domains anyway
    out["source_url"] = out["url"]
    out["source_domain"] = out["url"].apply(get_domain)
    out["label_fine"] = df.get("label", "").fillna("").apply(safe_str)
    out["y"] = 0  # your assumption: all fake (keep consistent with your pipeline)
    out["text"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    return out

def load_ocr(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "ocr"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = ""
    out["source_url"] = ""
    out["source_domain"] = ""  # unknown publisher
    out["label_fine"] = df.get("label", "").fillna("").apply(safe_str)
    if "label_group" in df.columns:
        lg = df["label_group"].fillna("").apply(safe_str).str.upper()
        out["y"] = lg.map({"REAL": 1, "TRUE": 1, "FAKE": 0, "FALSE": 0})
    else:
        out["y"] = out["label_fine"].apply(map_label_binary)
    out["text"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    return out

def load_tnr(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "tnr"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    out["source_url"] = out["url"]
    out["source_domain"] = out["url"].apply(get_domain)
    out["label_fine"] = df.get("label", "SATIRE").fillna("").apply(safe_str)
    out["y"] = out["label_fine"].apply(map_label_binary)  # SATIRE -> 0 via FALSE_SET_RO
    out["text"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    return out

## Load datasets

In [25]:
dfs = []
if PATHS["afp"].exists(): dfs.append(load_afp(PATHS["afp"]))
if PATHS["factual"].exists(): dfs.append(load_factual(PATHS["factual"]))
if PATHS["veridica"].exists(): dfs.append(load_veridica(PATHS["veridica"]))
if PATHS["ocr"].exists(): dfs.append(load_ocr(PATHS["ocr"]))
if PATHS["tnr"].exists(): dfs.append(load_tnr(PATHS["tnr"]))

data = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print("Unified rows:", len(data))
display(data["dataset"].value_counts(dropna=False))
display(data.head(3))

data["source_domain"] = data["source_domain"].fillna("").astype(str).str.lower()
data["has_domain"] = data["source_domain"].str.len() > 0

Unified rows: 2125


dataset
NaN    2125
Name: count, dtype: int64

Unnamed: 0,dataset,id,url,source_url,source_domain,label_fine,y,text
0,,ec5256cca1a28dddab1e6bcea06d7698042da8cb,https://verificat.afp.com/doc.afp.com.32AD84J,https://www.facebook.com/mariana.muntean/posts...,facebook.com,CONTEXT LIPSĂ,0.0,Această înregistrare video nu este o dovadă că...
1,,4010817634d8d58df5de9d932abace7d2a8ec2f5,https://verificat.afp.com/doc.afp.com.32BY897,https://www.facebook.com/Lupul.Dacic.blog/post...,facebook.com,CONTEXT LIPSĂ,0.0,Focarele de variola maimuței nu sunt legate de...
2,,d11422ae41ff6c23d485c27ccc49b8b1a8e55d88,https://verificat.afp.com/doc.afp.com.32CX3EH,https://www.facebook.com/permalink.php?story_f...,facebook.com,SATIRĂ,0.0,Videoclipul care arată o „păpușă Ken însărcina...


## Choose domains to trust as source prior

In [26]:
PLATFORM_DOMAINS = {
    "facebook.com", "m.facebook.com",
    "tiktok.com",
    "youtube.com", "youtu.be",
    "twitter.com", "x.com",
    "instagram.com",
    "reddit.com",
    "telegram.org", "t.me",
}

FACTCHECK_DOMAINS = {
    "factual.ro",
    "verificat.afp.com", "factcheck.afp.com", "factuel.afp.com",
    "veridica.ro",
}

In [27]:
eligible = data[
    (data["y"].isin([0,1])) &
    (data["has_domain"]) &
    (~data["source_domain"].isin(FACTCHECK_DOMAINS)) &
    (~data["source_domain"].isin(PLATFORM_DOMAINS))
].copy()

print("Rows eligible for INTERNAL publisher prior:", len(eligible))
display(eligible["source_domain"].value_counts().head(30))

Rows eligible for INTERNAL publisher prior: 32


source_domain
timesnewroman.ro    30
stiripesurse.ro      1
activenews.ro        1
Name: count, dtype: int64

## Compute prior

In [28]:
def safe_logit(p: float, eps: float = 1e-6) -> float:
    p = min(max(p, eps), 1 - eps)
    return math.log(p / (1 - p))

def inv_logit(z: float) -> float:
    return 1 / (1 + math.exp(-z))

def compute_internal_publisher_prior(df: pd.DataFrame, prior_p: float = 0.5, prior_strength: float = 10.0) -> pd.DataFrame:
    g = df.groupby("source_domain")["y"]
    stats = g.agg(n="count", n_true="sum").reset_index()
    stats["n_false"] = stats["n"] - stats["n_true"]

    stats["p_true_smooth"] = (stats["n_true"] + prior_p * prior_strength) / (stats["n"] + prior_strength)
    stats["logit_p_true"] = stats["p_true_smooth"].apply(safe_logit)

    stats["strength"] = np.sqrt(stats["n"] / (stats["n"] + prior_strength))
    stats["source_score_internal"] = stats["logit_p_true"] * stats["strength"]
    stats["p_true_internal"] = stats["source_score_internal"].apply(inv_logit)

    return stats.sort_values("n", ascending=False)

internal_tbl = compute_internal_publisher_prior(eligible, prior_p=0.5, prior_strength=10.0)
display(internal_tbl.head(30))

Unnamed: 0,source_domain,n,n_true,n_false,p_true_smooth,logit_p_true,strength,source_score_internal,p_true_internal
2,timesnewroman.ro,30,0.0,30.0,0.125,-1.94591,0.866025,-1.685208,0.156407
0,activenews.ro,1,0.0,1.0,0.454545,-0.182322,0.301511,-0.054972,0.48626
1,stiripesurse.ro,1,0.0,1.0,0.454545,-0.182322,0.301511,-0.054972,0.48626


In [29]:
eligible["text_hash"] = eligible["text"].fillna("").apply(lambda s: md5(s.lower()[:2000]))

gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
tr_idx, te_idx = next(gss.split(eligible, groups=eligible["text_hash"]))
train_prior = eligible.iloc[tr_idx].copy()
test_prior = eligible.iloc[te_idx].copy()

train_table = compute_internal_publisher_prior(train_prior, prior_p=0.5, prior_strength=10.0).set_index("source_domain")

def score_for_domain(domain: str) -> float:
    if not domain:
        return 0.0
    if domain in train_table.index:
        return float(train_table.loc[domain, "source_score_internal"])
    return 0.0

test_prior["source_score"] = test_prior["source_domain"].apply(score_for_domain)
test_prior["p_true_src"] = test_prior["source_score"].apply(inv_logit)

y_true = test_prior["y"].astype(int).values
p_true = test_prior["p_true_src"].values

auc = roc_auc_score(y_true, p_true) if len(np.unique(y_true)) > 1 else float("nan")
print("AUC (publisher prior only):", auc)

y_pred = (p_true >= 0.5).astype(int)
print(classification_report(y_true, y_pred, target_names=["FALSE","TRUE"]))

fpr, tpr, _ = roc_curve(y_true, p_true)
plt.figure()
plt.plot(fpr, tpr)
plt.plot([0,1],[0,1])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC — Publisher prior only")
plt.show()

AUC (publisher prior only): nan


ValueError: Number of classes, 1, does not match size of target_names, 2. Try specifying the labels parameter

## Extend with MBFC

In [30]:
MBFC_RAW_URL = "https://raw.githubusercontent.com/osome-iu/ChatGPT_domain_rating/main/data/mbfc_ratings.csv"

mbfc = pd.read_csv(MBFC_RAW_URL)

domain_col = None
for cand in ["domain", "source", "url", "website"]:
    if cand in mbfc.columns:
        domain_col = cand
        break
if domain_col is None:
    raise RuntimeError(f"Cannot find domain column in MBFC file. Columns: {list(mbfc.columns)}")

mbfc["source_domain"] = mbfc[domain_col].astype(str).apply(lambda x: get_domain(x) if "://" in x else x.lower().replace("www.",""))
mbfc["source_domain"] = mbfc["source_domain"].str.strip()

factual_col = None
for cand in ["factual_reporting", "factuality", "factual", "factual reporting"]:
    if cand in mbfc.columns:
        factual_col = cand
        break
if factual_col is None:
    raise RuntimeError(f"Cannot find factuality column in MBFC file. Columns: {list(mbfc.columns)}")

mbfc["mbfc_factuality"] = mbfc[factual_col].astype(str).str.strip().str.lower()

FACT2P = {
    "very high": 0.92,
    "high": 0.85,
    "mostly factual": 0.75,
    "mixed": 0.55,
    "low": 0.25,
    "very low": 0.12,
    "unknown": 0.50,
    "na": 0.50,
    "n/a": 0.50,
}

def mbfc_to_p_true(s: str) -> float:
    s = (s or "").strip().lower()
    if s in FACT2P:
        return FACT2P[s]
    for k, v in FACT2P.items():
        if k in s:
            return v
    return 0.50

mbfc["p_true_mbfc"] = mbfc["mbfc_factuality"].apply(mbfc_to_p_true)
mbfc["source_score_mbfc_raw"] = mbfc["p_true_mbfc"].apply(safe_logit)

mbfc_small = mbfc[["source_domain","mbfc_factuality","p_true_mbfc","source_score_mbfc_raw"]].drop_duplicates("source_domain")
display(mbfc_small.head(10))

Unnamed: 0,source_domain,mbfc_factuality,p_true_mbfc,source_score_mbfc_raw
0,100milefreepress.net,high,0.85,1.734601
1,604now.com,mostly factual,0.75,1.098612
2,972mag.com,high,0.85,1.734601
3,whitehouse.gov,mostly factual,0.75,1.098612
4,abcnews.go.com,high,0.85,1.734601
5,abc.net.au,high,0.85,1.734601
6,abc11.com,high,0.85,1.734601
7,abbynews.com,high,0.85,1.734601
8,abovethelaw.com,high,0.85,1.734601
9,aceshowbiz.com,mixed,0.55,0.200671


## Build final table

In [31]:
MIN_INTERNAL_N = 5

internal_small = internal_tbl[[
    "source_domain","n","n_true","n_false","p_true_smooth","logit_p_true","strength","source_score_internal","p_true_internal"
]].copy()

final = pd.merge(internal_small, mbfc_small, on="source_domain", how="outer")

# compute n_total even if missing
final["n_total"] = final.get("n", 0).fillna(0)

final["is_platform"] = final["source_domain"].isin(PLATFORM_DOMAINS)
final["is_factcheck"] = final["source_domain"].isin(FACTCHECK_DOMAINS)

# decide source_score_final
final["source_score_final"] = 0.0
final["evidence"] = "neutral"

mask_internal = (~final["is_platform"]) & (~final["is_factcheck"]) & (final["n_total"] >= MIN_INTERNAL_N) & final["source_score_internal"].notna()
mask_mbfc = (~final["is_platform"]) & (~final["is_factcheck"]) & (~mask_internal) & final["source_score_mbfc_raw"].notna()

final.loc[mask_internal, "source_score_final"] = final.loc[mask_internal, "source_score_internal"]
final.loc[mask_internal, "evidence"] = "internal"

final.loc[mask_mbfc, "source_score_final"] = final.loc[mask_mbfc, "source_score_mbfc_raw"]
final.loc[mask_mbfc, "evidence"] = "mbfc"

# platform-neutral overrides everything
final.loc[final["is_platform"], "source_score_final"] = 0.0
final.loc[final["is_platform"], "evidence"] = "platform-neutral"

# factcheck-neutral (also safe)
final.loc[final["is_factcheck"], "source_score_final"] = 0.0
final.loc[final["is_factcheck"], "evidence"] = "factcheck-neutral"

final["p_true_final"] = final["source_score_final"].apply(inv_logit)
final["updated_at"] = datetime.datetime.utcnow().strftime("%Y-%m-%d")

out_cols = [
    "source_domain",
    "p_true_final",
    "source_score_final",
    "evidence",
    "n_total",
    "n_true",
    "n_false",
    "mbfc_factuality",
    "updated_at",
]
for c in out_cols:
    if c not in final.columns:
        final[c] = np.nan

final_table = final[out_cols].sort_values(["evidence","n_total","source_domain"], ascending=[True, False, True])
display(final_table.head(30))

Unnamed: 0,source_domain,p_true_final,source_score_final,evidence,n_total,n_true,n_false,mbfc_factuality,updated_at
3735,timesnewroman.ro,0.156407,-1.685208,internal,30.0,0.0,30.0,,2025-12-26
0,,0.55,0.200671,mbfc,0.0,,,mixed,2025-12-26
1,100milefreepress.net,0.85,1.734601,mbfc,0.0,,,high,2025-12-26
2,100percentfedup.com,0.25,-1.098612,mbfc,0.0,,,low,2025-12-26
3,1011now.com,0.85,1.734601,mbfc,0.0,,,high,2025-12-26
4,10tv.com,0.85,1.734601,mbfc,0.0,,,high,2025-12-26
5,11alive.com,0.85,1.734601,mbfc,0.0,,,high,2025-12-26
6,12minutos.com,0.25,-1.098612,mbfc,0.0,,,low,2025-12-26
7,12news.com,0.85,1.734601,mbfc,0.0,,,high,2025-12-26
8,12newsnow.com,0.85,1.734601,mbfc,0.0,,,high,2025-12-26


In [32]:
prior_csv = OUT_DIR / "source_veracity_table.csv"
prior_json = OUT_DIR / "source_veracity_table.json"

final_table.to_csv(prior_csv, index=False, encoding="utf-8")
with prior_json.open("w", encoding="utf-8") as f:
    json.dump(final_table.to_dict(orient="records"), f, ensure_ascii=False, indent=2)

print("Saved:", prior_csv.resolve())
print("Saved:", prior_json.resolve())

Saved: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\source_veracity\out_source_prior\source_veracity_table.csv
Saved: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\source_veracity\out_source_prior\source_veracity_table.json


## Example usage

In [33]:
def load_prior_table(path: Path) -> Dict[str, float]:
    tbl = pd.read_csv(path, encoding="utf-8")
    return {row["source_domain"]: float(row["source_score_final"]) for _, row in tbl.iterrows()}

PRIOR = load_prior_table(prior_csv)

def get_source_score_from_url(source_url: str) -> float:
    d = get_domain(source_url)
    return float(PRIOR.get(d, 0.0))

examples = [
    "https://www.timesnewroman.ro/7lucruri/7-lucruri-foarte-bune-despre-incalzirea-globala/",
    "https://www.facebook.com/somepage/posts/123",
    "https://www.digi24.ro/stiri/actualitate/whatever",
    "https://unknown.example/whatever",
]
for u in examples:
    print(u, "=>", get_source_score_from_url(u))


https://www.timesnewroman.ro/7lucruri/7-lucruri-foarte-bune-despre-incalzirea-globala/ => -1.685207622563865
https://www.facebook.com/somepage/posts/123 => 0.0
https://www.digi24.ro/stiri/actualitate/whatever => 1.7346010553881064
https://unknown.example/whatever => 0.0
