## Setup

In [9]:
from pathlib import Path
import json, math, hashlib
import numpy as np
import pandas as pd

PROJECT_ROOT = Path(".")
DATASETS_ROOT = Path("../dataset-creation")

ARTIFACTS_DIR = PROJECT_ROOT / "artifacts"
OUT_FUSION_DIR = ARTIFACTS_DIR / "fusion"
OUT_FUSION_DIR.mkdir(parents=True, exist_ok=True)

PATHS = {
    "afp": DATASETS_ROOT / "AFP" / "out_afp_verificat" / "afp_verificat_dataset.csv",
    "factual": DATASETS_ROOT / "Factual" / "data" / "factual_ro_dataset_postprocessed.csv",
    "veridica": DATASETS_ROOT / "Veridica" / "data_veridica" / "veridica_dataset.csv",
    "ocr": DATASETS_ROOT / "Pseudo-FakeRom" / "ocr_fake_news_dataset.csv",
    "tnr": DATASETS_ROOT / "TNR" / "out_tnr" / "tnr_satire_dataset.csv",
}

for k, p in PATHS.items():
    print(f"{k:8s} -> exists={p.exists()}  path={p}")

VERACITY_MODEL_DIR  = Path("..") / "binary" / "models" / "veracity_roberta"
CLICKBAIT_MODEL_DIR = Path("..") / "clickbait" / "models" /"rocloco_roberta_clickbait"

SOURCE_TABLE_PATH = Path("..") / "source_veracity" / "out_source_prior" / "source_veracity_table.csv"
print("SOURCE_TABLE exists:", SOURCE_TABLE_PATH.exists(), SOURCE_TABLE_PATH)

afp      -> exists=True  path=..\dataset-creation\AFP\out_afp_verificat\afp_verificat_dataset.csv
factual  -> exists=True  path=..\dataset-creation\Factual\data\factual_ro_dataset_postprocessed.csv
veridica -> exists=True  path=..\dataset-creation\Veridica\data_veridica\veridica_dataset.csv
ocr      -> exists=True  path=..\dataset-creation\Pseudo-FakeRom\ocr_fake_news_dataset.csv
tnr      -> exists=True  path=..\dataset-creation\TNR\out_tnr\tnr_satire_dataset.csv
SOURCE_TABLE exists: True ..\source_veracity\out_source_prior\source_veracity_table.csv


## Utils

In [3]:
import re
from urllib.parse import urlparse

def normalize_ws(s: str) -> str:
    return " ".join((s or "").split())

def safe_str(x) -> str:
    import numpy as _np
    return "" if x is None or (isinstance(x, float) and _np.isnan(x)) else str(x)

def normalize_label(s: str) -> str:
    return normalize_ws(s).upper()

def md5(s: str) -> str:
    return hashlib.md5((s or "").encode("utf-8")).hexdigest()

def get_domain(u: str) -> str:
    u = safe_str(u).strip()
    if not u:
        return ""
    try:
        d = urlparse(u).netloc.lower()
        return d.replace("www.", "")
    except Exception:
        return ""

def clamp(p: float, eps: float = 1e-6) -> float:
    return float(max(eps, min(1.0 - eps, p)))

def logit(p: float) -> float:
    p = clamp(p)
    return math.log(p / (1 - p))

PLATFORM_DOMAINS = {
    "facebook.com","m.facebook.com",
    "tiktok.com",
    "youtube.com","youtu.be",
    "twitter.com","x.com",
    "instagram.com",
    "reddit.com",
    "telegram.org","t.me",
}

## Dataset loaders

In [4]:
TRUE_SET_RO = {"ADEVĂRAT", "ADEVARAT", "PARȚIAL ADEVĂRAT", "PARTIAL ADEVARAT", "PARTIAL ADEVĂRAT", "REAL", "TRUE"}
FALSE_SET_RO = {
    "FALS", "TRUNCHIAT", "ÎNȘELĂTOR", "INȘELĂTOR", "INSELATOR", "CONTEXT LIPSĂ", "CONTEXT LIPSA",
    "LIPSA CONTEXTULUI", "FOTOGRAFIE ALTERATĂ", "FOTOGRAFIE ALTERATA",
    "VIDEOCLIP ALTERAT", "VIDEO ALTERAT", "DEEPFAKE", "SATIRĂ", "SATIRA", "SATIRE", "FARSĂ", "FARSA",
    "FAKE", "FALSE", "FAKE NEWS", "DEZINFORMARE", "FABRICATED", "PROPAGANDA", "PROPAGANDĂ", "PROPAGANDĂ DE RĂZBOI"
}
UNVERIFIABLE_SET_RO = {"IMPOSIBIL DE VERIFICAT", "S-A RĂZGÂNDIT", "S-A RAZGANDIT", "PLAUSIBLE"}

def map_label_binary(label_fine: str):
    L = normalize_label(label_fine)
    if L in TRUE_SET_RO:
        return 1
    if L in FALSE_SET_RO:
        return 0
    if L in UNVERIFIABLE_SET_RO:
        return None
    return None

def load_afp(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "afp"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    out["source_url"] = df.get("source_url", "").fillna("").apply(safe_str)
    out["source_domain"] = out["source_url"].apply(get_domain)
    out["title"] = df.get("title", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["claim"] = df.get("claim", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["label_fine"] = df.get("label_norm", df.get("label", "")).fillna("").apply(safe_str)
    out["y"] = out["label_fine"].apply(map_label_binary)
    out["text_long"] = ""
    out["text_short"] = (out["title"].where(out["title"].str.len()>0, out["claim"]) + " [SEP] " + out["claim"]).str.strip()
    return out

def load_factual(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "factual"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    out["source_url"] = df.get("source", df.get("speaker_url", "")).fillna("").apply(safe_str)
    out["source_domain"] = out["source_url"].apply(get_domain)
    out["title"] = df.get("title", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["claim"] = df.get("claim", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["label_fine"] = df.get("label", "").fillna("").apply(safe_str)
    out["y"] = out["label_fine"].apply(map_label_binary)
    out["text_long"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["text_short"] = (out["title"].where(out["title"].str.len()>0, out["claim"]) + " [SEP] " + out["claim"]).str.strip()
    return out

def load_veridica(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "veridica"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    out["source_url"] = ""
    out["source_domain"] = out["url"].apply(get_domain)
    out["title"] = df.get("title", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["claim"] = df.get("claim", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["label_fine"] = df.get("label", "").fillna("").apply(safe_str)
    out["y"] = 0  # your current assumption for this scrape
    out["text_long"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["text_short"] = (out["title"].where(out["title"].str.len()>0, out["claim"]) + " [SEP] " + out["claim"]).str.strip()
    return out

def load_ocr(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "ocr"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = ""
    out["source_url"] = ""
    out["source_domain"] = ""
    out["title"] = ""
    out["claim"] = ""
    out["label_fine"] = df.get("label", "").fillna("").apply(safe_str)
    if "label_group" in df.columns:
        lg = df["label_group"].fillna("").apply(safe_str).str.upper()
        out["y"] = lg.map({"REAL": 1, "TRUE": 1, "FAKE": 0, "FALSE": 0})
    else:
        out["y"] = out["label_fine"].apply(map_label_binary)
    out["text_long"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["text_short"] = ""
    return out

def load_tnr(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8")
    out = pd.DataFrame()
    out["dataset"] = "tnr"
    out["id"] = df.get("id", pd.Series([None]*len(df))).astype(str)
    out["url"] = df.get("url", "").fillna("").apply(safe_str)
    out["source_url"] = out["url"]
    out["source_domain"] = out["url"].apply(get_domain)
    out["title"] = df.get("title", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["claim"] = ""
    out["label_fine"] = df.get("label", "SATIRE").fillna("").apply(safe_str)
    out["y"] = out["label_fine"].apply(map_label_binary)
    out["text_long"] = df.get("text", "").fillna("").apply(safe_str).apply(normalize_ws)
    out["text_short"] = out["title"]
    return out

dfs = []
if PATHS["afp"].exists(): dfs.append(load_afp(PATHS["afp"]))
if PATHS["factual"].exists(): dfs.append(load_factual(PATHS["factual"]))
if PATHS["veridica"].exists(): dfs.append(load_veridica(PATHS["veridica"]))
if PATHS["ocr"].exists(): dfs.append(load_ocr(PATHS["ocr"]))
if PATHS["tnr"].exists(): dfs.append(load_tnr(PATHS["tnr"]))

data = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print("Unified rows:", len(data))
display(data["dataset"].value_counts())
display(data.head(3))

Unified rows: 2125


Series([], Name: count, dtype: int64)

Unnamed: 0,dataset,id,url,source_url,source_domain,title,claim,label_fine,y,text_long,text_short
0,,ec5256cca1a28dddab1e6bcea06d7698042da8cb,https://verificat.afp.com/doc.afp.com.32AD84J,https://www.facebook.com/mariana.muntean/posts...,facebook.com,Această înregistrare video nu este o dovadă că...,Acest videoclip arată că aterizarea pe Lună a ...,CONTEXT LIPSĂ,0.0,,Această înregistrare video nu este o dovadă că...
1,,4010817634d8d58df5de9d932abace7d2a8ec2f5,https://verificat.afp.com/doc.afp.com.32BY897,https://www.facebook.com/Lupul.Dacic.blog/post...,facebook.com,Focarele de variola maimuței nu sunt legate de...,Variola maimuței este provocată de vaccinul As...,CONTEXT LIPSĂ,0.0,,Focarele de variola maimuței nu sunt legate de...
2,,d11422ae41ff6c23d485c27ccc49b8b1a8e55d88,https://verificat.afp.com/doc.afp.com.32CX3EH,https://www.facebook.com/permalink.php?story_f...,facebook.com,Videoclipul care arată o „păpușă Ken însărcina...,"Aceasta este o nouă păpușă Ken, care este însă...",SATIRĂ,0.0,,Videoclipul care arată o „păpușă Ken însărcina...


## Text inputs for each model

In [5]:
data["text_long"] = data["text_long"].fillna("").astype(str)
data["text_short"] = data["text_short"].fillna("").astype(str)
data["title"] = data["title"].fillna("").astype(str)
data["claim"] = data["claim"].fillna("").astype(str)

def choose_binary_input(row) -> str:
    tl = row["text_long"]
    ts = row["text_short"]
    if isinstance(tl, str) and len(tl) >= 200:
        header = ts.strip()
        if header:
            return f"[SHORT] {header}\n[LONG] {tl}".strip()
        return tl.strip()
    return ts.strip()

SENT_SPLIT_RE = re.compile(r"(?<=[\.!?])\s+")

def choose_clickbait_input(row, max_chars: int = 250) -> str:
    t = row.get("title","").strip()
    if t:
        return t[:max_chars]
    c = row.get("claim","").strip()
    if c:
        return c[:max_chars]
    body = row.get("text_long","").strip()
    if body:
        first = SENT_SPLIT_RE.split(body)[0].strip()
        return first[:max_chars] if first else body[:max_chars]
    return ""

data["text_input_veracity"] = data.apply(choose_binary_input, axis=1).fillna("").astype(str)
data["text_input_clickbait"] = data.apply(choose_clickbait_input, axis=1).fillna("").astype(str)
data["text_len"] = data["text_input_veracity"].str.len()
data["has_source_domain"] = data["source_domain"].fillna("").astype(str).str.len().gt(0).astype(int)

data_bin = data[data["y"].isin([0,1])].copy()
data_bin = data_bin[data_bin["text_len"] >= 30].copy()

data_bin["text_hash"] = data_bin["text_input_veracity"].apply(lambda s: md5(normalize_ws(s).lower()[:2000]))
print("Binary rows for fusion:", len(data_bin))
display(data_bin[["dataset","y"]].value_counts().head(20))

Binary rows for fusion: 2118


Series([], Name: count, dtype: int64)

## Train-val-test splits

In [6]:
from sklearn.model_selection import GroupShuffleSplit

gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss1.split(data_bin, groups=data_bin["text_hash"]))
train = data_bin.iloc[train_idx].copy()
test = data_bin.iloc[test_idx].copy()

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr_idx, val_idx = next(gss2.split(train, groups=train["text_hash"]))
train2 = train.iloc[tr_idx].copy()
val = train.iloc[val_idx].copy()

print("Train:", len(train2), "Val:", len(val), "Test:", len(test))
print("Train label:", train2["y"].value_counts().to_dict())
print("Val   label:", val["y"].value_counts().to_dict())
print("Test  label:", test["y"].value_counts().to_dict())

Train: 1355 Val: 339 Test: 424
Train label: {0.0: 1074, 1.0: 281}
Val   label: {0.0: 280, 1.0: 59}
Test  label: {0.0: 337, 1.0: 87}


## Load source_veracity table

In [7]:
import numpy as np
import pandas as pd

source_tbl = pd.read_csv(SOURCE_TABLE_PATH, encoding="utf-8")
source_tbl["source_domain"] = source_tbl["source_domain"].fillna("").astype(str).str.lower().str.replace("www.","", regex=False)
DOMAIN2SCORE = dict(zip(source_tbl["source_domain"], source_tbl["source_score_final"]))

def source_score_for_domain(domain: str) -> float:
    d = (domain or "").strip().lower().replace("www.","")
    if not d:
        return 0.0
    if d in PLATFORM_DOMAINS:
        return 0.0
    return float(DOMAIN2SCORE.get(d, 0.0))

for df_ in (train2, val, test):
    df_["source_score"] = df_["source_domain"].apply(source_score_for_domain)
    df_["p_true_source"] = df_["source_score"].apply(lambda z: 1/(1+np.exp(-z)))

display(train2[["source_domain","source_score","p_true_source"]].head(5))

Unnamed: 0,source_domain,source_score,p_true_source
0,facebook.com,0.0,0.5
1,facebook.com,0.0,0.5
2,facebook.com,0.0,0.5
4,facebook.com,0.0,0.5
5,facebook.com,0.0,0.5


## Load transformer models

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

def load_hf_binary_model(model_dir: Path):
    tok = AutoTokenizer.from_pretrained(model_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir)
    mdl.to(device)
    mdl.eval()
    return tok, mdl

ver_tok, ver_mdl = load_hf_binary_model(VERACITY_MODEL_DIR)
cb_tok, cb_mdl = load_hf_binary_model(CLICKBAIT_MODEL_DIR)

@torch.no_grad()
def predict_proba(tok, mdl, texts, batch_size: int = 16, max_length: int = 512):
    probs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Predicting", leave=False):
        batch = texts[i:i+batch_size]
        enc = tok(
            batch,
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors="pt",
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        out = mdl(**enc)
        logits = out.logits
        if logits.shape[-1] == 1:
            p1 = torch.sigmoid(logits).squeeze(-1)
        else:
            p1 = torch.softmax(logits, dim=-1)[:, 1]
        probs.extend(p1.detach().cpu().numpy().tolist())
    return np.array(probs, dtype=float)

train2["p_true_content"] = predict_proba(ver_tok, ver_mdl, train2["text_input_veracity"].tolist(), batch_size=16, max_length=512)
val["p_true_content"]    = predict_proba(ver_tok, ver_mdl, val["text_input_veracity"].tolist(), batch_size=16, max_length=512)
test["p_true_content"]   = predict_proba(ver_tok, ver_mdl, test["text_input_veracity"].tolist(), batch_size=16, max_length=512)

train2["p_clickbait"] = predict_proba(cb_tok, cb_mdl, train2["text_input_clickbait"].tolist(), batch_size=32, max_length=128)
val["p_clickbait"]    = predict_proba(cb_tok, cb_mdl, val["text_input_clickbait"].tolist(), batch_size=32, max_length=128)
test["p_clickbait"]   = predict_proba(cb_tok, cb_mdl, test["text_input_clickbait"].tolist(), batch_size=32, max_length=128)

display(train2[["p_true_content","p_clickbait","source_score","y"]].head(5))

Device: cuda


Predicting:   0%|          | 0/85 [00:00<?, ?it/s]

Predicting:   0%|          | 0/22 [00:00<?, ?it/s]

Predicting:   0%|          | 0/27 [00:00<?, ?it/s]

Predicting:   0%|          | 0/43 [00:00<?, ?it/s]

Predicting:   0%|          | 0/11 [00:00<?, ?it/s]

Predicting:   0%|          | 0/14 [00:00<?, ?it/s]

Unnamed: 0,p_true_content,p_clickbait,source_score,y
0,0.000636,0.155183,0.0,0.0
1,0.001183,0.091858,0.0,0.0
2,0.000743,0.14292,0.0,0.0
4,0.00067,0.13545,0.0,0.0
5,0.000734,0.136982,0.0,0.0


## Logistic regression + Grid Search

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score

def make_fusion_X(df: pd.DataFrame) -> pd.DataFrame:
    X = pd.DataFrame()
    X["logit_p_true_content"] = df["p_true_content"].apply(lambda p: logit(float(p)))
    X["logit_p_not_clickbait"] = df["p_clickbait"].apply(lambda p: logit(1.0 - float(p)))
    X["source_score"] = df["source_score"].astype(float)
    X["text_len"] = df["text_len"].astype(float)
    X["has_source"] = df["has_source_domain"].astype(float)
    return X

X_tr = make_fusion_X(train2)
y_tr = train2["y"].astype(int).values
X_va = make_fusion_X(val)
y_va = val["y"].astype(int).values
X_te = make_fusion_X(test)
y_te = test["y"].astype(int).values

pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        max_iter=5000,
        class_weight="balanced",
        solver="lbfgs",
    ))
])

grid = GridSearchCV(
    pipe,
    param_grid={"clf__C":[0.1,0.3,1.0,3.0,10.0]},
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_tr, y_tr)
fusion_model = grid.best_estimator_
print("Best params:", grid.best_params_)
print("Best CV f1_macro:", grid.best_score_)

p_va = fusion_model.predict_proba(X_va)[:,1]
pred_va = (p_va >= 0.5).astype(int)
print("VAL @0.5")
print(classification_report(y_va, pred_va, target_names=["FALSE","TRUE"]))

p_te = fusion_model.predict_proba(X_te)[:,1]
pred_te = (p_te >= 0.5).astype(int)
print("TEST @0.5")
print(classification_report(y_te, pred_te, target_names=["FALSE","TRUE"]))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params: {'clf__C': 0.1}
Best CV f1_macro: 0.9796593534486184
VAL @0.5
              precision    recall  f1-score   support

       FALSE       1.00      1.00      1.00       280
        TRUE       0.98      0.98      0.98        59

    accuracy                           0.99       339
   macro avg       0.99      0.99      0.99       339
weighted avg       0.99      0.99      0.99       339

TEST @0.5
              precision    recall  f1-score   support

       FALSE       1.00      0.99      0.99       337
        TRUE       0.95      0.99      0.97        87

    accuracy                           0.99       424
   macro avg       0.97      0.99      0.98       424
weighted avg       0.99      0.99      0.99       424



## Pick decision threshold (based on F1)

In [12]:
import numpy as np
from sklearn.metrics import classification_report, f1_score

def best_threshold(y_true, p_true):
    best_t, best_s = 0.5, -1
    for t in np.linspace(0.05, 0.95, 181):
        y_pred = (p_true >= t).astype(int)
        s = f1_score(y_true, y_pred, average="macro")
        if s > best_s:
            best_s, best_t = s, float(t)
    return best_t, best_s

t_star, s_star = best_threshold(y_va, p_va)
print("Best threshold (val, f1_macro):", t_star, "score:", s_star)

pred_te_star = (p_te >= t_star).astype(int)
print("TEST @t*")
print(classification_report(y_te, pred_te_star, target_names=["FALSE","TRUE"]))

Best threshold (val, f1_macro): 0.3549999999999999 score: 0.9897397094430993
TEST @t*
              precision    recall  f1-score   support

       FALSE       1.00      0.98      0.99       337
        TRUE       0.93      0.99      0.96        87

    accuracy                           0.98       424
   macro avg       0.97      0.99      0.98       424
weighted avg       0.98      0.98      0.98       424



In [13]:
import joblib, datetime, json

model_path = OUT_FUSION_DIR / "fusion_lr.joblib"
joblib.dump(fusion_model, model_path)

threshold_payload = {
    "threshold": t_star,
    "selected_on": "val",
    "metric": "f1_macro",
    "val_score": s_star,
    "created_at_utc": datetime.datetime.utcnow().isoformat(timespec="seconds"),
    "features": [
        "logit_p_true_content",
        "logit_p_not_clickbait",
        "source_score",
        "text_len",
        "has_source",
    ],
}
(OUT_FUSION_DIR / "fusion_threshold.json").write_text(
    json.dumps(threshold_payload, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

schema_payload = {
    "feature_order": threshold_payload["features"],
    "notes": "Features are scaled with StandardScaler. source_score is log-odds from the source prior table.",
}
(OUT_FUSION_DIR / "fusion_feature_schema.json").write_text(
    json.dumps(schema_payload, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

print("Saved:", model_path.resolve())
print("Saved:", (OUT_FUSION_DIR / "fusion_threshold.json").resolve())
print("Saved:", (OUT_FUSION_DIR / "fusion_feature_schema.json").resolve())

Saved: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\fusion\artifacts\fusion\fusion_lr.joblib
Saved: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\fusion\artifacts\fusion\fusion_threshold.json
Saved: D:\Programming\AI\AI-Self\NLP-FakeNews-Detection-Classifier\fusion\artifacts\fusion\fusion_feature_schema.json


## Helper

In [14]:
def fusion_predict_one(p_true_content: float, p_clickbait: float, source_score: float, text_len: int, has_source: int):
    X = pd.DataFrame([{
        "logit_p_true_content": logit(float(p_true_content)),
        "logit_p_not_clickbait": logit(1.0 - float(p_clickbait)),
        "source_score": float(source_score),
        "text_len": float(text_len),
        "has_source": float(has_source),
    }])
    return float(fusion_model.predict_proba(X)[:,1][0])

row = train2.iloc[0]
p = fusion_predict_one(
    row["p_true_content"], row["p_clickbait"], row["source_score"],
    int(row["text_len"]), int(row["has_source_domain"])
)
print("Example final_p_true:", p, "y:", int(row["y"]))

Example final_p_true: 0.012740228644084068 y: 0
