# Ensemble Meta-Learner: Stacked Generalization

Logistic regression over OOF probabilities from three complementary base learners:

| Model | Signal | Source |
|-------|--------|--------|
| **XGBoost** | Metadata & temporal | 35 features |
| **XLM-R** | Linguistic | Post-text [CLS] embeddings |
| **BotRGCN** | Relational | GNN on behavioural kNN graph |

**OOF protocol.** Every user's base-learner probability comes from a model that never saw that user during training (5-fold stratified CV for all three). The meta-learner trains on all 889 users; nested CV yields an unbiased performance estimate.

**Prerequisites:** run `bot_detector.ipynb`, `bot_detector_xlmr.ipynb`, and `bot_detector_rgcn.ipynb` first.


In [63]:
# === Setup ===
import json
import re
import os
import random
from pathlib import Path
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd
import torch

DATA_DIR = Path(".")
DATASET_IDS = (30, 31, 32, 33)
RANDOM_STATE = 42
K = 5  # number of CV folds
KNN_K = 15  # kNN neighbours per relation in GNN graph
GNN_HIDDEN = 128  # GNN hidden dimension
GNN_EPOCHS = 200  # GNN training epochs
ARTIFACTS_BASE_DIR = Path("artifacts") / "ensemble"


def set_global_seed(seed: int) -> None:
    """Best-effort deterministic seeding across numpy/python/torch."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def ensure_run_dir(seed: int, calibration_mode: str = "raw") -> Path:
    """Create a timestamped artifact directory for this run."""
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = ARTIFACTS_BASE_DIR / f"run_{ts}_seed{seed}_{calibration_mode}"
    run_dir.mkdir(parents=True, exist_ok=True)
    return run_dir


def resolve_best_xlmr_checkpoint(fold: int, phase_dir_root: Path = Path("xlmr_cv")) -> Path:
    """Resolve best checkpoint for a fold with robust fallback behavior."""
    fold_dir = phase_dir_root / f"fold{fold}_phase3"
    if not fold_dir.exists():
        raise FileNotFoundError(f"Missing XLM-R fold directory: {fold_dir}")

    trainer_states = sorted(fold_dir.glob("checkpoint-*/trainer_state.json"))
    if trainer_states:
        # Prefer latest trainer_state.json, then use its best_model_checkpoint.
        state_path = trainer_states[-1]
        with open(state_path) as f:
            trainer_state = json.load(f)
        best_ckpt = trainer_state.get("best_model_checkpoint")
        if best_ckpt:
            best_path = Path(best_ckpt)
            if best_path.exists():
                return best_path

    ckpt_dirs = sorted(
        [p for p in fold_dir.glob("checkpoint-*") if p.is_dir()],
        key=lambda p: int(p.name.split("-")[-1]),
    )
    if not ckpt_dirs:
        raise FileNotFoundError(
            f"No valid checkpoint directories found for fold {fold} in {fold_dir}"
        )
    return ckpt_dirs[-1]


def load_bot_ids(dataset_id: int) -> set:
    path = DATA_DIR / f"dataset.bots.{dataset_id}.txt"
    if not path.exists():
        return set()
    with open(path) as f:
        return set(line.strip() for line in f if line.strip())


def load_posts_and_users(dataset_id: int) -> dict:
    path = DATA_DIR / f"dataset.posts&users.{dataset_id}.json"
    with open(path) as f:
        return json.load(f)


set_global_seed(RANDOM_STATE)
print("Setup complete.")

Setup complete.


## 1. XGBoost Out-of-Fold Predictions

Reproduce the exact feature engineering from `bot_detector.ipynb`, then run 5-Fold
Stratified CV collecting **validation probabilities** from each fold.

*Feature pipeline must match the source notebook exactly; otherwise the OOF estimates are not comparable across runs.*

In [64]:
# XGBoost 5-Fold OOF Predictions
from collections import defaultdict
from scipy.stats import entropy as scipy_entropy
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import xgboost as xgb

def extract_user_features(posts, users, bot_ids):
    by_author = defaultdict(list)
    for p in posts:
        by_author[p["author_id"]].append(p)
    rows = []
    for author_id, author_posts in by_author.items():
        texts = [p["text"] for p in author_posts]
        created = [pd.to_datetime(p["created_at"]) for p in author_posts]
        row = {"author_id": author_id, "post_count": len(author_posts),
               "avg_text_length": np.mean([len(t) for t in texts]),
               "unique_langs": len(set(p.get("lang", "") for p in author_posts)),
               "has_url_ratio": np.mean([1 if "http" in t else 0 for t in texts]),
               "is_bot": 1 if author_id in bot_ids else 0}
        if len(created) > 1:
            created.sort()
            gaps = np.diff(created).astype("timedelta64[s]").astype(float)
            row["avg_post_gap_seconds"] = np.mean(gaps)
            row["min_post_gap_seconds"] = np.min(gaps)
        else:
            row["avg_post_gap_seconds"] = np.nan
            row["min_post_gap_seconds"] = np.nan
        rows.append(row)
    return pd.DataFrame(rows)

def levenshtein_distance(a, b):
    if len(a) < len(b):
        a, b = b, a
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a):
        curr = [i + 1]
        for j, cb in enumerate(b):
            substitution_cost = 0 if ca == cb else 1
            curr.append(
                min(
                    prev[j] + substitution_cost,
                    prev[j + 1] + 1,
                    curr[j] + 1,
                )
            )
        prev = curr
    return prev[-1]


def digit_density(s):
    return sum(ch.isdigit() for ch in s) / len(s) if s else 0.0


def iat_entropy(timestamps, n_bins=20):
    if len(timestamps) < 3:
        return np.nan
    gaps = np.diff(sorted(timestamps)).astype("timedelta64[s]").astype(float)
    counts, _ = np.histogram(gaps, bins=n_bins)
    return float(scipy_entropy(counts, base=2))


def burstiness(timestamps):
    if len(timestamps) < 3:
        return np.nan
    gaps = np.diff(sorted(timestamps)).astype("timedelta64[s]").astype(float)
    mu, sigma = np.mean(gaps), np.std(gaps, ddof=1)
    return (sigma - mu) / (sigma + mu) if (sigma + mu) != 0 else 0.0


def activity_vector(timestamps):
    vec = [0] * 24
    for ts in timestamps:
        vec[ts.hour] += 1
    return vec


def lang_mismatch_ratio(author_posts, expected_lang):
    if not author_posts:
        return 0.0
    mismatch_count = sum(
        1 for p in author_posts if p.get("lang", expected_lang) != expected_lang
    )
    return mismatch_count / len(author_posts)


def screen_name_entropy(screen_name, normalize=True):
    if not screen_name:
        return 0.0
    s = screen_name.strip().lower()
    if not s:
        return 0.0

    counts = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1

    n = len(s)
    ent = -sum((c / n) * np.log2(c / n) for c in counts.values())
    if normalize and len(counts) > 1:
        ent /= np.log2(len(counts))
    return ent

def build_feature_df(dataset_id):
    bot_ids = load_bot_ids(dataset_id)
    data = load_posts_and_users(dataset_id)
    posts, users = data["posts"], data["users"]
    dataset_lang = data.get("lang", "en")

    df = extract_user_features(posts, users, bot_ids)

    name_distance_map = {
        u["id"]: levenshtein_distance(
            (u.get("username", "") or "").lower(),
            (u.get("name", "") or "").lower(),
        )
        for u in users
    }
    digit_density_map = {
        u["id"]: digit_density(u.get("username", ""))
        for u in users
    }

    df["levenshtein_name_dist"] = df["author_id"].map(name_distance_map)
    df["digit_density"] = df["author_id"].map(digit_density_map)

    by_author_timestamps = defaultdict(list)
    for p in posts:
        by_author_timestamps[p["author_id"]].append(pd.to_datetime(p["created_at"]))

    df["iat_entropy"] = df["author_id"].map(
        {aid: iat_entropy(ts) for aid, ts in by_author_timestamps.items()}
    )
    df["burstiness"] = df["author_id"].map(
        {aid: burstiness(ts) for aid, ts in by_author_timestamps.items()}
    )

    activity_df = pd.DataFrame.from_dict(
        {aid: activity_vector(ts) for aid, ts in by_author_timestamps.items()},
        orient="index",
        columns=[f"hour_{h}" for h in range(24)],
    )
    activity_df.index.name = "author_id"
    activity_df = activity_df.reset_index()
    df = df.merge(activity_df, on="author_id", how="left")

    by_author_posts = defaultdict(list)
    for p in posts:
        by_author_posts[p["author_id"]].append(p)

    df["lang_mismatch_ratio"] = df["author_id"].map(
        {aid: lang_mismatch_ratio(ap, dataset_lang) for aid, ap in by_author_posts.items()}
    )
    df["screen_name_entropy"] = df["author_id"].map(
        {u["id"]: screen_name_entropy(u.get("username", "")) for u in users}
    )
    return df

# Build combined DataFrame
combined_df = pd.concat([build_feature_df(did) for did in DATASET_IDS], ignore_index=True)

exclude_cols = {"author_id", "is_bot", "min_post_gap_seconds"}
feature_cols = [c for c in combined_df.columns if c not in exclude_cols]

print(f"Combined: {len(combined_df)} users  ({combined_df['is_bot'].sum()} bots)")
print(f"Features: {len(feature_cols)}")

# XGBoost 5-Fold OOF
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=RANDOM_STATE)
X_all = combined_df[feature_cols].fillna(0)
y_all = combined_df["is_bot"].values
xgb_oof_probs = np.zeros(len(combined_df), dtype=np.float64)
xgb_oof_fold = np.full(len(combined_df), -1, dtype=np.int32)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_all, y_all), 1):
    X_tr, X_val = X_all.iloc[tr_idx], X_all.iloc[val_idx]
    y_tr, y_val = y_all[tr_idx], y_all[val_idx]
    fold_scale = (y_tr == 0).sum() / max(y_tr.sum(), 1)
    fold_model = xgb.XGBClassifier(n_estimators=500, max_depth=4, learning_rate=0.05,
                                    scale_pos_weight=fold_scale, colsample_bytree=0.5,
                                    eval_metric="logloss", early_stopping_rounds=30,
                                    random_state=RANDOM_STATE)
    fold_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    xgb_oof_probs[val_idx] = fold_model.predict_proba(X_val)[:, 1]
    xgb_oof_fold[val_idx] = fold
    f1 = f1_score(y_val, (xgb_oof_probs[val_idx] >= 0.5).astype(int), average="binary")
    print(f"Fold {fold}: F1={f1:.4f}  (val size={len(val_idx)})")

assert np.isfinite(xgb_oof_probs).all(), "XGBoost OOF contains non-finite values"
assert (xgb_oof_fold > 0).all(), "Some users are missing XGBoost OOF fold assignments"

xgb_oof_f1 = f1_score(y_all, (xgb_oof_probs >= 0.5).astype(int), average="binary")
print(f"\nXGBoost OOF F1 (all {len(combined_df)} users): {xgb_oof_f1:.4f}")

Combined: 889 users  (184 bots)
Features: 35
Fold 1: F1=0.9041  (val size=178)
Fold 2: F1=0.8358  (val size=178)
Fold 3: F1=0.8732  (val size=178)
Fold 4: F1=0.8219  (val size=178)
Fold 5: F1=0.8824  (val size=177)

XGBoost OOF F1 (all 889 users): 0.8636


## 2. XLM-R Out-of-Fold Predictions

Load the saved fold checkpoints from `xlmr_cv/fold{k}_phase3/` and predict on each
fold's validation users. Apply softmax to convert logits → P(bot).

*Each fold uses its own best checkpoint (by val loss); we predict only on that fold’s held-out users. No model ever sees its prediction targets during training.*

In [65]:
# XLM-R OOF: Build text DataFrame + Load Fold Checkpoints ===
import emoji
import gc
from transformers import XLMRobertaTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from torch.utils.data import DataLoader
from scipy.special import softmax

MODEL_NAME = "xlm-roberta-base"
MAX_LENGTH = 512
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding="max_length")

def preprocess_text(text):
    if not text or not isinstance(text, str):
        return ""
    t = text.strip()
    t = emoji.demojize(t, delimiters=(":", ":"))
    t = re.sub(r"https?://\S+", "<URL>", t)
    t = re.sub(r"@[A-Za-z0-9_]+", "<USER>", t)
    return t

def build_user_text_df(dataset_id):
    bot_ids = load_bot_ids(dataset_id)
    data = load_posts_and_users(dataset_id)
    by_author = defaultdict(list)
    for p in data["posts"]:
        by_author[p["author_id"]].append(preprocess_text(p.get("text", "") or ""))
    rows = []
    for aid, texts in by_author.items():
        text = " [SEP] ".join(t for t in texts if t)
        rows.append({"author_id": aid, "text": text if text else "", "is_bot": 1 if aid in bot_ids else 0})
    return pd.DataFrame(rows)

text_combined_df = pd.concat([build_user_text_df(did) for did in DATASET_IDS], ignore_index=True)

# Keep fold ordering identical to the original XLM-R export to avoid
# checkpoint/fold mismatch and label leakage.
_xlmr_npz = np.load("rgcn_features/xlmr_features.npz", allow_pickle=True)
canonical_xlmr_aids = list(_xlmr_npz["author_ids"])
assert set(canonical_xlmr_aids) == set(text_combined_df["author_id"].unique()), \
    "User set mismatch between XLMR export and ensemble text data"
bot_lookup = text_combined_df.drop_duplicates("author_id").set_index("author_id")["is_bot"]
cv_user_ids = pd.DataFrame({
    "author_id": canonical_xlmr_aids,
    "is_bot": [int(bot_lookup[a]) for a in canonical_xlmr_aids],
})
print(f"XLMR fold alignment: using canonical ordering from xlmr_features.npz ({len(cv_user_ids)} users)")

device = torch.device("mps" if torch.backends.mps.is_available()
                      else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Collect OOF probabilities
skf_xlmr = StratifiedKFold(n_splits=K, shuffle=True, random_state=RANDOM_STATE)
xlmr_oof_probs = np.full(len(text_combined_df), np.nan, dtype=np.float64)
xlmr_oof_fold = np.full(len(text_combined_df), -1, dtype=np.int32)

for fold, (tr_idx, val_idx) in enumerate(skf_xlmr.split(cv_user_ids, cv_user_ids["is_bot"]), 1):
    fold_val_ids = set(cv_user_ids.iloc[val_idx]["author_id"])
    fold_val_mask = text_combined_df["author_id"].isin(fold_val_ids)
    fold_val_df = text_combined_df[fold_val_mask].reset_index(drop=True)

    best_ckpt = resolve_best_xlmr_checkpoint(fold)
    print(f"Fold {fold}: Loading {best_ckpt}")

    fold_model = AutoModelForSequenceClassification.from_pretrained(best_ckpt, num_labels=2)
    fold_model = fold_model.to(device)
    fold_model.eval()

    fold_val_ds = Dataset.from_pandas(fold_val_df[["text"]].reset_index(drop=True))
    fold_val_ds = fold_val_ds.map(tokenize, batched=True, remove_columns=["text"])
    fold_val_ds.set_format("torch")

    fold_logits = []
    with torch.no_grad():
        for batch in DataLoader(fold_val_ds, batch_size=8, shuffle=False):
            batch = {k: v.to(device) for k, v in batch.items()}
            fold_logits.append(fold_model(**batch).logits.cpu().numpy())

    fold_logits = np.concatenate(fold_logits, axis=0)
    fold_probs = softmax(fold_logits, axis=1)[:, 1]

    val_positions = np.where(fold_val_mask.values)[0]
    xlmr_oof_probs[val_positions] = fold_probs
    xlmr_oof_fold[val_positions] = fold

    f1 = f1_score(fold_val_df["is_bot"].values, (fold_probs >= 0.5).astype(int), average="binary")
    print(f"  Fold {fold} F1={f1:.4f}  (val size={len(fold_val_df)})")

    del fold_model, fold_logits, fold_val_ds
    gc.collect()
    if device.type == "mps": torch.mps.empty_cache()

assert not np.isnan(xlmr_oof_probs).any(), "Some users have no XLM-R OOF prediction!"
assert (xlmr_oof_fold > 0).all(), "Some users are missing XLM-R OOF fold assignments"
xlmr_oof_f1 = f1_score(text_combined_df["is_bot"].values,
                        (xlmr_oof_probs >= 0.5).astype(int), average="binary")
print(f"\nXLM-R OOF F1 (all {len(text_combined_df)} users): {xlmr_oof_f1:.4f}")

XLMR fold alignment: using canonical ordering from xlmr_features.npz (889 users)
Device: mps
Fold 1: Loading xlmr_cv/fold1_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1706.62it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Map: 100%|██████████| 178/178 [00:00<00:00, 1762.79 examples/s]


  Fold 1 F1=0.8947  (val size=178)
Fold 2: Loading xlmr_cv/fold2_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1746.87it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Map: 100%|██████████| 178/178 [00:00<00:00, 2143.18 examples/s]


  Fold 2 F1=0.9189  (val size=178)
Fold 3: Loading xlmr_cv/fold3_phase3/checkpoint-135


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1736.12it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Map: 100%|██████████| 178/178 [00:00<00:00, 2050.20 examples/s]


  Fold 3 F1=0.9610  (val size=178)
Fold 4: Loading xlmr_cv/fold4_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1741.14it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Map: 100%|██████████| 178/178 [00:00<00:00, 2033.84 examples/s]


  Fold 4 F1=0.9114  (val size=178)
Fold 5: Loading xlmr_cv/fold5_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1749.36it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Map: 100%|██████████| 177/177 [00:00<00:00, 2122.32 examples/s]


  Fold 5 F1=0.8485  (val size=177)

XLM-R OOF F1 (all 889 users): 0.9086


## 3. BotRGCN Out-of-Fold Predictions (K-Fold Transductive)

All 889 nodes stay in the graph (message passing needs the full topology).
For each fold we rotate which nodes compute loss **and rebuild the kNN graph**
so that val nodes connect only to that fold's train nodes. Each node gets a
prediction from a GNN that **never optimized for it**.

**Leakage hardening in this section:**
- **Fold-safe feature scaling:** metadata normalization is fit on each fold's train nodes only, then applied to all nodes for that fold.
- **No outer-val model selection:** we do not early-stop or checkpoint-pick using outer validation labels; each fold trains for a fixed schedule and predicts once.

In [66]:
# BotRGCN: Build node features + Precompute graph components ===
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv
from torch_geometric.data import Data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from dateutil import parser as dtparser

# Load exported arrays with aligned user IDs and labels.
xgb_data = np.load("rgcn_features/xgb_features.npz", allow_pickle=True)
xlmr_data = np.load("rgcn_features/xlmr_features.npz", allow_pickle=True)

xgb_aids = list(xgb_data["author_ids"])
xlmr_aids = list(xlmr_data["author_ids"])
xlmr_vecs = xlmr_data["xlmr_feature_vectors"]
is_bot_raw = xgb_data["is_bot"]

# Canonical node order shared across all inputs.
author_ids = sorted(set(xgb_aids) & set(xlmr_aids))
xgb_idx = {aid: i for i, aid in enumerate(xgb_aids)}
xlmr_idx = {aid: i for i, aid in enumerate(xlmr_aids)}
xlmr_feature_vectors = np.array(
    [xlmr_vecs[xlmr_idx[a]] for a in author_ids],
    dtype=np.float32,
)
is_bot = np.array([int(is_bot_raw[xgb_idx[a]]) for a in author_ids])

# Use raw metadata features from the XGBoost pipeline and apply fold-wise scaling later.
xgb_raw = combined_df.drop_duplicates("author_id").set_index("author_id")
xgb_feature_vectors = np.array(
    [xgb_raw.loc[a, feature_cols].values for a in author_ids],
    dtype=np.float32,
)
xgb_feature_vectors = np.nan_to_num(xgb_feature_vectors, 0.0)

num_users = xgb_feature_vectors.shape[0]
feature_dim = xgb_feature_vectors.shape[1] + xlmr_feature_vectors.shape[1]
author_id_to_idx = {aid: i for i, aid in enumerate(author_ids)}

print(f"Loaded {num_users} users, feature dim = {feature_dim}")
print(f"  Metadata features: {xgb_feature_vectors.shape[1]}d (raw; fold-scaled in Cell 8)")
print(f"  XLMR embeddings:   {xlmr_feature_vectors.shape[1]}d (OOF [CLS])")

# --- Precompute similarity matrices + explicit edges (fold-independent) ---
# kNN edge construction is deferred to Cell 8 (per-fold rebuild).

user_hashtags, user_hours, username_to_id = defaultdict(list), defaultdict(list), {}
for did in DATASET_IDS:
    data_raw = load_posts_and_users(did)
    username_to_id.update({u.get("username", "").lower(): u["id"]
                           for u in data_raw["users"] if u.get("username")})
    for p in data_raw["posts"]:
        aid = p["author_id"]
        if aid not in author_id_to_idx: continue
        text = p.get("text", "") or ""
        user_hashtags[aid].extend(t.lower() for t in re.findall(r"#([A-Za-z0-9_]+)", text))
        try: user_hours[aid].append(dtparser.isoparse(p["created_at"]).hour)
        except Exception: pass

def knn_edges_train_targets(sim_matrix, k, train_mask):
    """Directed kNN where every target is a train node (no val->val edges)."""
    src, tgt = [], []
    for i in range(sim_matrix.shape[0]):
        sims = sim_matrix[i].copy()
        sims[i] = -1; sims[~train_mask] = -1
        n_pos = (sims > 0).sum()
        if n_pos == 0: continue
        top_k = np.argpartition(sims, -min(k, n_pos))[-min(k, n_pos):]
        for j in top_k:
            if sims[j] > 0: src.append(i); tgt.append(j)
    return src, tgt

# Precompute similarity matrices (label-free, reused across folds)
ht_docs = [" ".join(user_hashtags[aid]) if user_hashtags[aid] else "" for aid in author_ids]
ht_sim = cosine_similarity(TfidfVectorizer(token_pattern=r"[a-z0-9_]+").fit_transform(ht_docs))

hour_hist = np.zeros((num_users, 24), dtype=np.float32)
for idx, aid in enumerate(author_ids):
    for h in user_hours[aid]: hour_hist[idx, h] += 1
norms = np.linalg.norm(hour_hist, axis=1, keepdims=True); norms[norms == 0] = 1
time_sim = (hour_hist / norms) @ (hour_hist / norms).T

# Precompute explicit edges (label-free, fixed across folds)
mention_src, mention_tgt, bio_src, bio_tgt = [], [], [], []
for did in DATASET_IDS:
    data_raw = load_posts_and_users(did)
    for p in data_raw["posts"]:
        sid = p["author_id"]
        if sid not in author_id_to_idx: continue
        for m in re.findall(r"@([A-Za-z0-9_]+)", p.get("text", "")):
            tid = username_to_id.get(m.lower())
            if tid and tid in author_id_to_idx and tid != sid:
                mention_src.append(author_id_to_idx[sid]); mention_tgt.append(author_id_to_idx[tid])
    for u in data_raw["users"]:
        sid = u["id"]
        if sid not in author_id_to_idx: continue
        for m in re.findall(r"@([A-Za-z0-9_]+)", u.get("description", "") or ""):
            tid = username_to_id.get(m.lower())
            if tid and tid in author_id_to_idx and tid != sid:
                bio_src.append(author_id_to_idx[sid]); bio_tgt.append(author_id_to_idx[tid])

NUM_RELATIONS = 4
print(f"Loaded {num_users} users, feature dim = {feature_dim}")
print(f"  Metadata features: {xgb_feature_vectors.shape[1]}d (raw; fold-scaled in Cell 8)")
print(f"  XLMR embeddings:   {xlmr_feature_vectors.shape[1]}d (OOF [CLS])")
print(f"Precomputed: ht_sim {ht_sim.shape}, time_sim {time_sim.shape}")
print(f"Explicit edges: {len(mention_src)} mentions, {len(bio_src)} bio-links")
print(f"kNN graph will be rebuilt per fold in Cell 8")


Loaded 889 users, feature dim = 803
  Metadata features: 35d (raw; fold-scaled in Cell 8)
  XLMR embeddings:   768d (OOF [CLS])
Loaded 889 users, feature dim = 803
  Metadata features: 35d (raw; fold-scaled in Cell 8)
  XLMR embeddings:   768d (OOF [CLS])
Precomputed: ht_sim (889, 889), time_sim (889, 889)
Explicit edges: 1 mentions, 2 bio-links
kNN graph will be rebuilt per fold in Cell 8


In [67]:
# === BotRGCN K-Fold OOF Predictions ===

class BotRGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim=64, out_dim=2, num_relations=2, dropout=0.3):
        super().__init__()
        self.conv1 = RGCNConv(in_dim, hidden_dim, num_relations=num_relations)
        self.conv2 = RGCNConv(hidden_dim, hidden_dim, num_relations=num_relations)
        self.classifier = nn.Linear(hidden_dim, out_dim)
        self.dropout = dropout
    def forward(self, x, edge_index, edge_type):
        x = F.dropout(F.relu(self.conv1(x, edge_index, edge_type)), p=self.dropout, training=self.training)
        x = F.dropout(F.relu(self.conv2(x, edge_index, edge_type)), p=self.dropout, training=self.training)
        return self.classifier(x)

def edge_dropout(ei, et, drop_rate=0.25):
    if drop_rate <= 0:
        return ei, et
    keep = torch.rand(ei.size(1), device=ei.device) >= drop_rate
    return ei[:, keep], et[keep]


def compute_class_weights(y, mask, dev):
    yt = y[mask]
    n = yt.size(0)
    nc = yt.unique().size(0)
    w = torch.ones(nc, device=dev)
    for c in range(nc):
        w[c] = n / (nc * (yt == c).sum().float().clamp(min=1))
    return w

rgcn_device = torch.device("mps" if torch.backends.mps.is_available()
                            else "cuda" if torch.cuda.is_available() else "cpu")
print(f"GNN device: {rgcn_device}")

y_gnn = torch.tensor(is_bot, dtype=torch.long)

# --- K-Fold OOF loop ---
skf_gnn = StratifiedKFold(n_splits=K, shuffle=True, random_state=RANDOM_STATE)
gnn_oof_probs = np.zeros(num_users, dtype=np.float64)
gnn_oof_fold = np.full(num_users, -1, dtype=np.int32)

for fold, (tr_idx, val_idx) in enumerate(skf_gnn.split(np.arange(num_users), is_bot), 1):
    fold_train_mask = torch.zeros(num_users, dtype=torch.bool)
    fold_train_mask[tr_idx] = True
    fold_val_mask = torch.zeros(num_users, dtype=torch.bool)
    fold_val_mask[val_idx] = True

    # Fit metadata scaler on this fold's train users only.
    fold_scaler = StandardScaler()
    fold_scaler.fit(xgb_feature_vectors[tr_idx])
    xgb_scaled_fold = fold_scaler.transform(xgb_feature_vectors).astype(np.float32)

    fold_node_features = np.concatenate(
        [xgb_scaled_fold, xlmr_feature_vectors],
        axis=1,
    ).astype(np.float32)
    fold_x = torch.from_numpy(fold_node_features)

    # Rebuild kNN edges each fold so validation nodes only point to train targets.
    fold_train_mask_np = np.zeros(num_users, dtype=bool)
    fold_train_mask_np[tr_idx] = True

    ht_src, ht_tgt = knn_edges_train_targets(ht_sim, KNN_K, fold_train_mask_np)
    ts_src, ts_tgt = knn_edges_train_targets(time_sim, KNN_K, fold_train_mask_np)

    # Fold-gate mention and bio edges: restrict targets to train nodes
    # (consistent with kNN edge handling to prevent val->val leakage).
    fold_mention_src = [s for s, t in zip(mention_src, mention_tgt) if fold_train_mask_np[t]]
    fold_mention_tgt = [t for s, t in zip(mention_src, mention_tgt) if fold_train_mask_np[t]]
    fold_bio_src = [s for s, t in zip(bio_src, bio_tgt) if fold_train_mask_np[t]]
    fold_bio_tgt = [t for s, t in zip(bio_src, bio_tgt) if fold_train_mask_np[t]]

    all_src = ht_src + ts_src + fold_mention_src + fold_bio_src
    all_tgt = ht_tgt + ts_tgt + fold_mention_tgt + fold_bio_tgt
    all_rel = (
        [0] * len(ht_src)
        + [1] * len(ts_src)
        + [2] * len(fold_mention_src)
        + [3] * len(fold_bio_src)
    )

    fold_edge_index = torch.tensor([all_src, all_tgt], dtype=torch.long)
    fold_edge_type = torch.tensor(all_rel, dtype=torch.long)
    print(f"  Fold {fold} graph: {fold_edge_index.size(1)} edges")

    fold_data = Data(
        x=fold_x.to(rgcn_device), edge_index=fold_edge_index.to(rgcn_device),
        edge_type=fold_edge_type.to(rgcn_device), y=y_gnn.clone().to(rgcn_device),
        train_mask=fold_train_mask.to(rgcn_device), val_mask=fold_val_mask.to(rgcn_device))

    fold_gnn = BotRGCN(in_dim=feature_dim, hidden_dim=GNN_HIDDEN, out_dim=2,
                        num_relations=NUM_RELATIONS, dropout=0.3).to(rgcn_device)
    cw = compute_class_weights(fold_data.y, fold_data.train_mask, rgcn_device)
    loss_fn = nn.CrossEntropyLoss(weight=cw)
    optimizer = torch.optim.Adam(fold_gnn.parameters(), lr=0.01, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=GNN_EPOCHS)

    for epoch in range(1, GNN_EPOCHS + 1):
        fold_gnn.train()
        optimizer.zero_grad()

        ei, et = edge_dropout(fold_data.edge_index, fold_data.edge_type, 0.25)
        logits_train = fold_gnn(fold_data.x, ei, et)[fold_data.train_mask]
        labels_train = fold_data.y[fold_data.train_mask]
        loss = loss_fn(logits_train, labels_train)

        loss.backward()
        optimizer.step()
        scheduler.step()

    # No outer-val checkpoint selection: evaluate exactly once after fixed training.
    fold_gnn.eval()
    with torch.no_grad():
        logits = fold_gnn(fold_data.x, fold_data.edge_index, fold_data.edge_type)
        probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
        preds = logits.argmax(dim=1).cpu().numpy()

    gnn_oof_probs[val_idx] = probs[val_idx]
    gnn_oof_fold[val_idx] = fold
    val_acc = (preds[val_idx] == is_bot[val_idx]).mean()
    val_f1 = f1_score(is_bot[val_idx], (probs[val_idx] >= 0.5).astype(int), average="binary")
    print(f"Fold {fold}: Val Acc={val_acc:.4f}  Val F1={val_f1:.4f}  (val size={len(val_idx)})")

    del fold_gnn, fold_data
    if rgcn_device.type == "mps": torch.mps.empty_cache()

assert np.isfinite(gnn_oof_probs).all(), "BotRGCN OOF contains non-finite values"
assert (gnn_oof_fold > 0).all(), "Some users are missing BotRGCN OOF fold assignments"

gnn_oof_f1 = f1_score(is_bot, (gnn_oof_probs >= 0.5).astype(int), average="binary")
print(f"\nBotRGCN OOF F1 (all {num_users} users): {gnn_oof_f1:.4f}")


GNN device: mps
  Fold 1 graph: 17133 edges
Fold 1: Val Acc=0.9663  Val F1=0.9143  (val size=178)
  Fold 2 graph: 17129 edges
Fold 2: Val Acc=0.9607  Val F1=0.9136  (val size=178)
  Fold 3 graph: 16987 edges
Fold 3: Val Acc=0.9551  Val F1=0.8889  (val size=178)
  Fold 4 graph: 17099 edges
Fold 4: Val Acc=0.9663  Val F1=0.9231  (val size=178)
  Fold 5 graph: 17089 edges
Fold 5: Val Acc=0.9548  Val F1=0.8947  (val size=177)

BotRGCN OOF F1 (all 889 users): 0.9072


In [68]:
# === OOF Integrity Checks + Utility Functions ===
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    log_loss,
    brier_score_loss,
)


def assert_oof_integrity(
    author_ids_seq,
    labels,
    xgb_probs,
    xlmr_probs,
    gnn_probs,
    xgb_folds,
    xlmr_folds,
    gnn_folds,
):
    n = len(author_ids_seq)
    assert len(set(author_ids_seq)) == n, "Duplicate author_id entries in canonical order"
    assert len(labels) == n, "Label length mismatch"

    for name, arr in [("XGB", xgb_probs), ("XLMR", xlmr_probs), ("GNN", gnn_probs)]:
        assert len(arr) == n, f"{name} probability length mismatch"
        assert np.isfinite(arr).all(), f"{name} contains non-finite probabilities"
        assert ((arr >= 0.0) & (arr <= 1.0)).all(), f"{name} contains out-of-range probabilities"

    for name, arr in [("XGB", xgb_folds), ("XLMR", xlmr_folds), ("GNN", gnn_folds)]:
        assert len(arr) == n, f"{name} fold-id length mismatch"
        assert (arr > 0).all(), f"{name} has missing fold assignments"


def best_f1_threshold(y_true, probs, grid=None):
    if grid is None:
        grid = np.linspace(0.05, 0.95, 181)
    best_t, best_f1 = 0.5, -1.0
    for t in grid:
        f1 = f1_score(y_true, (probs >= t).astype(int), average="binary")
        if f1 > best_f1:
            best_f1 = f1
            best_t = float(t)
    return best_t, float(best_f1)


def oof_metrics_at_threshold(y_true, probs, threshold):
    preds = (probs >= threshold).astype(int)
    return {
        "threshold": float(threshold),
        "f1": float(f1_score(y_true, preds, average="binary")),
        "precision": float(precision_score(y_true, preds, average="binary", zero_division=0)),
        "recall": float(recall_score(y_true, preds, average="binary", zero_division=0)),
        "accuracy": float(accuracy_score(y_true, preds)),
    }



def calibrate_probs_cv(y_true, probs, seed=RANDOM_STATE, methods=("sigmoid", "isotonic")):
    """Leakage-safe calibration via nested CV over existing OOF scores."""
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    best = {
        "method": "raw",
        "probs": probs.copy(),
        "log_loss": float(log_loss(y_true, np.clip(probs, 1e-6, 1 - 1e-6))),
        "brier": float(brier_score_loss(y_true, probs)),
    }

    for method in methods:
        cal_probs = np.zeros_like(probs)
        for tr_idx, va_idx in inner_cv.split(np.zeros(len(y_true)), y_true):
            p_tr = np.clip(probs[tr_idx], 1e-6, 1 - 1e-6)
            y_tr = y_true[tr_idx]
            p_va = np.clip(probs[va_idx], 1e-6, 1 - 1e-6)

            if method == "sigmoid":
                # Platt scaling on a single score feature.
                cal = LogisticRegression(solver="lbfgs", max_iter=1000)
                cal.fit(p_tr.reshape(-1, 1), y_tr)
                cal_probs[va_idx] = cal.predict_proba(p_va.reshape(-1, 1))[:, 1]
            elif method == "isotonic":
                cal = IsotonicRegression(out_of_bounds="clip")
                cal.fit(p_tr, y_tr)
                cal_probs[va_idx] = cal.transform(p_va)
            else:
                raise ValueError(f"Unsupported calibration method: {method}")

        ll = float(log_loss(y_true, np.clip(cal_probs, 1e-6, 1 - 1e-6)))
        br = float(brier_score_loss(y_true, cal_probs))
        if ll < best["log_loss"]:
            best = {"method": method, "probs": cal_probs, "log_loss": ll, "brier": br}

    return best


# `assert_oof_integrity` is called after `meta_df` creation so every array
# is checked in the same canonical user order.
print("Utility functions defined.")

Utility functions defined.


## 4. Build & Train the Meta-Learner

All three models now have honest OOF predictions for **all 889 users**.
The meta-learner trains on the full dataset with nested CV for unbiased evaluation.

*Nested CV ensures we report meta-learner performance on held-out predictions; the reported F1 is a fair estimate of deployment performance.*

In [69]:
# === Build Meta-Learner on ALL users: calibration + thresholds + artifacts ===
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import (
    f1_score, accuracy_score, precision_score, recall_score,
    classification_report, confusion_matrix
)


def build_meta_features(X_raw):
    """Augment 3 base-learner probs with pairwise interactions and disagreement."""
    return np.column_stack([
        X_raw,
        X_raw[:, 0] * X_raw[:, 1],   # XGB * XLMR
        X_raw[:, 0] * X_raw[:, 2],   # XGB * GNN
        X_raw[:, 1] * X_raw[:, 2],   # XLMR * GNN
        np.max(X_raw, axis=1),        # max agreement
        np.std(X_raw, axis=1),        # disagreement signal
    ])

# Map author_id → OOF probabilities and fold ids
xgb_aid_to_prob = dict(zip(combined_df["author_id"], xgb_oof_probs))
xlmr_aid_to_prob = dict(zip(text_combined_df["author_id"], xlmr_oof_probs))
gnn_aid_to_prob = dict(zip(author_ids, gnn_oof_probs))

xgb_aid_to_fold = dict(zip(combined_df["author_id"], xgb_oof_fold))
xlmr_aid_to_fold = dict(zip(text_combined_df["author_id"], xlmr_oof_fold))
gnn_aid_to_fold = dict(zip(author_ids, gnn_oof_fold))

meta_rows = []
for aid in author_ids:
    xgb_p = xgb_aid_to_prob.get(aid)
    xlmr_p = xlmr_aid_to_prob.get(aid)
    gnn_p = gnn_aid_to_prob.get(aid)
    if xgb_p is not None and xlmr_p is not None and gnn_p is not None:
        meta_rows.append({
            "User_ID": aid,
            "XGB_Prob": float(xgb_p),
            "XLMR_Prob": float(xlmr_p),
            "GNN_Prob": float(gnn_p),
            "XGB_Fold": int(xgb_aid_to_fold[aid]),
            "XLMR_Fold": int(xlmr_aid_to_fold[aid]),
            "GNN_Fold": int(gnn_aid_to_fold[aid]),
            "True_Label": int(is_bot[author_id_to_idx[aid]]),
        })

meta_df = pd.DataFrame(meta_rows)
assert len(meta_df) == len(author_ids), "Meta DataFrame missing users due alignment issues"
assert meta_df["User_ID"].is_unique, "Meta DataFrame has duplicate users"
y_meta = meta_df["True_Label"].values

# OOF integrity check on properly aligned columns (all keyed by meta_df row order)
assert_oof_integrity(
    meta_df["User_ID"].tolist(),
    meta_df["True_Label"].values,
    meta_df["XGB_Prob"].values,
    meta_df["XLMR_Prob"].values,
    meta_df["GNN_Prob"].values,
    meta_df["XGB_Fold"].values,
    meta_df["XLMR_Fold"].values,
    meta_df["GNN_Fold"].values,
)
print("OOF integrity checks passed (aligned via meta_df).")

print(f"Meta-learner DataFrame: {len(meta_df)} users  (all OOF)")
print(f"  Bots:   {(meta_df['True_Label'] == 1).sum()}")
print(f"  Humans: {(meta_df['True_Label'] == 0).sum()}")

# --- Calibration candidates (leakage-safe CV-on-OOF) ---
calibration_results = {
    "XGB": calibrate_probs_cv(y_meta, meta_df["XGB_Prob"].values, seed=RANDOM_STATE),
    "XLMR": calibrate_probs_cv(y_meta, meta_df["XLMR_Prob"].values, seed=RANDOM_STATE),
    "GNN": calibrate_probs_cv(y_meta, meta_df["GNN_Prob"].values, seed=RANDOM_STATE),
}

meta_df["XGB_Prob_Cal"] = calibration_results["XGB"]["probs"]
meta_df["XLMR_Prob_Cal"] = calibration_results["XLMR"]["probs"]
meta_df["GNN_Prob_Cal"] = calibration_results["GNN"]["probs"]

print("\n--- Calibration selection by log-loss (CV-on-OOF) ---")
for key in ["XGB", "XLMR", "GNN"]:
    res = calibration_results[key]
    print(f"  {key:>4s}: method={res['method']:<8s}  log_loss={res['log_loss']:.5f}  brier={res['brier']:.5f}")


# --- Evaluate raw vs calibrated stack with nested CV + C tuning ---
def eval_meta_stack(X_meta, y_true, seed):
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    probs = np.zeros(len(y_true), dtype=np.float64)
    best_Cs = []
    for train_idx, val_idx in outer_cv.split(X_meta, y_true):
        inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
        grid = GridSearchCV(
            LogisticRegression(solver="lbfgs", max_iter=1000),
            param_grid={"C": [0.01, 0.1, 1.0, 10.0]},
            cv=inner_cv, scoring="f1", refit=True,
        )
        grid.fit(X_meta[train_idx], y_true[train_idx])
        probs[val_idx] = grid.predict_proba(X_meta[val_idx])[:, 1]
        best_Cs.append(grid.best_params_["C"])
    preds = (probs >= 0.5).astype(int)
    # Use the most common C across outer folds as the representative best C.
    from collections import Counter
    best_C = Counter(best_Cs).most_common(1)[0][0]
    return {
        "probs": probs,
        "preds": preds,
        "f1": float(f1_score(y_true, preds, average="binary")),
        "accuracy": float(accuracy_score(y_true, preds)),
        "precision": float(precision_score(y_true, preds, average="binary", zero_division=0)),
        "recall": float(recall_score(y_true, preds, average="binary", zero_division=0)),
        "best_C": float(best_C),
    }


X_meta_raw = build_meta_features(meta_df[["XGB_Prob", "XLMR_Prob", "GNN_Prob"]].values)
X_meta_cal = build_meta_features(meta_df[["XGB_Prob_Cal", "XLMR_Prob_Cal", "GNN_Prob_Cal"]].values)

raw_eval = eval_meta_stack(X_meta_raw, y_meta, RANDOM_STATE)
cal_eval = eval_meta_stack(X_meta_cal, y_meta, RANDOM_STATE)

best_variant_by_cv = "calibrated" if cal_eval["f1"] >= raw_eval["f1"] else "raw"

# Let nested CV decide which variant to use (calibrated if it wins or ties).
selected_variant = best_variant_by_cv
if selected_variant == "calibrated":
    X_meta = X_meta_cal
    meta_cv_probs = cal_eval["probs"]
    meta_cv_preds = cal_eval["preds"]
    stack_eval = cal_eval
else:
    X_meta = X_meta_raw
    meta_cv_probs = raw_eval["probs"]
    meta_cv_preds = raw_eval["preds"]
    stack_eval = raw_eval

print("\n--- Stack variant selection (by nested-CV F1) ---")
print(f"  raw F1={raw_eval['f1']:.4f} (C={raw_eval['best_C']}) | calibrated F1={cal_eval['f1']:.4f} (C={cal_eval['best_C']})")
print(f"  Selected variant: {selected_variant}")

# --- Threshold optimization on OOF probabilities ---
thresholds = {}
metrics_default = {}
metrics_opt = {}

base_prob_cols = [
    ("XGBoost", "XGB_Prob" if selected_variant == "raw" else "XGB_Prob_Cal"),
    ("XLM-R", "XLMR_Prob" if selected_variant == "raw" else "XLMR_Prob_Cal"),
    ("BotRGCN", "GNN_Prob" if selected_variant == "raw" else "GNN_Prob_Cal"),
]

for name, col in base_prob_cols:
    probs = meta_df[col].values
    t_opt, _ = best_f1_threshold(y_meta, probs)
    thresholds[name] = t_opt
    metrics_default[name] = oof_metrics_at_threshold(y_meta, probs, 0.5)
    metrics_opt[name] = oof_metrics_at_threshold(y_meta, probs, t_opt)

ensemble_t_opt, _ = best_f1_threshold(y_meta, meta_cv_probs)
thresholds["Ensemble"] = ensemble_t_opt
metrics_default["Ensemble"] = oof_metrics_at_threshold(y_meta, meta_cv_probs, 0.5)
metrics_opt["Ensemble"] = oof_metrics_at_threshold(y_meta, meta_cv_probs, ensemble_t_opt)

print("\n" + "=" * 72)
print("  Meta-Learner Nested CV Performance")
print("=" * 72)
print(f"  Accuracy:  {stack_eval['accuracy']:.4f}")
print(f"  F1 (bot):  {stack_eval['f1']:.4f}")
print(f"  Precision: {stack_eval['precision']:.4f}")
print(f"  Recall:    {stack_eval['recall']:.4f}")

print("\n--- Threshold comparison (0.5 vs optimized) ---")
for key in ["XGBoost", "XLM-R", "BotRGCN", "Ensemble"]:
    d = metrics_default[key]
    o = metrics_opt[key]
    print(
        f"  {key:>8s}: F1@0.5={d['f1']:.4f} | F1@opt={o['f1']:.4f} "
        f"(t={thresholds[key]:.3f})"
    )

best_individual_f1 = max(metrics_opt["XGBoost"]["f1"], metrics_opt["XLM-R"]["f1"], metrics_opt["BotRGCN"]["f1"])
ensemble_f1 = metrics_opt["Ensemble"]["f1"]
print(f"\n  Ensemble F1@opt: {ensemble_f1:.4f}")
print(f"  Best Single@opt: {best_individual_f1:.4f}")
print(f"  Improvement: {ensemble_f1 - best_individual_f1:+.4f}")

# --- Persist core artifacts ---
run_dir = ensure_run_dir(seed=RANDOM_STATE, calibration_mode=selected_variant)
meta_df_out = meta_df.copy()
meta_df_out["Ensemble_Prob_OOF"] = meta_cv_probs
meta_df_out.to_csv(run_dir / "oof_predictions.csv", index=False)

with open(run_dir / "thresholds.json", "w") as f:
    json.dump({k: float(v) for k, v in thresholds.items()}, f, indent=2)

run_config = {
    "seed": RANDOM_STATE,
    "cv_folds": K,
    "selected_variant": selected_variant,
    "best_variant_by_cv": best_variant_by_cv,
    "calibration_methods": {k: v["method"] for k, v in calibration_results.items()},
    "metrics_default": metrics_default,
    "metrics_opt": metrics_opt,
    "device": str(torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")),
}
with open(run_dir / "run_config.json", "w") as f:
    json.dump(run_config, f, indent=2)

print(f"\nSaved artifacts to: {run_dir}")

OOF integrity checks passed (aligned via meta_df).
Meta-learner DataFrame: 889 users  (all OOF)
  Bots:   184
  Humans: 705

--- Calibration selection by log-loss (CV-on-OOF) ---
   XGB: method=raw       log_loss=0.14650  brier=0.04099
  XLMR: method=isotonic  log_loss=0.12468  brier=0.02847
   GNN: method=sigmoid   log_loss=0.11789  brier=0.02942

--- Stack variant selection (by nested-CV F1) ---
  raw F1=0.9600 (C=10.0) | calibrated F1=0.9676 (C=10.0)
  Selected variant: calibrated

  Meta-Learner Nested CV Performance
  Accuracy:  0.9865
  F1 (bot):  0.9676
  Precision: 0.9624
  Recall:    0.9728

--- Threshold comparison (0.5 vs optimized) ---
   XGBoost: F1@0.5=0.8636 | F1@opt=0.8767 (t=0.340)
     XLM-R: F1@0.5=0.8950 | F1@opt=0.9117 (t=0.520)
   BotRGCN: F1@0.5=0.9185 | F1@opt=0.9185 (t=0.435)
  Ensemble: F1@0.5=0.9676 | F1@opt=0.9755 (t=0.595)

  Ensemble F1@opt: 0.9755
  Best Single@opt: 0.9185
  Improvement: +0.0570

Saved artifacts to: artifacts/ensemble/run_20260214_113408_

In [70]:
# === Final model + diagnostics ===
import pickle

# Use the best C from the winning variant's grid search
best_C = stack_eval["best_C"]
meta_model = LogisticRegression(C=best_C, solver="lbfgs", max_iter=1000)
meta_model.fit(X_meta, y_meta)

# Feature weights (8 features: 3 raw + 3 pairwise + max + std)
feature_names = ["XGBoost", "XLM-R", "BotRGCN",
                 "XGB*XLMR", "XGB*GNN", "XLMR*GNN", "Max", "Disagreement"]
w = meta_model.coef_[0]
rel = np.abs(w) / np.abs(w).sum() * 100
print(f"Meta-Learner Weights (C={best_C}, variant={selected_variant}):")
for name, wi, pct in zip(feature_names, w, rel):
    print(f"  {name:>12s}: {wi:+.3f}  ({pct:.0f}%)")
print(f"  {'Intercept':>12s}: {meta_model.intercept_[0]:+.3f}")

# Classification report (nested CV — unbiased)
print(f"\n{classification_report(y_meta, meta_cv_preds, target_names=['Human', 'Bot'])}")
cm = confusion_matrix(y_meta, meta_cv_preds)
print(f"Confusion matrix:\n{cm}")

# Save model in both legacy path and artifact run path.
with open("meta_learner.pkl", "wb") as f:
    pickle.dump(meta_model, f)
with open(run_dir / "meta_learner.pkl", "wb") as f:
    pickle.dump(meta_model, f)

with open(run_dir / "calibration_summary.json", "w") as f:
    json.dump(
        {
            key: {
                "method": val["method"],
                "log_loss": float(val["log_loss"]),
                "brier": float(val["brier"]),
            }
            for key, val in calibration_results.items()
        },
        f,
        indent=2,
    )

# Persist calibrators if calibrated variant was selected
if selected_variant == "calibrated":
    calibrator_artifacts = {}
    for key in ["XGB", "XLMR", "GNN"]:
        method = calibration_results[key]["method"]
        probs_full = meta_df[f"{key}_Prob"].values
        if method == "sigmoid":
            cal = LogisticRegression(solver="lbfgs", max_iter=1000)
            cal.fit(np.clip(probs_full, 1e-6, 1 - 1e-6).reshape(-1, 1), y_meta)
        elif method == "isotonic":
            cal = IsotonicRegression(out_of_bounds="clip")
            cal.fit(np.clip(probs_full, 1e-6, 1 - 1e-6), y_meta)
        else:
            cal = None
        calibrator_artifacts[key] = {"method": method, "calibrator": cal}
    with open(run_dir / "calibrators.pkl", "wb") as f:
        pickle.dump(calibrator_artifacts, f)
    cal_desc = ", ".join(f"{k}={v['method']}" for k, v in calibrator_artifacts.items())
    print(f"Persisted calibrators ({cal_desc})")

print(
    f"\nSaved meta_learner.pkl and calibration summary to {run_dir} | "
    f"Ensemble F1@opt={ensemble_f1:.4f} (best single={best_individual_f1:.4f}, "
    f"+{ensemble_f1 - best_individual_f1:.4f})"
)

Meta-Learner Weights (C=10.0, variant=calibrated):
       XGBoost: +5.432  (26%)
         XLM-R: +0.961  (5%)
       BotRGCN: +2.631  (12%)
      XGB*XLMR: +1.428  (7%)
       XGB*GNN: +3.349  (16%)
      XLMR*GNN: +2.972  (14%)
           Max: +3.708  (17%)
  Disagreement: +0.732  (3%)
     Intercept: -9.242

              precision    recall  f1-score   support

       Human       0.99      0.99      0.99       705
         Bot       0.96      0.97      0.97       184

    accuracy                           0.99       889
   macro avg       0.98      0.98      0.98       889
weighted avg       0.99      0.99      0.99       889

Confusion matrix:
[[698   7]
 [  5 179]]
Persisted calibrators (XGB=raw, XLMR=isotonic, GNN=sigmoid)

Saved meta_learner.pkl and calibration summary to artifacts/ensemble/run_20260214_113408_seed42_calibrated | Ensemble F1@opt=0.9755 (best single=0.9185, +0.0570)


In [71]:
# Base Learner Error Correlation Analysis
# Checks whether the three base learners make independent errors (good for stacking)
# or correlated errors (limited stacking gain).

xgb_pred = (meta_df["XGB_Prob"].values >= thresholds.get("XGBoost", 0.5)).astype(int)
xlmr_pred = (meta_df["XLMR_Prob"].values >= thresholds.get("XLM-R", 0.5)).astype(int)
gnn_pred = (meta_df["GNN_Prob"].values >= thresholds.get("BotRGCN", 0.5)).astype(int)

xgb_err = (xgb_pred != y_meta).astype(int)
xlmr_err = (xlmr_pred != y_meta).astype(int)
gnn_err = (gnn_pred != y_meta).astype(int)

err_corr = np.corrcoef(np.vstack([xgb_err, xlmr_err, gnn_err]))
print("Error correlation matrix (lower = more complementary):")
print(pd.DataFrame(err_corr, index=["XGB", "XLMR", "GNN"], columns=["XGB", "XLMR", "GNN"]).round(3))

# Agreement breakdown
all_agree = ((xgb_pred == xlmr_pred) & (xlmr_pred == gnn_pred)).sum()
any_disagree = len(y_meta) - all_agree
print(f"\nAll 3 agree: {all_agree}/{len(y_meta)} ({all_agree/len(y_meta)*100:.1f}%)")
print(f"Disagreements: {any_disagree} ({any_disagree/len(y_meta)*100:.1f}%)")

# Per-model error counts
for name, errs in [("XGB", xgb_err), ("XLMR", xlmr_err), ("GNN", gnn_err)]:
    print(f"  {name} errors: {errs.sum()}")

# Where exactly one model is wrong (ensemble can correct these)
one_wrong = (xgb_err + xlmr_err + gnn_err == 1).sum()
two_wrong = (xgb_err + xlmr_err + gnn_err == 2).sum()
all_wrong = (xgb_err + xlmr_err + gnn_err == 3).sum()
print(f"\nRecoverable (1 wrong): {one_wrong}  |  Marginal (2 wrong): {two_wrong}  |  Irrecoverable (3 wrong): {all_wrong}")

Error correlation matrix (lower = more complementary):
        XGB   XLMR    GNN
XGB   1.000  0.032 -0.024
XLMR  0.032  1.000  0.634
GNN  -0.024  0.634  1.000

All 3 agree: 798/889 (89.8%)
Disagreements: 91 (10.2%)
  XGB errors: 45
  XLMR errors: 35
  GNN errors: 39

Recoverable (1 wrong): 66  |  Marginal (2 wrong): 25  |  Irrecoverable (3 wrong): 1


In [72]:
# === Multi-seed stability report (meta-level CV + calibration/threshold selection) ===
SEED_LIST = [42, 1337, 2026]
seed_rows = []

for seed in SEED_LIST:
    xgb_cal = calibrate_probs_cv(y_meta, meta_df["XGB_Prob"].values, seed=seed)
    xlmr_cal = calibrate_probs_cv(y_meta, meta_df["XLMR_Prob"].values, seed=seed)
    gnn_cal = calibrate_probs_cv(y_meta, meta_df["GNN_Prob"].values, seed=seed)

    X_raw = build_meta_features(meta_df[["XGB_Prob", "XLMR_Prob", "GNN_Prob"]].values)
    X_cal = build_meta_features(np.column_stack([xgb_cal["probs"], xlmr_cal["probs"], gnn_cal["probs"]]))

    raw_eval_seed = eval_meta_stack(X_raw, y_meta, seed)
    cal_eval_seed = eval_meta_stack(X_cal, y_meta, seed)

    if cal_eval_seed["f1"] >= raw_eval_seed["f1"]:
        chosen = "calibrated"
        probs_seed = cal_eval_seed["probs"]
        eval_seed = cal_eval_seed
    else:
        chosen = "raw"
        probs_seed = raw_eval_seed["probs"]
        eval_seed = raw_eval_seed

    t_opt_seed, f1_opt_seed = best_f1_threshold(y_meta, probs_seed)
    seed_rows.append(
        {
            "seed": int(seed),
            "selected_variant": chosen,
            "f1_at_0_5": float(eval_seed["f1"]),
            "threshold_opt": float(t_opt_seed),
            "f1_at_opt": float(f1_opt_seed),
            "accuracy": float(eval_seed["accuracy"]),
            "precision": float(eval_seed["precision"]),
            "recall": float(eval_seed["recall"]),
        }
    )

seed_df = pd.DataFrame(seed_rows).sort_values("seed").reset_index(drop=True)
print("Multi-seed ensemble stability (meta-level):")
print(seed_df)
print(
    f"\nF1@opt mean={seed_df['f1_at_opt'].mean():.4f} "
    f"std={seed_df['f1_at_opt'].std(ddof=1):.4f}"
)

seed_summary = {
    "seeds": [int(s) for s in SEED_LIST],
    "rows": seed_rows,
    "f1_opt_mean": float(seed_df["f1_at_opt"].mean()),
    "f1_opt_std": float(seed_df["f1_at_opt"].std(ddof=1)),
    "recommended_seed": int(seed_df.sort_values(["f1_at_opt", "f1_at_0_5"], ascending=False).iloc[0]["seed"]),
    "notes": "XLM-R base checkpoints are fixed exports; this seed sweep captures meta-level CV and calibration variability.",
}

seed_df.to_csv(run_dir / "seed_summary.csv", index=False)
with open(run_dir / "seed_summary.json", "w") as f:
    json.dump(seed_summary, f, indent=2)

print(f"\nSaved seed summary artifacts to: {run_dir}")

Multi-seed ensemble stability (meta-level):
   seed selected_variant  f1_at_0_5  threshold_opt  f1_at_opt  accuracy  \
0    42       calibrated   0.967568          0.595   0.975477  0.986502   
1  1337       calibrated   0.965147          0.510   0.970350  0.985377   
2  2026       calibrated   0.967914          0.540   0.973118  0.986502   

   precision    recall  
0   0.962366  0.972826  
1   0.952381  0.978261  
2   0.952632  0.983696  

F1@opt mean=0.9730 std=0.0026

Saved seed summary artifacts to: artifacts/ensemble/run_20260214_113408_seed42_calibrated


## 5. Full-Data Training Mode (No Splits)

This section trains all available components on the entire labeled set for final competition inference.

Notes:
- This is a deployment-oriented path (not an unbiased evaluation path).
- It intentionally avoids CV/train-val splits during fitting.
- Uses the `selected_variant` (raw or calibrated) chosen by nested CV in Section 4.
- If calibrated, calibrators are persisted alongside full-data artifacts.

In [73]:
# === Full-data training/export (no splits) ===
# Trains XGB, GNN, and meta-learner on all labeled users.
# XLM-R probabilities are produced by averaging available fold checkpoints over all users.

import pickle

if "xgb" not in globals():
    import xgboost as xgb

set_global_seed(RANDOM_STATE)
run_dir_full = ensure_run_dir(seed=RANDOM_STATE, calibration_mode=f"full_data_{selected_variant}")
print(f"Full-data run dir: {run_dir_full}")

# 1) XGBoost on full labeled set
full_scale = (y_all == 0).sum() / max(y_all.sum(), 1)
xgb_full_model = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=4,
    learning_rate=0.1,
    scale_pos_weight=full_scale,
    colsample_bytree=0.5,
    eval_metric="logloss",
    random_state=RANDOM_STATE,
)
xgb_full_model.fit(X_all, y_all)
xgb_full_probs = xgb_full_model.predict_proba(X_all)[:, 1]
xgb_full_map = dict(zip(combined_df["author_id"], xgb_full_probs))

# 2) XLM-R probabilities for all users (mean over available fold checkpoints)
text_full_df = text_combined_df.drop_duplicates("author_id", keep="first").reset_index(drop=True)
text_full_ds = Dataset.from_pandas(text_full_df[["text"]].reset_index(drop=True))
text_full_ds = text_full_ds.map(tokenize, batched=True, remove_columns=["text"])
text_full_ds.set_format("torch")

available_ckpts = []
for fold in range(1, K + 1):
    try:
        ckpt = resolve_best_xlmr_checkpoint(fold)
        if ckpt not in available_ckpts:
            available_ckpts.append(ckpt)
    except Exception:
        pass

assert available_ckpts, "No XLM-R checkpoints found for full-data inference"
print(f"Using {len(available_ckpts)} XLM-R checkpoints for probability averaging")

xlmr_prob_stack = []
for ckpt in available_ckpts:
    print(f"  XLM-R infer: {ckpt}")
    mdl = AutoModelForSequenceClassification.from_pretrained(ckpt, num_labels=2).to(device)
    mdl.eval()
    logits_all = []
    with torch.no_grad():
        for batch in DataLoader(text_full_ds, batch_size=8, shuffle=False):
            batch = {k: v.to(device) for k, v in batch.items()}
            logits_all.append(mdl(**batch).logits.cpu().numpy())
    logits_all = np.concatenate(logits_all, axis=0)
    xlmr_prob_stack.append(softmax(logits_all, axis=1)[:, 1])
    del mdl
    gc.collect()
    if device.type == "mps":
        torch.mps.empty_cache()

xlmr_full_probs = np.mean(np.vstack(xlmr_prob_stack), axis=0)
xlmr_full_map = dict(zip(text_full_df["author_id"], xlmr_full_probs))

# 3) BotRGCN on full graph with all nodes in train mask
all_train_np = np.ones(num_users, dtype=bool)
ht_src, ht_tgt = knn_edges_train_targets(ht_sim, KNN_K, all_train_np)
ts_src, ts_tgt = knn_edges_train_targets(time_sim, KNN_K, all_train_np)
full_src = ht_src + ts_src + mention_src + bio_src
full_tgt = ht_tgt + ts_tgt + mention_tgt + bio_tgt
full_rel = [0] * len(ht_src) + [1] * len(ts_src) + [2] * len(mention_src) + [3] * len(bio_src)
full_edge_index = torch.tensor([full_src, full_tgt], dtype=torch.long)
full_edge_type = torch.tensor(full_rel, dtype=torch.long)

xgb_scaled_full = StandardScaler().fit_transform(xgb_feature_vectors).astype(np.float32)
node_features_full = np.concatenate([xgb_scaled_full, xlmr_feature_vectors], axis=1).astype(np.float32)
x_full = torch.from_numpy(node_features_full)

data_full = Data(
    x=x_full.to(rgcn_device),
    edge_index=full_edge_index.to(rgcn_device),
    edge_type=full_edge_type.to(rgcn_device),
    y=torch.tensor(is_bot, dtype=torch.long).to(rgcn_device),
    train_mask=torch.ones(num_users, dtype=torch.bool).to(rgcn_device),
)

gnn_full_model = BotRGCN(in_dim=feature_dim, hidden_dim=GNN_HIDDEN, out_dim=2, num_relations=NUM_RELATIONS, dropout=0.3).to(rgcn_device)
full_cw = compute_class_weights(data_full.y, data_full.train_mask, rgcn_device)
full_loss_fn = nn.CrossEntropyLoss(weight=full_cw)
full_opt = torch.optim.Adam(gnn_full_model.parameters(), lr=0.01, weight_decay=5e-4)
full_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(full_opt, T_max=GNN_EPOCHS)

for epoch in range(1, GNN_EPOCHS + 1):
    gnn_full_model.train()
    full_opt.zero_grad()
    ei, et = edge_dropout(data_full.edge_index, data_full.edge_type, 0.25)
    loss = full_loss_fn(gnn_full_model(data_full.x, ei, et)[data_full.train_mask], data_full.y[data_full.train_mask])
    loss.backward()
    full_opt.step()
    full_scheduler.step()

gnn_full_model.eval()
with torch.no_grad():
    gnn_full_logits = gnn_full_model(data_full.x, data_full.edge_index, data_full.edge_type)
    gnn_full_probs = F.softmax(gnn_full_logits, dim=1)[:, 1].cpu().numpy()
gnn_full_map = dict(zip(author_ids, gnn_full_probs))

# 4) Train meta-learner on full stacked features
full_rows = []
for aid in author_ids:
    if aid in xgb_full_map and aid in xlmr_full_map and aid in gnn_full_map:
        full_rows.append(
            {
                "User_ID": aid,
                "XGB_Prob": float(xgb_full_map[aid]),
                "XLMR_Prob": float(xlmr_full_map[aid]),
                "GNN_Prob": float(gnn_full_map[aid]),
                "True_Label": int(is_bot[author_id_to_idx[aid]]),
            }
        )

meta_full_df = pd.DataFrame(full_rows)
assert len(meta_full_df) == len(author_ids), "Full-data meta frame is misaligned"

X_meta_full = build_meta_features(meta_full_df[["XGB_Prob", "XLMR_Prob", "GNN_Prob"]].values)
y_meta_full = meta_full_df["True_Label"].values

meta_full_model = LogisticRegression(C=best_C, solver="lbfgs", max_iter=1000)
meta_full_model.fit(X_meta_full, y_meta_full)

meta_full_probs = meta_full_model.predict_proba(X_meta_full)[:, 1]
ensemble_t_opt_full, _ = best_f1_threshold(y_meta_full, meta_full_probs)

thresholds_full = {
    "XGBoost": best_f1_threshold(y_meta_full, meta_full_df["XGB_Prob"].values)[0],
    "XLM-R": best_f1_threshold(y_meta_full, meta_full_df["XLMR_Prob"].values)[0],
    "BotRGCN": best_f1_threshold(y_meta_full, meta_full_df["GNN_Prob"].values)[0],
    "Ensemble": float(ensemble_t_opt_full),
}

# 5) Save full-data artifacts
meta_full_df_out = meta_full_df.copy()
meta_full_df_out["Ensemble_Prob"] = meta_full_probs
meta_full_df_out.to_csv(run_dir_full / "full_data_predictions.csv", index=False)

with open(run_dir_full / "thresholds_full_data.json", "w") as f:
    json.dump({k: float(v) for k, v in thresholds_full.items()}, f, indent=2)

with open(run_dir_full / "meta_learner_full_data.pkl", "wb") as f:
    pickle.dump(meta_full_model, f)

with open(run_dir_full / "xgb_full_data_model.pkl", "wb") as f:
    pickle.dump(xgb_full_model, f)

# Save GNN weights
torch.save(gnn_full_model.state_dict(), run_dir_full / "botrgcn_full_data.pth")

# Copy calibrators to full-data run dir if using calibrated variant
if selected_variant == "calibrated" and "calibrator_artifacts" in globals():
    with open(run_dir_full / "calibrators.pkl", "wb") as f:
        pickle.dump(calibrator_artifacts, f)

full_report = {
    "mode": "full_data_no_splits",
    "n_users": int(len(meta_full_df)),
    "ensemble_f1_train_opt": float(f1_score(y_meta_full, (meta_full_probs >= ensemble_t_opt_full).astype(int), average="binary")),
    "ensemble_f1_train_at_0_5": float(f1_score(y_meta_full, (meta_full_probs >= 0.5).astype(int), average="binary")),
    "ensemble_threshold": float(ensemble_t_opt_full),
    "notes": "Training-set metrics are optimistic; use for deployment artifact generation only.",
}

with open(run_dir_full / "full_data_run_report.json", "w") as f:
    json.dump(full_report, f, indent=2)

print("\nFull-data training complete.")
print(f"Users: {len(meta_full_df)}")
print(f"Ensemble train F1@0.5: {full_report['ensemble_f1_train_at_0_5']:.4f}")
print(f"Ensemble train F1@opt: {full_report['ensemble_f1_train_opt']:.4f} at t={ensemble_t_opt_full:.3f}")
print(f"Artifacts: {run_dir_full}")

Full-data run dir: artifacts/ensemble/run_20260214_113409_seed42_full_data_calibrated


Map: 100%|██████████| 889/889 [00:00<00:00, 1897.46 examples/s]


Using 5 XLM-R checkpoints for probability averaging
  XLM-R infer: xlmr_cv/fold1_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1614.32it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              


  XLM-R infer: xlmr_cv/fold2_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1411.20it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              


  XLM-R infer: xlmr_cv/fold3_phase3/checkpoint-135


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1514.44it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              


  XLM-R infer: xlmr_cv/fold4_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1559.11it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              


  XLM-R infer: xlmr_cv/fold5_phase3/checkpoint-90


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1495.67it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              



Full-data training complete.
Users: 889
Ensemble train F1@0.5: 1.0000
Ensemble train F1@opt: 1.0000 at t=0.050
Artifacts: artifacts/ensemble/run_20260214_113409_seed42_full_data_calibrated


In [75]:

NEW_JSON_PATH = Path("dataset.posts&users.34.json")
OUTPUT_CSV = Path("new_data_predictions.csv")


def _latest_full_data_run_dir(base_dir=ARTIFACTS_BASE_DIR):
    if not base_dir.exists():
        return None
    cands = [p for p in base_dir.glob("run_*_full_data_*") if p.is_dir()]
    if not cands:
        # fallback: any run dir that has full-data artifacts
        cands = [p for p in base_dir.glob("run_*") if (p / "thresholds_full_data.json").exists()]
    if not cands:
        return None
    return sorted(cands)[-1]


def _build_user_text_df_from_json(data_json):
    by_author = defaultdict(list)
    for p in data_json["posts"]:
        by_author[p["author_id"]].append(preprocess_text(p.get("text", "") or ""))
    rows = []
    for aid, texts in by_author.items():
        text = " [SEP] ".join(t for t in texts if t)
        rows.append({"author_id": aid, "text": text if text else ""})
    return pd.DataFrame(rows)


assert NEW_JSON_PATH.exists(), f"Missing file: {NEW_JSON_PATH}"

# Load full-data artifacts if models are not already in memory.
latest_full_dir = _latest_full_data_run_dir()
if "meta_full_model" not in globals():
    assert latest_full_dir is not None, "No full-data run directory found under artifacts/ensemble"
    import pickle as _pickle
    with open(latest_full_dir / "meta_learner_full_data.pkl", "rb") as f:
        meta_full_model = _pickle.load(f)

if "xgb_full_model" not in globals():
    assert latest_full_dir is not None, "No full-data run directory found under artifacts/ensemble"
    import pickle as _pickle
    with open(latest_full_dir / "xgb_full_data_model.pkl", "rb") as f:
        xgb_full_model = _pickle.load(f)

if "thresholds_full" not in globals():
    if latest_full_dir is None:
        latest_full_dir = _latest_full_data_run_dir()
    assert latest_full_dir is not None, "No full-data run directory found under artifacts/ensemble"
    with open(latest_full_dir / "thresholds_full_data.json") as f:
        thresholds_full = json.load(f)

if "gnn_full_model" not in globals():
    if latest_full_dir is None:
        latest_full_dir = _latest_full_data_run_dir()
    assert latest_full_dir is not None, "No full-data run directory found under artifacts/ensemble"
    gnn_full_model = BotRGCN(in_dim=feature_dim, hidden_dim=GNN_HIDDEN, out_dim=2, num_relations=NUM_RELATIONS, dropout=0.3).to(rgcn_device)
    gnn_full_model.load_state_dict(torch.load(latest_full_dir / "botrgcn_full_data.pth", map_location=rgcn_device))
    gnn_full_model.eval()

# Parse new JSON + metadata features
with open(NEW_JSON_PATH) as f:
    new_data = json.load(f)

new_posts = new_data["posts"]
new_users = new_data["users"]
new_lang = new_data.get("lang", "en")

new_df = extract_user_features(new_posts, new_users, bot_ids=set())
new_df["levenshtein_name_dist"] = new_df["author_id"].map(
    {u["id"]: levenshtein_distance((u.get("username", "") or "").lower(), (u.get("name", "") or "").lower()) for u in new_users}
)
new_df["digit_density"] = new_df["author_id"].map(
    {u["id"]: digit_density(u.get("username", "")) for u in new_users}
)

new_by_author_ts = defaultdict(list)
for p in new_posts:
    new_by_author_ts[p["author_id"]].append(pd.to_datetime(p["created_at"]))
new_df["iat_entropy"] = new_df["author_id"].map({aid: iat_entropy(ts) for aid, ts in new_by_author_ts.items()})
new_df["burstiness"] = new_df["author_id"].map({aid: burstiness(ts) for aid, ts in new_by_author_ts.items()})

new_act = pd.DataFrame.from_dict(
    {aid: activity_vector(ts) for aid, ts in new_by_author_ts.items()},
    orient="index",
    columns=[f"hour_{h}" for h in range(24)],
)
new_act.index.name = "author_id"
new_act = new_act.reset_index()
new_df = new_df.merge(new_act, on="author_id", how="left")

new_by_author_posts = defaultdict(list)
for p in new_posts:
    new_by_author_posts[p["author_id"]].append(p)
new_df["lang_mismatch_ratio"] = new_df["author_id"].map(
    {aid: lang_mismatch_ratio(ap, new_lang) for aid, ap in new_by_author_posts.items()}
)
new_df["screen_name_entropy"] = new_df["author_id"].map(
    {u["id"]: screen_name_entropy(u.get("username", "")) for u in new_users}
)

for c in feature_cols:
    if c not in new_df.columns:
        new_df[c] = 0.0
new_df = new_df[["author_id"] + feature_cols].copy()
new_df[feature_cols] = new_df[feature_cols].fillna(0)

xgb_new_probs = xgb_full_model.predict_proba(new_df[feature_cols])[:, 1]
new_author_ids = new_df["author_id"].tolist()

# XLM-R probabilities + embeddings for new users
new_text_df = _build_user_text_df_from_json(new_data)
new_text_df = new_text_df.drop_duplicates("author_id", keep="first").reset_index(drop=True)

# Ensure all metadata users have a text row
missing_text_ids = set(new_author_ids) - set(new_text_df["author_id"])
if missing_text_ids:
    new_text_df = pd.concat(
        [
            new_text_df,
            pd.DataFrame({"author_id": list(missing_text_ids), "text": ["" for _ in missing_text_ids]}),
        ],
        ignore_index=True,
    )

new_text_df = new_text_df.set_index("author_id").loc[new_author_ids].reset_index()

new_text_ds = Dataset.from_pandas(new_text_df[["text"]].reset_index(drop=True))
new_text_ds = new_text_ds.map(tokenize, batched=True, remove_columns=["text"])
new_text_ds.set_format("torch")

ckpts = []
for fold in range(1, K + 1):
    try:
        ckpt = resolve_best_xlmr_checkpoint(fold)
        if ckpt not in ckpts:
            ckpts.append(ckpt)
    except Exception:
        pass
assert ckpts, "No XLM-R checkpoints available"

xlmr_prob_list = []
for ckpt in ckpts:
    mdl = AutoModelForSequenceClassification.from_pretrained(ckpt, num_labels=2).to(device)
    mdl.eval()
    logits_chunks = []
    with torch.no_grad():
        for batch in DataLoader(new_text_ds, batch_size=8, shuffle=False):
            batch = {k: v.to(device) for k, v in batch.items()}
            logits_chunks.append(mdl(**batch).logits.cpu().numpy())
    logits_all = np.concatenate(logits_chunks, axis=0)
    xlmr_prob_list.append(softmax(logits_all, axis=1)[:, 1])
    del mdl
    gc.collect()
    if device.type == "mps":
        torch.mps.empty_cache()

xlmr_new_probs = np.mean(np.vstack(xlmr_prob_list), axis=0)

# Embeddings for GNN features: use first checkpoint's encoder CLS.
emb_model = AutoModelForSequenceClassification.from_pretrained(ckpts[0], num_labels=2).to(device)
emb_model.eval()
new_xlmr_embs = []
with torch.no_grad():
    for batch in DataLoader(new_text_ds, batch_size=8, shuffle=False):
        out = emb_model.roberta(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        new_xlmr_embs.append(out.last_hidden_state[:, 0, :].cpu().numpy())
new_xlmr_emb = np.vstack(new_xlmr_embs).astype(np.float32)
del emb_model
if device.type == "mps":
    torch.mps.empty_cache()

# GNN probabilities for new users via augmented graph inference
old_ids = list(author_ids)
combined_ids = old_ids + new_author_ids
combined_idx = {aid: i for i, aid in enumerate(combined_ids)}

# Scale metadata using train-distribution scaler (fit on existing known users only).
xgb_scaler_full = StandardScaler().fit(xgb_feature_vectors)
old_xgb_scaled = xgb_scaler_full.transform(xgb_feature_vectors).astype(np.float32)
new_xgb_raw = new_df[feature_cols].values.astype(np.float32)
new_xgb_scaled = xgb_scaler_full.transform(new_xgb_raw).astype(np.float32)

combined_xgb = np.vstack([old_xgb_scaled, new_xgb_scaled]).astype(np.float32)
combined_xlmr = np.vstack([xlmr_feature_vectors, new_xlmr_emb]).astype(np.float32)
combined_x = torch.from_numpy(np.concatenate([combined_xgb, combined_xlmr], axis=1).astype(np.float32))

# Rebuild hashtag/hour similarity for combined graph.
combined_hashtags = {aid: list(user_hashtags.get(aid, [])) for aid in old_ids}
combined_hours = {aid: list(user_hours.get(aid, [])) for aid in old_ids}
for aid in new_author_ids:
    combined_hashtags.setdefault(aid, [])
    combined_hours.setdefault(aid, [])

for p in new_posts:
    aid = p["author_id"]
    if aid not in combined_idx:
        continue
    text = p.get("text", "") or ""
    tags = re.findall(r"#([A-Za-z0-9_]+)", text)
    combined_hashtags[aid].extend(t.lower() for t in tags)
    try:
        combined_hours[aid].append(pd.to_datetime(p["created_at"]).hour)
    except Exception:
        pass

ht_docs_combined = [" ".join(combined_hashtags[aid]) if combined_hashtags[aid] else "" for aid in combined_ids]
ht_sim_combined = cosine_similarity(TfidfVectorizer(token_pattern=r"[a-z0-9_]+").fit_transform(ht_docs_combined))

hour_hist_combined = np.zeros((len(combined_ids), 24), dtype=np.float32)
for i, aid in enumerate(combined_ids):
    for h in combined_hours[aid]:
        hour_hist_combined[i, h] += 1
norms = np.linalg.norm(hour_hist_combined, axis=1, keepdims=True)
norms[norms == 0] = 1
time_sim_combined = (hour_hist_combined / norms) @ (hour_hist_combined / norms).T

all_true_mask = np.ones(len(combined_ids), dtype=bool)
ht_src_c, ht_tgt_c = knn_edges_train_targets(ht_sim_combined, KNN_K, all_true_mask)
ts_src_c, ts_tgt_c = knn_edges_train_targets(time_sim_combined, KNN_K, all_true_mask)

# Explicit edges: keep existing + add new posts/user bios edges.
combined_username_to_id = dict(username_to_id)
for u in new_users:
    uname = (u.get("username", "") or "").lower()
    if uname:
        combined_username_to_id[uname] = u["id"]

mention_src_c = list(mention_src)
mention_tgt_c = list(mention_tgt)
bio_src_c = list(bio_src)
bio_tgt_c = list(bio_tgt)

for p in new_posts:
    sid = p["author_id"]
    if sid not in combined_idx:
        continue
    for m in re.findall(r"@([A-Za-z0-9_]+)", p.get("text", "") or ""):
        tid = combined_username_to_id.get(m.lower())
        if tid and tid in combined_idx and tid != sid:
            mention_src_c.append(combined_idx[sid])
            mention_tgt_c.append(combined_idx[tid])

for u in new_users:
    sid = u["id"]
    if sid not in combined_idx:
        continue
    for m in re.findall(r"@([A-Za-z0-9_]+)", u.get("description", "") or ""):
        tid = combined_username_to_id.get(m.lower())
        if tid and tid in combined_idx and tid != sid:
            bio_src_c.append(combined_idx[sid])
            bio_tgt_c.append(combined_idx[tid])

edge_src_combined = ht_src_c + ts_src_c + mention_src_c + bio_src_c
edge_tgt_combined = ht_tgt_c + ts_tgt_c + mention_tgt_c + bio_tgt_c
edge_rel_combined = [0] * len(ht_src_c) + [1] * len(ts_src_c) + [2] * len(mention_src_c) + [3] * len(bio_src_c)

edge_index_combined = torch.tensor([edge_src_combined, edge_tgt_combined], dtype=torch.long)
edge_type_combined = torch.tensor(edge_rel_combined, dtype=torch.long)

infer_data = Data(
    x=combined_x.to(rgcn_device),
    edge_index=edge_index_combined.to(rgcn_device),
    edge_type=edge_type_combined.to(rgcn_device),
)

gnn_full_model.eval()
with torch.no_grad():
    logits_combined = gnn_full_model(infer_data.x, infer_data.edge_index, infer_data.edge_type)
    probs_combined = F.softmax(logits_combined, dim=1)[:, 1].cpu().numpy()

gnn_new_probs = probs_combined[len(old_ids):]

# Ensemble prediction (apply calibrators if selected_variant == "calibrated")

xgb_new_p = xgb_new_probs.copy()
xlmr_new_p = xlmr_new_probs.copy()
gnn_new_p = gnn_new_probs.copy()

# Load and apply calibrators if using calibrated variant
if selected_variant == "calibrated":
    if "calibrator_artifacts" not in globals():
        cal_path = (latest_full_dir or _latest_full_data_run_dir())
        if cal_path is not None and (cal_path / "calibrators.pkl").exists():
            import pickle as _pickle
            with open(cal_path / "calibrators.pkl", "rb") as f:
                calibrator_artifacts = _pickle.load(f)
        elif (run_dir / "calibrators.pkl").exists():
            import pickle as _pickle
            with open(run_dir / "calibrators.pkl", "rb") as f:
                calibrator_artifacts = _pickle.load(f)
    if "calibrator_artifacts" in globals():
        def _apply_calibrator(probs_arr, key):
            cal_obj = calibrator_artifacts[key]["calibrator"]
            method = calibrator_artifacts[key]["method"]
            if cal_obj is None:
                return probs_arr
            raw = np.clip(probs_arr, 1e-6, 1 - 1e-6)
            if method == "sigmoid":
                return cal_obj.predict_proba(raw.reshape(-1, 1))[:, 1]
            elif method == "isotonic":
                return cal_obj.transform(raw)
            return probs_arr
        xgb_new_p = _apply_calibrator(xgb_new_p, "XGB")
        xlmr_new_p = _apply_calibrator(xlmr_new_p, "XLMR")
        gnn_new_p = _apply_calibrator(gnn_new_p, "GNN")
        cal_desc = ", ".join(f"{k}={v['method']}" for k, v in calibrator_artifacts.items())
        print(f"Applied calibrators: {cal_desc}")

X_new_meta = build_meta_features(np.column_stack([xgb_new_p, xlmr_new_p, gnn_new_p]))
p_bot = meta_full_model.predict_proba(X_new_meta)[:, 1]
ens_threshold = float(thresholds_full.get("Ensemble", 0.5))
pred_label = (p_bot >= ens_threshold).astype(int)

pred_df = pd.DataFrame(
    {
        "author_id": new_author_ids,
        "XGB_Prob": xgb_new_probs,
        "XLMR_Prob": xlmr_new_probs,
        "GNN_Prob": gnn_new_probs,
        "Ensemble_Prob": p_bot,
        "Prediction": pred_label,
    }
)

pred_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved predictions to {OUTPUT_CSV}")
print(pred_df.head())

Map: 100%|██████████| 438/438 [00:00<00:00, 1478.57 examples/s]
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1646.62it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1365.30it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1596.17it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 972.04it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1417.52it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1636.74it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              


Applied calibrators: XGB=raw, XLMR=isotonic, GNN=sigmoid
Saved predictions to new_data_predictions.csv
                              author_id  XGB_Prob  XLMR_Prob      GNN_Prob  \
0  d09db3e1-ba07-9615-9aed-e04de2ad81a7  0.048572   0.010915  6.818239e-05   
1  e9520b4f-aac6-a766-9d6d-fb17b53b77f2  0.002735   0.042170  1.776218e-06   
2  1ac40bd9-d485-a48c-94d9-97bfaa058d2c  0.009827   0.000373  6.207617e-11   
3  3c7167d6-0c94-a62e-92a2-10c95e2ee098  0.026218   0.000659  9.519794e-09   
4  8775797d-b1ea-a5bf-83dc-0fcdbea36ef3  0.001667   0.001319  3.762298e-10   

   Ensemble_Prob  Prediction  
0       0.000490           0  
1       0.000531           0  
2       0.000334           0  
3       0.000356           0  
4       0.000326           0  


In [None]:
# === Export predicted bot IDs (competition submission format) ===
# Reads the predictions CSV and writes one bot ID per line, matching dataset.bots.*.txt format.

PREDICTIONS_CSV = Path("new_data_predictions.csv")
OUTPUT_BOTS_TXT = Path("predicted_bots.txt")

pred_df = pd.read_csv(PREDICTIONS_CSV)
bot_ids = pred_df.loc[pred_df["Prediction"] == 1, "author_id"].tolist()

with open(OUTPUT_BOTS_TXT, "w") as f:
    for bid in bot_ids:
        f.write(f"{bid}\n")

print(f"Wrote {len(bot_ids)} predicted bot IDs to {OUTPUT_BOTS_TXT}")
print(f"Total users: {len(pred_df)}  |  Bots: {len(bot_ids)}  |  Humans: {len(pred_df) - len(bot_ids)}")
print(f"\nFirst 10 bot IDs:")
for bid in bot_ids[:10]:
    print(f"  {bid}")