
# TwiBot-22 — MiniLM-L12-v2 embeddings + XGBoost

In [None]:

import os, re, json, glob, math, time
from datetime import datetime, timezone
from dateutil import parser as dtparser

import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier

from tqdm.auto import tqdm

# --- Paths ---------------------------------------------------------------
# Directory with the pre-processed dataset files read by the cells below.
DATA_DIR = "./data_processed"
TWEETS_GLOB = os.path.join(DATA_DIR, "tweet_*_processed.json")
USER_JSON = os.path.join(DATA_DIR, "user.json")

# --- Embedding settings --------------------------------------------------
BATCH_SIZE = 16    # texts per forward pass of the encoder
MAX_LENGTH = 512   # tokenizer truncation length


def _pick_device() -> str:
    """Return the torch device string to use: "cuda", then "mps", then "cpu".

    ``torch.backends.mps`` is looked up defensively because torch builds
    without the MPS backend do not expose that attribute at all.
    """
    if torch.cuda.is_available():
        return "cuda"
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"


DEVICE = _pick_device()
print(f"Using device: {DEVICE}")
RANDOM_STATE = 42
N_TWEETS_PER_USER = 20  # cap on tweets kept per author while streaming

def log(msg: str):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}")

In [None]:

def load_json_any(path):
    """Load ``path`` as a single JSON document, falling back to JSON Lines.

    The whole file is read into memory. If it does not parse as one JSON
    value, every non-blank line is parsed on its own and the values are
    returned as a list (an empty file therefore yields ``[]``).

    Raises:
        json.JSONDecodeError: if the fallback hits a line that is not JSON.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        raw = fh.read().strip()

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Not a single document: treat the content as one JSON value per line.
        candidates = (ln.strip() for ln in raw.splitlines())
        return [json.loads(ln) for ln in candidates if ln]

def tweets_to_df(tweet_obj):
    """Turn a parsed tweet JSON object into a DataFrame.

    A list is passed to ``pd.DataFrame`` as-is. A dict is read as
    ``{tweet_id: tweet_fields}``; each entry becomes one row whose
    ``tweet_id`` column holds the key (a ``tweet_id`` field inside the
    tweet itself takes precedence).

    Raises:
        ValueError: if ``tweet_obj`` is neither a dict nor a list.
    """
    if isinstance(tweet_obj, list):
        return pd.DataFrame(tweet_obj)
    if not isinstance(tweet_obj, dict):
        raise ValueError("Formato tweet JSON non riconosciuto.")

    def _row(tweet_id, fields):
        row = {'tweet_id': tweet_id}
        row.update(fields)
        return row

    return pd.DataFrame([_row(key, value) for key, value in tweet_obj.items()])

def users_to_df(user_obj):
    """Turn a parsed user JSON object into a DataFrame.

    * list: passed to ``pd.DataFrame`` unchanged.
    * dict whose values include dicts: one row per dict value, with the
      key stored in ``_key``; non-dict values are skipped.
    * dict without any dict value, or any other object: a single-row frame
      built from the object itself.
    """
    if isinstance(user_obj, list):
        return pd.DataFrame(user_obj)

    if isinstance(user_obj, dict):
        rows = []
        for key, value in user_obj.items():
            if not isinstance(value, dict):
                continue
            row = {'_key': key}
            row.update(value)
            rows.append(row)
        if rows:
            return pd.DataFrame(rows)

    # Flat dict (a single user record) or an unexpected scalar.
    return pd.DataFrame([user_obj])

def extract_numeric_from_user_id(uid):
    """Return the first run of digits in ``uid`` as an int, or NaN.

    NaN is returned when ``uid`` is missing (per ``pd.isna``) or when its
    string form contains no digit.
    """
    if pd.isna(uid):
        return np.nan
    match = re.search(r'(\d+)', str(uid))
    if match is None:
        return np.nan
    return int(match.group(1))


In [None]:
import os, glob, time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import ijson
from datetime import datetime, timezone
from dateutil import parser as dtparser
import ijson 
import re 
from collections import defaultdict

# Enable the pandas `progress_apply` integration used later in this file.
tqdm.pandas()

# Load the user file and flatten it into one row per account.
user_obj = load_json_any(USER_JSON)
users = users_to_df(user_obj)
log(f"Utenti caricati: {len(users)}")

# Make sure an 'id' column exists: fall back to the dict key, else to None.
if 'id' not in users.columns:
    users['id'] = users['_key'] if '_key' in users.columns else None

# Keep both a string id (label join key) and a numeric id (tweet join key).
users['user_id_str'] = users['id'].astype(str)
users['user_id_num'] = users['user_id_str'].apply(extract_numeric_from_user_id)

# Expand the nested 'public_metrics' dict into flat 'pm.<name>' columns.
if 'public_metrics' in users.columns:
    before_cols = set(users.columns)
    # Non-dict cells become {} so that every row normalises cleanly.
    pm = users['public_metrics'].apply(lambda x: x if isinstance(x, dict) else {})
    pm_df = pd.json_normalize(pm)
    pm_df.columns = ['pm.' + str(c) for c in pm_df.columns]
    users = pd.concat([users.drop(columns=['public_metrics']), pm_df], axis=1)
    added = set(users.columns) - before_cols

def parse_dt_safe(x):
    """Parse ``x`` with ``dateutil``; return ``pd.NaT`` if missing or unparsable."""
    if pd.isna(x):
        return pd.NaT
    try:
        parsed = dtparser.parse(str(x))
    except Exception:
        # Any parsing failure is treated as a missing timestamp.
        return pd.NaT
    return parsed

# Derive the account age (in whole days, relative to "now" in UTC).
if 'created_at' in users.columns:
    parsed_created = users['created_at'].progress_apply(parse_dt_safe)
    # Normalise to tz-aware UTC. dateutil may return naive datetimes or a
    # mix of offsets, and subtracting those from an aware `now` would raise
    # or leave an object column without a `.dt` accessor.
    users['created_at_dt'] = pd.to_datetime(parsed_created, errors='coerce', utc=True)
    now = datetime.now(timezone.utc)
    users['account_age_days'] = (now - users['created_at_dt']).dt.days
    n_na = users['created_at_dt'].isna().sum()
    log(f"created_at non interpretabili: {n_na}")
else:
    users['account_age_days'] = np.nan
    log("account_age_days=NaN")

def is_default_profile_img(url):
    """Return True if ``url`` is missing/empty or points to a default avatar.

    Anything that is not a non-empty string counts as "default".
    """
    if isinstance(url, str) and url:
        # 'default_profile' is a prefix of 'default_profile_images', so a
        # single containment test covers both spellings.
        return 'default_profile' in url
    return True

# Flag accounts that still use a default avatar; NaN when the URL column is absent.
if 'profile_image_url' not in users.columns:
    log("default_profile_image=NaN")
    users['default_profile_image'] = pd.Series([np.nan] * len(users))
else:
    users['default_profile_image'] = users['profile_image_url'].progress_apply(is_default_profile_img)


# Stream every tweet file and keep at most N_TWEETS_PER_USER tweets per author.
tweet_files = sorted(glob.glob(TWEETS_GLOB))
user_tweet_counts = defaultdict(int)
filtered_tweets_data = []

required_cols = [
    'author_id', 'text', 'like_count',
    'retweet_count', 'reply_count', 'quote_count'
]


def _author_id_to_int(raw):
    """Return the author id as a non-zero int, or None if it cannot be derived.

    Strings contribute their first run of digits (e.g. "u123" -> 123).
    Missing, zero, non-finite or otherwise non-convertible values give None,
    so one bad record is skipped instead of aborting the whole file.
    """
    if isinstance(raw, str):
        m = re.search(r'(\d+)', raw)
        raw = m.group(1) if m else None
    if raw is None:
        return None
    try:
        value = int(raw)
    except (TypeError, ValueError, OverflowError, ArithmeticError):
        return None
    return value or None


for fp in tqdm(tweet_files, desc="Caricamento e filtro tweet"):
    log(f"Processo: {os.path.basename(fp)}")
    try:
        with open(fp, 'r', encoding='utf-8') as f:
            # Iterates the (key, value) pairs of the top-level JSON object
            # without loading the file in memory.
            parser = ijson.kvitems(f, '')
            for tw_id, tw_data in tqdm(parser, desc=f"Oggetti {os.path.basename(fp)}", leave=False):
                if not isinstance(tw_data, dict):
                    continue
                author_id_int = _author_id_to_int(tw_data.get('author_id'))
                if author_id_int is None:
                    continue
                if user_tweet_counts[author_id_int] >= N_TWEETS_PER_USER:
                    continue
                user_tweet_counts[author_id_int] += 1
                record = {col: tw_data.get(col) for col in required_cols}
                record['author_id'] = author_id_int
                filtered_tweets_data.append(record)

    except ijson.JSONError:
        # Matched by exception type: the error *message* does not contain
        # the class name, so a substring test on str(e) never fires.
        # Records read before the error are kept.
        log(f"file {fp} corrotto")
    except Exception as e:
        # Still best-effort (continue with the next file), but no longer silent.
        log(f"Errore durante la lettura di {fp}: {type(e).__name__}: {e}")


tweets = pd.DataFrame(filtered_tweets_data)
del filtered_tweets_data
log(f"Tweet totali: {len(tweets)}")

# Guarantee the expected columns even when no tweet was loaded.
base_cols = ['author_id', 'text', 'like_count', 'retweet_count', 'reply_count', 'quote_count']
for col in base_cols:
    if col not in tweets.columns:
        tweets[col] = np.nan
        log(f"{col}'=NaN")

print(f"Utenti: {len(users)}  |  Tweet (filtrati): {len(tweets)}")
users.head(2), tweets.head(2)

In [None]:
import time
from collections import defaultdict
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Per-author accumulators: raw texts, engagement sums and the set of distinct texts.
user_texts = defaultdict(list)
user_stats = defaultdict(lambda: {
    'like_sum': 0.0,
    'retweet_sum': 0.0,
    'reply_sum': 0.0,
    'quote_sum': 0.0,
    'count': 0,
    'unique_texts': set()
})


def _count_or_zero(value):
    """Coerce an engagement counter to float; missing/unparsable values become 0.0."""
    num = pd.to_numeric(value, errors='coerce')
    try:
        num = float(num)
    except (TypeError, ValueError):
        return 0.0
    return 0.0 if np.isnan(num) else num


for row in tqdm(tweets.itertuples(index=False), total=len(tweets), desc="Aggregazione manuale"):
    # Convert everything that can fail *before* touching the accumulators,
    # so that a bad row is skipped as a whole and never half-counted.
    try:
        author_id = int(row.author_id)
    except (TypeError, ValueError):
        continue
    text = str(row.text) if pd.notna(row.text) else ""
    like_val = _count_or_zero(row.like_count)
    retweet_val = _count_or_zero(row.retweet_count)
    reply_val = _count_or_zero(row.reply_count)
    quote_val = _count_or_zero(row.quote_count)

    user_texts[author_id].append(text)

    stats = user_stats[author_id]
    stats['count'] += 1
    stats['unique_texts'].add(text)
    stats['like_sum'] += like_val
    stats['retweet_sum'] += retweet_val
    stats['reply_sum'] += reply_val
    stats['quote_sum'] += quote_val

del tweets
agg_text_data = []
for author_id, texts in tqdm(user_texts.items(), desc="Salvataggio liste testi"):
    agg_text_data.append({'author_id': author_id, 'tweets_list': texts})
# Explicit columns/dtype keep the merge below working even with zero tweets.
agg_text = pd.DataFrame(agg_text_data, columns=['author_id', 'tweets_list'])
agg_text['author_id'] = agg_text['author_id'].astype('int64')
del user_texts, agg_text_data

eng_metric_cols = ['avg_like', 'avg_retweet', 'avg_reply', 'avg_quote', 'n_tweets', 'unique_text_ratio']
eng_data = []
for author_id, stats in tqdm(user_stats.items(), desc="Calcolo medie"):
    count = stats['count']
    if count == 0:
        continue

    eng_data.append({
        'author_id': author_id,
        'avg_like': stats['like_sum'] / count,
        'avg_retweet': stats['retweet_sum'] / count,
        'avg_reply': stats['reply_sum'] / count,
        'avg_quote': stats['quote_sum'] / count,
        'n_tweets': count,
        # Share of distinct texts among the author's kept tweets.
        'unique_text_ratio': len(stats['unique_texts']) / count
    })
eng = pd.DataFrame(eng_data, columns=['author_id'] + eng_metric_cols)
eng['author_id'] = eng['author_id'].astype('int64')
eng[eng_metric_cols] = eng[eng_metric_cols].astype('float64')
del user_stats, eng_data


# Left-join tweet texts and engagement stats onto the user table.
# NOTE(review): if `user_id_num` is float64 (it becomes so as soon as one id
# is NaN), ids above 2**53 lose precision and may fail to match - confirm
# against the real id range.
users_j = users.copy()
users_j['user_id_num'] = pd.to_numeric(users_j['user_id_num'], errors='coerce')
agg = users_j.merge(agg_text, left_on='user_id_num', right_on='author_id', how='left')\
             .merge(eng, left_on='user_id_num', right_on='author_id', how='left', suffixes=('','_eng'))

# Drop the helper join keys. With suffixes=('', '_eng') they are named
# 'author_id' / 'author_id_eng'; the '_x'/'_y' names are kept for safety.
helper_keys = [c for c in ('author_id', 'author_id_eng', 'author_id_x', 'author_id_y') if c in agg.columns]
if helper_keys:
    agg = agg.drop(columns=helper_keys)

# Users without any tweet get neutral (zero) engagement features.
for c in eng_metric_cols:
    if c in agg.columns:
        agg[c] = agg[c].fillna(0.0)

if 'description' not in agg.columns:
    agg['description'] = ""

print(f"Righe finali: {len(agg)}")
agg.head(2)

In [None]:
def normalize_label(x):
    """Map a raw label to 1 (bot), 0 (human) or NaN (missing/unknown).

    Numeric inputs are compared by value, so ``1.0``/``0.0`` (what integer
    labels turn into once a column also contains NaN) are recognised rather
    than being stringified to "1.0"/"0.0" and dropped. Everything else is
    matched case-insensitively, after stripping, against known spellings.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    is_number = isinstance(x, (int, float, np.integer, np.floating))
    if is_number and not isinstance(x, bool):
        if x == 1:
            return 1
        if x == 0:
            return 0
        return np.nan
    xs = str(x).strip().lower()
    if xs in {'bot', '1', '1.0', 'true', 'yes', 'fake', 'ai', 'automated'}:
        return 1
    if xs in {'human', '0', '0.0', 'false', 'no', 'real', 'genuine'}:
        return 0
    return np.nan

# Resolve ground-truth labels from the first available source, in order:
# label.csv -> users['label'] -> label.json -> ground_truth.json.
labels = None


def _label_frame(tmp):
    """Return a two-column frame: ``user_id_str`` as text plus the normalised ``y``."""
    return pd.DataFrame({
        'user_id_str': tmp['user_id_str'].astype(str),
        'y': tmp['label'].apply(normalize_label),
    })


lbl_csv_path = os.path.join(DATA_DIR, 'label.csv')
if os.path.exists(lbl_csv_path):
    # dtype=str keeps ids textual so the merge key matches users['user_id_str']
    # even when the ids look numeric. A header row, if present, is read as
    # data and later discarded because its label normalises to NaN.
    tmp = pd.read_csv(lbl_csv_path, header=None, names=['user_id_str', 'label'], dtype=str)
    labels = _label_frame(tmp)
    log(f"Caricati {len(labels)} record")


if labels is None and 'label' in users.columns:
    labels = _label_frame(users[['user_id_str', 'label']])

elif labels is None:
    lbl_path = os.path.join(DATA_DIR, 'label.json')
    if os.path.exists(lbl_path):
        lbl_obj = load_json_any(lbl_path)
        if isinstance(lbl_obj, dict):
            tmp = pd.DataFrame({'user_id_str': list(lbl_obj.keys()), 'label': list(lbl_obj.values())})
        elif isinstance(lbl_obj, list):
            tmp = pd.DataFrame(lbl_obj)
            if 'user_id' in tmp.columns and 'label' in tmp.columns:
                tmp = tmp.rename(columns={'user_id':'user_id_str'})
        else:
            tmp = pd.DataFrame()
        # Only accept the file when it exposes both expected columns; anything
        # else falls through to the next source instead of raising KeyError.
        if len(tmp) and {'user_id_str', 'label'} <= set(tmp.columns):
            labels = _label_frame(tmp)

if labels is None:
    gt_path = os.path.join(DATA_DIR, 'ground_truth.json')
    if os.path.exists(gt_path):
        gt_obj = load_json_any(gt_path)
        if isinstance(gt_obj, dict):
            tmp = pd.DataFrame({'user_id_str': list(gt_obj.keys()), 'label': list(gt_obj.values())})
            labels = _label_frame(tmp)

if labels is not None:
    # Keep only users with a usable (non-NaN) label.
    data = agg.merge(labels, on='user_id_str', how='inner')
    data = data[~data['y'].isna()].reset_index(drop=True)

    if len(data) > 0:
        print(data['y'].value_counts(normalize=True).to_string())
else:
    data = agg.copy()

data.head(2)

In [None]:
import os, time
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

# Sentence encoder used for both bio and tweet embeddings.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
EMBEDDING_DIM = 384

# Any loading error propagates unchanged to the caller.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()  # inference mode


def get_bert_embeddings(texts: list, batch_size: int) -> np.ndarray:
    """Embed ``texts`` with the module-level ``tokenizer``/``model``.

    Each batch is tokenised (padded, truncated to ``MAX_LENGTH``), run
    through the model without gradients, mean-pooled over the non-padding
    tokens and L2-normalised.

    Args:
        texts: strings to embed.
        batch_size: number of texts per forward pass; must be positive.

    Returns:
        Array with one row per input text. For empty ``texts`` the result
        has shape ``(0, EMBEDDING_DIM)``.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make the loop below silently produce nothing.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    all_embeddings = []

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i : i + batch_size]

            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH
            )
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            hidden_states = model(**inputs).last_hidden_state

            # Cast the attention mask to the hidden-state dtype up front so
            # the pooling arithmetic (and the 1e-9 floor below) is done in
            # floating point regardless of torch's promotion rules.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
            sum_embeddings = torch.sum(hidden_states * mask, 1)
            # Floor the token count to avoid division by zero.
            token_counts = torch.clamp(mask.sum(1), min=1e-9)
            mean_embeddings = sum_embeddings / token_counts
            mean_embeddings = torch.nn.functional.normalize(mean_embeddings, p=2, dim=1)
            all_embeddings.append(mean_embeddings.cpu().numpy())

    if not all_embeddings:
        return np.array([]).reshape(0, EMBEDDING_DIM)

    return np.vstack(all_embeddings)


In [None]:

import os

# Bio (profile description) embeddings, cached on disk as parquet.
BIO_EMBEDDINGS_PATH = os.path.join(DATA_DIR, "precalc_bio_embeddings.parquet")

bio_df = None
if os.path.exists(BIO_EMBEDDINGS_PATH):
    try:
        cached_bio = pd.read_parquet(BIO_EMBEDDINGS_PATH)
    except Exception as e:
        # An unreadable cache is not fatal: report it and recompute.
        log(f"Cache bio non leggibile ({type(e).__name__}: {e}): ricalcolo")
    else:
        # NOTE(review): the cache is validated by shape only; a stale file
        # with the same number of rows would be accepted.
        if cached_bio.shape == (len(data), EMBEDDING_DIM):
            cached_bio.index = data.index
            bio_df = cached_bio
        else:
            log(f"Cache bio con shape {cached_bio.shape} non compatibile: ricalcolo")
else:
    log("Nessun file precalcolato trovato")

if bio_df is None:
    bio_texts = data['description'].fillna("").tolist()
    bio_embeddings = get_bert_embeddings(bio_texts, BATCH_SIZE)

    log(f"Shape: {bio_embeddings.shape}")

    bio_df = pd.DataFrame(
        bio_embeddings,
        index=data.index,
        columns=[f'bio_e_{i}' for i in range(EMBEDDING_DIM)]
    )

    # Write errors propagate: a failed save should not go unnoticed.
    log(f"Salvataggio embedding in {BIO_EMBEDDINGS_PATH}...")
    bio_df.to_parquet(BIO_EMBEDDINGS_PATH, index=True)


In [None]:
import os 
import numpy as np

import csv

# Per-user average tweet embedding, cached on disk as CSV
# (first column 'original_index', then one column per embedding dimension).
TWEET_EMBEDDINGS_PATH = os.path.join(DATA_DIR, "precalc_tweet_avg_embeddings.csv")

tweet_df = None
if os.path.exists(TWEET_EMBEDDINGS_PATH):
    try:
        cached_tweets = pd.read_csv(TWEET_EMBEDDINGS_PATH).set_index('original_index')
    except Exception as e:
        # An unreadable/partial cache is not fatal: report it and recompute.
        log(f"Cache tweet non leggibile ({type(e).__name__}: {e}): ricalcolo")
    else:
        # NOTE(review): validated by shape only, like the bio cache.
        if cached_tweets.shape == (len(data), EMBEDDING_DIM):
            cached_tweets.index = data.index
            tweet_df = cached_tweets
        else:
            log(f"Cache tweet con shape {cached_tweets.shape} non compatibile: ricalcolo")
else:
    log("Nessun file trovato")

if tweet_df is None:
    col_names = ['original_index'] + [f'tweet_e_{i}' for i in range(EMBEDDING_DIM)]

    # Rows are still streamed to disk one user at a time, but through a
    # single open handle instead of building a one-row DataFrame and
    # reopening the file for every user. Mode 'w' truncates any stale cache.
    with open(TWEET_EMBEDDINGS_PATH, 'w', newline='', encoding='utf-8') as out_f:
        writer = csv.writer(out_f, lineterminator='\n')
        writer.writerow(col_names)

        for idx, row in tqdm(data.iterrows(), total=len(data), desc="Embedding e Salvataggio 1-per-1"):
            tweet_list = row.get('tweets_list')

            # Users without tweets carry NaN here, not a list.
            texts_to_embed = []
            if isinstance(tweet_list, list):
                texts_to_embed = [str(t) for t in tweet_list if pd.notna(t) and str(t).strip()]

            avg_embedding = np.zeros(EMBEDDING_DIM)
            if texts_to_embed:
                # Embedding errors propagate; nothing is silently zero-filled.
                user_embeddings = get_bert_embeddings(texts_to_embed, BATCH_SIZE)
                if user_embeddings.shape[0] > 0:
                    avg_embedding = np.mean(user_embeddings, axis=0)

            # str() of a NumPy scalar is its shortest round-trippable form.
            writer.writerow([idx, *map(str, avg_embedding)])

    tweet_df = pd.read_csv(TWEET_EMBEDDINGS_PATH)
    tweet_df = tweet_df.set_index('original_index')
    tweet_df = tweet_df.reindex(data.index, fill_value=0.0)

In [None]:

from sklearn.preprocessing import StandardScaler

# Candidate non-text features; only those present in `data` are used.
structural_features_cols = [
    'pm.followers_count',
    'pm.following_count',
    'pm.listed_count',
    'pm.tweet_count',
    'account_age_days',
    'default_profile_image',
    'protected',
    'avg_like',
    'avg_retweet',
    'avg_reply',
    'avg_quote',
    'n_tweets',
    'unique_text_ratio'
]

if 'verified' in data.columns:
    structural_features_cols.append('verified')

existing_structural_cols = [c for c in structural_features_cols if c in data.columns]

X_struct = data[existing_structural_cols].copy()


def _flag_to_int(value):
    """Map a boolean-like cell to 0/1; missing or unrecognised values become 0."""
    if isinstance(value, str):
        return int(value.strip().lower() in {'true', '1', 'yes'})
    if pd.isna(value):
        return 0
    return int(bool(value))


# Convert flags cell by cell: a plain `astype(int)` raises as soon as a flag
# column contains NaN (e.g. 'default_profile_image' when the URL column is
# missing) or textual booleans.
for col in ['default_profile_image', 'protected', 'verified']:
    if col in X_struct.columns:
        X_struct[col] = X_struct[col].map(_flag_to_int).astype(int)

X_struct = X_struct.fillna(0.0)

X_struct.head(2)

In [None]:
from sklearn.model_selection import train_test_split

# Final design matrix: structural features + bio embedding + mean tweet embedding.
X_full = pd.concat([X_struct, bio_df, tweet_df], axis=1)
y = data['y'].astype(int)

# Replace characters that are problematic in feature names (quote, comma,
# square and angle brackets) with '_'. The previous pattern was over-escaped
# and parsed as "one of \ \" , [" followed by the literal "<>]", so it never
# matched a real column name.
# NOTE(review): presumably meant to satisfy XGBoost's feature-name rules - confirm.
X_full.columns = X_full.columns.astype(str).str.replace(r'[",\[\]<>]', '_', regex=True)


# 80/20 split into (train+val)/test, then 75/25 of the former into
# train/val, i.e. 60/20/20 overall; both splits are stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_full, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=y_train_val
)

log(f"Train shapes: X={X_train.shape}, y={y_train.shape}")
log(f"Validation shapes: X={X_val.shape}, y={y_val.shape}")
log(f"Test shapes: X={X_test.shape}, y={y_test.shape}")

log(f"Distribuzione 'bot' in y_train: {y_train.mean():.2%}")
log(f"Distribuzione 'bot' in y_val:   {y_val.mean():.2%}")
log(f"Distribuzione 'bot' in y_test:  {y_test.mean():.2%}")

In [None]:
from xgboost import XGBClassifier
import time

# Class-imbalance weight: (#negatives / #positives) on the training split,
# falling back to 1.0 when either class is absent.
counts = y_train.value_counts()
n_neg, n_pos = counts.get(0, 0), counts.get(1, 0)
scale_pos_weight = n_neg / n_pos if n_neg and n_pos else 1.0

# `use_label_encoder` is intentionally not passed: it is obsolete in the
# XGBoost versions that accept `early_stopping_rounds` in the constructor
# and only triggers an "unused parameter" warning there.
xgb_model = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    tree_method='hist',
    early_stopping_rounds=30
)

# Early stopping monitors log-loss on the validation split only.
start_time = time.time()
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)
end_time = time.time()
log(f"Training completato in {end_time - start_time:.1f}s")

In [None]:
import joblib
# Persist the fitted classifier next to the data; write errors propagate as-is.
MODEL_SAVE_PATH = os.path.join(DATA_DIR, "xgb_model_trained.joblib")
log(f"Salvataggio del modello in: {MODEL_SAVE_PATH}")
joblib.dump(xgb_model, MODEL_SAVE_PATH)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix,
    precision_recall_curve, average_precision_score, balanced_accuracy_score,
    matthews_corrcoef, cohen_kappa_score, brier_score_loss, roc_curve
)
import xgboost as xgb

def expected_calibration_error(y_true, y_prob, n_bins=10):
    """Expected calibration error over ``n_bins`` equal-width bins of [0, 1].

    For each non-empty bin, the absolute gap between the mean predicted
    probability and the mean of ``y_true`` is weighted by the share of
    samples in the bin; the weighted gaps are summed. Bins are half-open
    ``[lo, hi)`` except the last, which also includes ``hi``.
    """
    targets = np.asarray(y_true)
    probs = np.asarray(y_prob)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    last_bin = n_bins - 1

    total = 0.0
    for k, (lower, upper) in enumerate(zip(edges[:-1], edges[1:])):
        # Close the right edge only for the final bin so p == 1.0 is counted.
        below_upper = (probs <= upper) if k == last_bin else (probs < upper)
        in_bin = (probs >= lower) & below_upper
        if not np.any(in_bin):
            continue
        observed = targets[in_bin].mean()
        predicted = probs[in_bin].mean()
        total += (in_bin.mean()) * abs(predicted - observed)
    return total

def plot_confusion_matrix(cm, labels=('Human','Bot'), normalize=False, title='Confusion Matrix'):
    """Draw a 2x2 confusion matrix with the value written in every cell.

    Args:
        cm: confusion matrix array (rows = true class, columns = predicted).
        labels: names of class 0 and class 1 used for the tick labels.
        normalize: if True, show each row divided by its sum (rows summing
            to zero are shown as 0) with two decimals; otherwise raw counts.
        title: figure title.
    """
    if not normalize:
        cm_to_plot = cm
        fmt_cell = lambda v: f'{int(v)}'
    else:
        with np.errstate(all='ignore'):
            row_totals = cm.sum(axis=1, keepdims=True)
            cm_to_plot = np.nan_to_num(cm.astype('float') / row_totals)
        fmt_cell = lambda v: f'{v:.2f}'

    fig, ax = plt.subplots(figsize=(5, 4))
    im = ax.imshow(cm_to_plot, interpolation='nearest')
    ax.set_title(title)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_xticks([0,1])
    ax.set_xticklabels([f'Pred {labels[0]}', f'Pred {labels[1]}'])
    ax.set_yticks([0,1])
    ax.set_yticklabels([f'True {labels[0]}', f'True {labels[1]}'])

    n_rows, n_cols = cm_to_plot.shape[0], cm_to_plot.shape[1]
    for i in range(n_rows):
        for j in range(n_cols):
            # x is the column (predicted), y is the row (true).
            ax.text(j, i, fmt_cell(cm_to_plot[i, j]), ha='center', va='center')
    fig.tight_layout()
    plt.show()

def metrics_at_threshold(y_true, y_prob, thr):
    """Binarise ``y_prob`` at ``thr`` (score >= thr -> bot) and compute metrics.

    Returns:
        dict with keys ``threshold``, ``accuracy``, ``f1_bot``, ``precision``,
        ``recall_bot``, ``specificity``, ``fpr``, ``balanced_acc``, ``mcc``,
        ``kappa``, ``flagged_rate`` (share predicted as bot) and ``cm``
        (2x2 confusion matrix). Ratios with a zero denominator are 0.0.
    """
    y_hat = (np.asarray(y_prob) >= thr).astype(int)
    acc = accuracy_score(y_true, y_hat)
    f1b = f1_score(y_true, y_hat, pos_label=1)
    # Pin the label set: without it the matrix collapses to 1x1 when only
    # one class occurs, and the 4-way unpacking below would fail.
    cm = confusion_matrix(y_true, y_hat, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    tpr = tp / (tp + fn) if (tp + fn) else 0.0
    fpr = fp / (fp + tn) if (fp + tn) else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    bal_acc = balanced_accuracy_score(y_true, y_hat)
    mcc = matthews_corrcoef(y_true, y_hat)
    kappa = cohen_kappa_score(y_true, y_hat)
    rate_flagged = (y_hat.mean())  # fraction of accounts flagged as bot
    return {
        'threshold': thr, 'accuracy': acc, 'f1_bot': f1b, 'precision': precision,
        'recall_bot': tpr, 'specificity': spec, 'fpr': fpr, 'balanced_acc': bal_acc,
        'mcc': mcc, 'kappa': kappa, 'flagged_rate': rate_flagged, 'cm': cm
    }


def bootstrap_ci(y_true, y_prob, y_hat, n_boot=1000, seed=42):
    """Percentile-bootstrap summaries for accuracy, bot-F1 and average precision.

    Samples are redrawn with replacement ``n_boot`` times using a generator
    seeded with ``seed``.

    Returns:
        dict ``{'accuracy', 'f1_bot', 'aupr'}``, each a tuple
        ``(mean, 2.5th percentile, 97.5th percentile)`` over the resamples.
    """
    # Coerce to arrays so `x[idx]` below is positional even when a pandas
    # Series with a non-default index is passed in.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_hat = np.asarray(y_hat)

    rng = np.random.default_rng(seed)
    n = len(y_true)
    accs, f1s, auprs = [], [], []

    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        yt = y_true[idx]
        yp = y_prob[idx]
        yh = y_hat[idx]
        accs.append(accuracy_score(yt, yh))
        f1s.append(f1_score(yt, yh, pos_label=1))
        auprs.append(average_precision_score(yt, yp))

    def ci(a):
        lo, hi = np.percentile(a, [2.5, 97.5])
        return float(np.mean(a)), float(lo), float(hi)

    return {'accuracy': ci(accs), 'f1_bot': ci(f1s), 'aupr': ci(auprs)}


# --- Predictions on the held-out test split ---------------------------------
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]   # probability of class 1 (bot)

# --- Counts-based quantities ------------------------------------------------
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn) if (tp + fn) else 0.0
spec = tn / (tn + fp) if (tn + fp) else 0.0
fpr_base = fp / (fp + tn) if (fp + tn) else 0.0

# --- Label-based metrics (default decision of the classifier) ---------------
accuracy = accuracy_score(y_test, y_pred)
f1_bot = f1_score(y_test, y_pred, pos_label=1)
bal_acc = balanced_accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

# --- Score-based metrics ----------------------------------------------------
auc_roc = roc_auc_score(y_test, y_pred_proba)
aupr = average_precision_score(y_test, y_pred_proba)
brier = brier_score_loss(y_test, y_pred_proba)
ece = expected_calibration_error(y_test, y_pred_proba, n_bins=10)

log("--- Metriche sul Test Set ---")
# Labels are left-aligned in an 18-character field, matching the widest one.
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("F1 Score (Bot):", f1_bot),
    ("AUC-ROC:", auc_roc),
    ("AUCPR (Bot):", aupr),
    ("Balanced Acc:", bal_acc),
    ("Specificity (TNR):", spec),
    ("MCC:", mcc),
    ("Cohen's kappa:", kappa),
    ("Brier score:", brier),
    ("ECE (10 bins):", ece),
):
    log(f"{metric_name:<18}{metric_value:.4f}")

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['Human (0)', 'Bot (1)']))

print("\n--- Confusion Matrix (counts) ---")
print(cm)
for use_norm, cm_title in (
    (False, 'Confusion Matrix (counts)'),
    (True, 'Confusion Matrix (row-normalized)'),
):
    plot_confusion_matrix(cm, labels=('Human','Bot'), normalize=use_norm, title=cm_title)


# `auc` is not among the names imported at the top of this cell, so import
# it here; without it the line computing `roc_auc` raises NameError.
from sklearn.metrics import auc

# ROC curve with the bot class as positive; area via the trapezoidal rule.
fpr, tpr_curve, roc_thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr_curve)

fig = plt.figure(figsize=(5,4))
plt.plot(fpr, tpr_curve, label=f'ROC (AUC={roc_auc:.3f})')
plt.plot([0,1],[0,1], linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Bot = positive)')
plt.legend()
plt.tight_layout()
plt.show()


# Precision-recall curve (bot = positive) and its average precision.
aupr_viz = average_precision_score(y_test, y_pred_proba)
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)

fig = plt.figure(figsize=(5,4))
plt.plot(recall, precision, label=f'PR (AP={aupr_viz:.3f})')
plt.xlabel('Recall (Bot)')
plt.ylabel('Precision (Bot)')
plt.title('Precision–Recall Curve (Bot = positive)')
plt.legend()
plt.tight_layout()
plt.show()


# Reliability diagram: per-bin mean predicted probability vs. observed bot rate.
n_bins = 10
bins = np.linspace(0.0, 1.0, n_bins+1)
bin_centers = (bins[:-1] + bins[1:]) / 2.0
emp_acc = []
avg_conf = []
for i, (lo, hi) in enumerate(zip(bins[:-1], bins[1:])):
    # Half-open bins, except the last one which also includes its right edge.
    upper_ok = (y_pred_proba <= hi) if i == n_bins - 1 else (y_pred_proba < hi)
    mask = (y_pred_proba >= lo) & upper_ok
    if not np.any(mask):
        # Empty bins are plotted as gaps.
        emp_acc.append(np.nan)
        avg_conf.append(np.nan)
        continue
    emp_acc.append(y_test[mask].mean())
    avg_conf.append(y_pred_proba[mask].mean())

emp_acc = np.array(emp_acc)
avg_conf = np.array(avg_conf)

fig = plt.figure(figsize=(5,4))
plt.plot([0,1],[0,1], linestyle='--', label='Perfect calibration')
plt.plot(avg_conf, emp_acc, marker='o', label=f'Reliability (ECE={ece:.3f})')
plt.xlim(0,1)
plt.ylim(0,1)
plt.xlabel('Mean predicted probability (Bot)')
plt.ylabel('Empirical bot frequency')
plt.title('Reliability Diagram')
plt.legend()
plt.tight_layout()
plt.show()


def closest_idx(arr, target):
    """Return, as a plain int, the position of the element of ``arr`` nearest to ``target``.

    On ties the first such position is returned.
    """
    distances = np.abs(arr - target)
    return int(np.argmin(distances))


# Operating points whose ROC false-positive rate is closest to 1% and 5%.
idx_1, idx_5 = (closest_idx(fpr, target_fpr) for target_fpr in (0.01, 0.05))
thr_1, thr_5 = roc_thresholds[idx_1], roc_thresholds[idx_5]

op_1, op_5 = (
    metrics_at_threshold(y_test, y_pred_proba, chosen_thr)
    for chosen_thr in (thr_1, thr_5)
)


def precision_at_recall(target_recall):
    """Metrics at the last PR-curve point whose recall is >= ``target_recall``.

    Reads the module-level ``recall`` / ``pr_thresholds`` arrays and
    evaluates ``metrics_at_threshold`` on the test split. Returns ``None``
    when no point reaches the requested recall.
    """
    hits = np.where(recall >= target_recall)[0]
    if len(hits) == 0:
        return None
    # `recall` has one more entry than `pr_thresholds`; clamp the position
    # so the final curve point maps to the last available threshold.
    pos = min(hits[-1], len(pr_thresholds) - 1)
    chosen = pr_thresholds[pos]
    return metrics_at_threshold(y_test, y_pred_proba, chosen)

op_pr70 = precision_at_recall(target_recall=0.70)
op_pr80 = precision_at_recall(target_recall=0.80)

def recall_at_precision(target_precision):
    """Metrics at the first PR-curve point whose precision is >= ``target_precision``.

    Reads the module-level ``precision`` / ``pr_thresholds`` arrays (as
    returned by ``precision_recall_curve``) and evaluates
    ``metrics_at_threshold`` on the test split. Since recall does not
    increase along the curve, the first qualifying point is the one with
    the highest recall. Returns ``None`` if no point qualifies.
    """
    idxs = np.where(precision >= target_precision)[0]
    if len(idxs) == 0:
        return None
    i = int(idxs[0])
    if i < len(pr_thresholds):
        # precision[i] belongs to "score >= pr_thresholds[i]". Using index
        # i-1 would select the previous point, whose precision is below the
        # target by construction.
        thr = pr_thresholds[i]
    else:
        # Only the synthetic end point (precision 1, recall 0) qualifies:
        # use a threshold just above the highest score, i.e. flag nothing.
        thr = np.nextafter(pr_thresholds[-1], np.inf)
    return metrics_at_threshold(y_test, y_pred_proba, thr)

op_p95 = recall_at_precision(0.95)

print("\n--- Operating Points ---")
def pretty(op, name):
    """Print a one-line summary of an operating point returned by ``metrics_at_threshold``.

    ``op`` may be ``None`` (the lookup helpers return it when no curve
    point satisfies the request); this is reported instead of raising.
    """
    if op is None:
        print(f"{name}: n/a")
        return
    # Flagged accounts expressed per 10,000 for readability.
    per10k = int(round(op['flagged_rate'] * 10000))
    print(f"{name}: thr={op['threshold']:.4f} | Prec={op['precision']:.3f} | Rec={op['recall_bot']:.3f} | FPR={op['fpr']:.3f} | MCC={op['mcc']:.3f} | Flagged≈{per10k}/10k")

pretty(op_1,  "TPR @ FPR=1%")
pretty(op_5,  "TPR @ FPR=5%")
pretty(op_pr70, "Precision @ Recall=0.70")
pretty(op_pr80, "Precision @ Recall=0.80")
pretty(op_p95,  "Recall @ Precision=0.95")


# --- Gain-based feature importance, with embedding dimensions pooled --------
booster = xgb_model.get_booster()

raw_imp = booster.get_score(importance_type='gain')

feat_names = getattr(xgb_model, "feature_names_in_", None)
if feat_names is None:
    # Fall back to the training matrix columns (the name `X` used here
    # before is not defined anywhere in this file).
    feat_names = list(X_train.columns)

# Translate auto-generated names ("f0", "f1", ...) to real column names.
# Every key must look auto-generated; a bare startswith("f") check would
# also fire for genuine feature names that begin with "f".
auto_name = re.compile(r"f(\d+)")
if raw_imp and all(auto_name.fullmatch(k) for k in raw_imp):
    mapped = {}
    for k, v in raw_imp.items():
        idx = int(k[1:])
        mapped[feat_names[idx] if idx < len(feat_names) else k] = v
    raw_imp = mapped

# Features never used in a split are absent from the booster scores: give them 0.
s = pd.Series(raw_imp, dtype="float64")
s = s.reindex(list(feat_names), fill_value=0.0)

pat_tweet = re.compile(r"^tweet(?:_e)?_\d+$")
pat_bio   = re.compile(r"^bio(?:_e)?_\d+$")

def group_name(name: str) -> str:
    """Collapse embedding dimensions into "tweet"/"bio"; keep other names as-is."""
    if pat_tweet.match(name):
        return "tweet"
    if pat_bio.match(name):
        return "bio"
    return name

# Mean gain per group (per-dimension average for the two embedding groups).
g = (
    s.groupby(group_name)
     .mean()
     .sort_values(ascending=False)
     .head(30)
)


fig, ax = plt.subplots(figsize=(12, 8))
g.iloc[::-1].plot(kind="barh", ax=ax)  # reversed so the largest bar is on top
ax.set_title("Feature Importance (media per embedding) — Top 30")
ax.set_xlabel("Gain")
ax.set_ylabel("")
plt.tight_layout()
plt.show()


# Optional (slow) bootstrap confidence intervals; disabled by default.
DO_BOOTSTRAP = False
if DO_BOOTSTRAP:
    log("Bootstrap CI (potrebbe richiedere tempo)...")
    ci = bootstrap_ci(
        np.asarray(y_test),
        np.asarray(y_pred_proba),
        np.asarray(y_pred),
        n_boot=1000,
        seed=42,
    )
    acc_mean, acc_lo, acc_hi = ci['accuracy']
    f1_mean, f1_lo, f1_hi = ci['f1_bot']
    ap_mean, ap_lo, ap_hi = ci['aupr']
    print("\n--- Bootstrap 95% CI ---")
    print(f"Accuracy:  mean={acc_mean:.4f}  95%CI=({acc_lo:.4f}, {acc_hi:.4f})")
    print(f"F1(bot):   mean={f1_mean:.4f}   95%CI=({f1_lo:.4f}, {f1_hi:.4f})")
    print(f"AUCPR:     mean={ap_mean:.4f}    95%CI=({ap_lo:.4f}, {ap_hi:.4f})")
