In [None]:
from math import isnan

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
import optuna
import matplotlib.pyplot as plt

In [None]:
# Load the embedding matrix together with the user table stored next to it.
df_path = 'Datasets/'
users = pd.read_csv(df_path + 'v2_top10_embeddings_users.csv')
emb = np.load(df_path + 'v2_top10_embeddings.npy')

# pd.concat(axis=1) aligns on the index: if the two inputs had different row
# counts the shorter one would be padded with NaN instead of failing, so the
# row-for-row correspondence is checked explicitly.
assert len(users) == len(emb), (
    f"users has {len(users)} rows but the embedding matrix has {len(emb)}"
)
df2 = pd.concat([users, pd.DataFrame(emb)], axis=1)

print(f"Data shape: {df2.shape}")
print(f"Label distribution:\n{df2['label'].value_counts()}")
df2.head()

In [None]:
# Read the preprocessed per-user feature table and summarise it.
features_csv = df_path + 'df_preprocessed.csv'
features = pd.read_csv(features_csv)
print(f"Data shape: {features.shape}")
label_counts = features['label'].value_counts()
print(f"Label distribution:\n{label_counts}")
features.head()

In [None]:
# Description embeddings, appended column-wise to the feature table.
desc = np.load(df_path + 'description_embeddings.npy')
print(desc.shape)

# The column-wise concat aligns on the index and would pad a shorter input
# with NaN rather than fail, so require matching row counts up front.
assert len(features) == len(desc), (
    f"features has {len(features)} rows but description embeddings has {len(desc)}"
)
desc_merged = pd.concat([features, pd.DataFrame(desc)], axis=1)

In [None]:
# Prefix every all-digit column label with "D"; other labels are kept as-is.
desc_merged.columns = [
    f"D{col}" if str(col).isdigit() else col
    for col in desc_merged.columns
]
desc_merged.head()

In [None]:
# Mark every row of the embedding table; rows of desc_merged without a match
# in the left join below get has_embedding == 0 through fillna(0).
df2['has_embedding'] = 1

# Prefix every all-digit column label with "T"; other labels are kept as-is.
df2.columns = [f"T{col}" if str(col).isdigit() else col for col in df2.columns]

# Left join keeps every row of desc_merged; all missing values become 0.
df2_merged = (
    desc_merged
    .merge(df2, how="left", left_on="id", right_on="author_id")
    .fillna(0)
)
df2_merged.shape

In [None]:
# Preview the first five rows of the merged table.
df2_merged.head(n=5)

In [None]:
# End-to-end data preparation in a single cell: load both embedding sets and
# the feature table, then join them into df2_merged.
df_path = 'Datasets/'

# --- embedding matrix and the user table stored next to it ---
users = pd.read_csv(df_path + 'v2_top10_embeddings_users.csv')
emb = np.load(df_path + 'v2_top10_embeddings.npy')
# pd.concat(axis=1) aligns on the index and pads a shorter input with NaN
# instead of failing, so the row counts are checked explicitly.
assert len(users) == len(emb), (
    f"users has {len(users)} rows but the embedding matrix has {len(emb)}"
)
df2 = pd.concat([users, pd.DataFrame(emb)], axis=1)

print(f"Data shape: {df2.shape}")
print(f"Label distribution:\n{df2['label'].value_counts()}")

# --- preprocessed feature table and description embeddings ---
features = pd.read_csv(df_path + 'df_preprocessed.csv')
print(f"Data shape: {features.shape}")
print(f"Label distribution:\n{features['label'].value_counts()}")
desc = np.load(df_path + 'description_embeddings.npy')
print(desc.shape)
assert len(features) == len(desc), (
    f"features has {len(features)} rows but description embeddings has {len(desc)}"
)
desc_merged = pd.concat([features, pd.DataFrame(desc)], axis=1)

# All-digit column labels get a "D" prefix here and a "T" prefix on df2, which
# keeps the two embedding column sets distinct after the merge.
desc_merged.rename(columns=lambda c: f"D{c}" if str(c).isdigit() else c, inplace=True)
df2['has_embedding'] = 1
df2.rename(columns=lambda c: f"T{c}" if str(c).isdigit() else c, inplace=True)

# Left join keeps every row of desc_merged; rows without a match in df2 end up
# with has_embedding == 0 (and zero-filled T columns) through fillna(0).
df2_merged = desc_merged.merge(df2, left_on="id", right_on='author_id', how="left")
df2_merged = df2_merged.fillna(0)
df2_merged.shape

In [None]:
def prepare_data(df):
    """Turn the merged table into train/val/test feature frames and labels.

    Drops the metadata columns listed in ``cols_to_remove``, renames
    ``label_x``/``split_x`` to ``label``/``split``, integer-encodes ``label``
    with a ``LabelEncoder`` and partitions the rows by the value of ``split``
    ('train', 'val' or 'test').

    Args:
        df: DataFrame containing ``label_x``, ``split_x`` and every column
            named in ``cols_to_remove``; all remaining columns are used as
            model features.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, le)`` where
        each ``X_*`` is a DataFrame without the ``label``/``split`` columns,
        each ``y_*`` is the matching array of encoded labels and ``le`` is the
        fitted ``LabelEncoder``.

    Raises:
        KeyError: if a column listed in ``cols_to_remove`` is missing
            (raised by ``DataFrame.drop``).
    """
    print(f"Dataset shape: {df.shape}")

    cols_to_remove = ['created_at', 'description', 'entities', 'id', 'location', 'name', 'pinned_tweet_id', 'profile_image_url', 'protected', 'public_metrics', 'url', 'username', 'verified', 'account_age_days', 'author_id', 'label_y', 'split_y', 'withheld', 'n_tweets']
    df = df.drop(columns=cols_to_remove)
    df = df.rename(columns={'label_x': 'label', 'split_x': 'split'})

    le = LabelEncoder()
    y = le.fit_transform(df['label'])
    # Show one (original label, encoded value) pair so the mapping can be
    # checked by eye. Positional access is used because the frame's index is
    # not guaranteed to contain the label 0.
    if len(df) > 0:
        print(df['label'].iloc[0], y[0])

    # Partition rows by the existing split column. The boolean mask is turned
    # into a plain array so it selects rows of the frame and entries of the
    # label array by position in exactly the same way.
    feature_frame = df.drop(columns=['label', 'split'])
    parts = {}
    for split_name in ('train', 'val', 'test'):
        in_split = (df['split'] == split_name).to_numpy()
        parts[split_name] = (feature_frame[in_split], y[in_split])

    X_train, y_train = parts['train']
    X_val, y_val = parts['val']
    X_test, y_test = parts['test']

    print(f"\nSplit sizes - Train: {len(y_train)}, Val: {len(y_val)}, Test: {len(y_test)}")
    print(f"Train class distribution: {pd.Series(y_train).value_counts().to_dict()}")
    print(f"Validation class distribution: {pd.Series(y_val).value_counts().to_dict()}")
    print(f"Test class distribution: {pd.Series(y_test).value_counts().to_dict()}")

    return X_train, y_train, X_val, y_val, X_test, y_test, le

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost model and score it on validation.

    Reads the notebook-level variables ``dtrain``, ``dval``, ``y_val`` and
    ``scale_pos_weight``, which must be defined before the study is run.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        F1 score of class 0 on the validation set, with predictions
        thresholded at 0.5 (class 0 is the one reported as 'bot' by the
        classification reports in this notebook).
    """
    # Sampled hyperparameters plus the fixed training settings.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'aucpr',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        # Recorded by Optuna under the names reg_alpha / reg_lambda.
        'alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'lambda': trial.suggest_float('reg_lambda', 0.1, 20.0, log=True),
        'max_delta_step': trial.suggest_float('max_delta_step', 0.0, 5.0),
        'tree_method': 'hist',
        'scale_pos_weight': scale_pos_weight,
        'seed': 42
    }

    # Early stopping monitors aucpr on the validation set.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, 'val')],
        early_stopping_rounds=100,
        verbose_eval=False
    )

    # Booster.predict uses every boosted round by default, which includes the
    # rounds trained after the early-stopping optimum. Score the trial with
    # the best iteration only.
    val_proba = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    y_pred = (val_proba >= 0.5).astype(int)

    return f1_score(y_val, y_pred, pos_label=0)


In [None]:
# Build the three feature/label splits and keep the fitted label encoder.
splits = prepare_data(df2_merged)
X_train, y_train, X_val, y_val, X_test, y_test, le = splits

In [None]:
# Wrap each split in an xgboost DMatrix carrying its labels.
dtrain, dval, dtest = (
    xgb.DMatrix(split_features, label=split_labels)
    for split_features, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
# Ratio of class-0 to class-1 training rows, passed to XGBoost as
# scale_pos_weight.
n_neg = (y_train == 0).sum()
n_pos = (y_train == 1).sum()
scale_pos_weight = n_neg / n_pos
print(f"Scale pos weight: {scale_pos_weight:.2f}")

In [None]:
# Baseline booster: fixed learning rate, class weighting, otherwise defaults.
params = dict(
    objective='binary:logistic',
    eval_metric='aucpr',
    learning_rate=0.05,
    scale_pos_weight=scale_pos_weight,
    seed=42,
)

# Both sets are logged every 50 rounds; with several eval sets XGBoost uses
# the last one (val) for early stopping.
watchlist = [(dtrain, 'train'), (dval, 'val')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=50,
)

In [None]:
# Validation report for the baseline model at a 0.5 threshold.
# Booster.predict uses every boosted round by default, including the rounds
# trained after the early-stopping optimum, so restrict it to the best iteration.
y_val_pred = (
    model.predict(dval, iteration_range=(0, model.best_iteration + 1)) >= 0.5
).astype(int)
print(classification_report(y_val, y_val_pred, target_names=['bot', 'human']))

In [None]:
# Top 20 features of the baseline model.
baseline_importance_ax = xgb.plot_importance(model, max_num_features=20)
plt.show()

In [None]:
# Optuna hyperparameter search (maximises the value returned by `objective`).
# The sampler is seeded so that re-running the notebook explores the same
# sequence of trials; TPE is also the sampler create_study uses by default.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best trial:")
print(study.best_trial.params)
print("Best custom F1:", study.best_trial.value)

In [None]:
# Hyperparameters sampled by the best-scoring trial.
best_trial = study.best_trial
best_params = best_trial.params
print("Best parameters:", best_params)

In [None]:
# Retrain with the best Optuna parameters.
# Optuna records only the sampled values, so the fixed settings that
# `objective` trained with (objective, eval_metric, tree_method,
# scale_pos_weight, seed) are added back here. Without them the final model
# would be fitted under a different configuration than the one the best trial
# was scored with.
# The sampled names reg_alpha / reg_lambda are accepted by XGBoost as aliases
# of alpha / lambda.
best_params = {
    **best_params,
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'tree_method': 'hist',
    'scale_pos_weight': scale_pos_weight,
    'seed': 42,
}
best_model = xgb.train(
    best_params,
    dtrain,
    evals=[(dtrain, 'train'), (dval, 'val')],
    num_boost_round=1000,
    early_stopping_rounds=100,
    verbose_eval=50,
)

In [None]:
# Top 20 features of the tuned model.
tuned_importance_ax = xgb.plot_importance(best_model, max_num_features=20)
plt.show()

In [None]:
# Test-set predictions of the tuned model at a 0.5 threshold.
# Booster.predict uses every boosted round by default, including the rounds
# trained after the early-stopping optimum, so restrict it to the best iteration.
y_test_pred = (
    best_model.predict(dtest, iteration_range=(0, best_model.best_iteration + 1)) >= 0.5
).astype(int)

print(classification_report(y_test, y_test_pred, target_names=['bot', 'human']))

In [None]:
# Validation probabilities of the tuned model, used for threshold selection.
# Booster.predict uses every boosted round by default, including the rounds
# trained after the early-stopping optimum, so restrict it to the best iteration.
y_val_proba = best_model.predict(
    dval, iteration_range=(0, best_model.best_iteration + 1)
)
y_val_proba

In [None]:
# Precision/recall at every candidate threshold on the validation set.
pr_curve = precision_recall_curve(y_val, y_val_proba)
precision, recall, thresholds = pr_curve
tuple(curve_part.shape for curve_part in pr_curve)

In [None]:
"""valid = precision[:-1] >= 0.76
valid_recalls = recall[:-1][valid]
valid_thresholds = thresholds[valid]

best_idx = np.argmax(valid_recalls)
best_threshold = valid_thresholds[best_idx]"""

f1=2*precision*recall/(precision+recall+1e-9)
best_threshold = thresholds[np.argmax(f1)]

print(best_threshold)

In [None]:
# Validation report of the tuned model at the selected threshold.
# Booster.predict uses every boosted round by default, including the rounds
# trained after the early-stopping optimum, so restrict it to the best iteration.
y_val_proba = best_model.predict(
    dval, iteration_range=(0, best_model.best_iteration + 1)
)
y_val_pred = (y_val_proba >= best_threshold).astype(int)
print(classification_report(y_val, y_val_pred, target_names=['bot', 'human']))

In [None]:
# Test report of the tuned model at the threshold selected on validation.
# Booster.predict uses every boosted round by default, including the rounds
# trained after the early-stopping optimum, so restrict it to the best iteration.
y_test_proba = best_model.predict(
    dtest, iteration_range=(0, best_model.best_iteration + 1)
)
y_test_pred = (y_test_proba >= best_threshold).astype(int)
print(classification_report(y_test, y_test_pred, target_names=['bot', 'human']))