# Hyperparameter Tuning for Phishing Detection Models

Optuna-based hyperparameter optimization for:
- TF-IDF + Logistic Regression
- XGBoost Hybrid
- MLP Hybrid

**Objective:** Maximize Recall + ROC-AUC (catch phishing + good ranking)

## Setup and Imports

In [1]:
import os
import json
import numpy as np
import polars as pl
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from scipy.sparse import hstack
from sentence_transformers import SentenceTransformer
import xgboost as xgb
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
import warnings
warnings.filterwarnings('ignore')

print(f"Optuna: {optuna.__version__}")
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

  from .autonotebook import tqdm as notebook_tqdm


Optuna: 4.6.0
Device: cuda


## Configuration

In [2]:
config = {
    "random_seed": 42,
    "data_source": "../../data/emails_v5.parquet",
    "numeric_features": [
        "sender_domain_entropy", "has_attachment",
        "spf_flag_missing", "dkim_flag_missing",
        "num_links", "subject_length", "body_length", "keyword_count", 
        "num_received_headers", "num_exclamation_marks", 
        "num_malicious_links"
    ],
    "embedding_model": "all-MiniLM-L6-v2",
    "embedding_dim": 384,
    "n_trials_tfidf": 30,
    "n_trials_xgb": 30,
    "n_trials_mlp": 20
}

In [None]:
print("Loading dataset...")
df = pl.read_parquet(config["data_source"])
print(f"Total records: {len(df):,}")

external_df = df.filter(
    (pl.col("source") == "csv_misc/TREC-07.csv") | (pl.col("source") == "phishing-2020")
)

train_pool_df = df.filter(
    (pl.col("source") != "csv_misc/TREC-07.csv") & (pl.col("source") != "phishing-2020")
)

train_pool_pd = train_pool_df.to_pandas()
train_df_pd, test_df_pd = train_test_split(
    train_pool_pd, 
    test_size=0.25, 
    random_state=config["random_seed"], 
    stratify=train_pool_pd['phishing']
)

train_df = pl.from_pandas(train_df_pd)
test_df = pl.from_pandas(test_df_pd)

print(f"\nSplits - Train: {len(train_df):,} | Test: {len(test_df):,} | External: {len(external_df):,}")

Loading dataset...
Total records: 212,113
Total records: 212,113

Splits - Train: 119,121 | Test: 39,707 | External: 53,285

Splits - Train: 119,121 | Test: 39,707 | External: 53,285


## Extract Features

In [4]:
numeric_features = config["numeric_features"]

features_train = train_df.select(numeric_features).fill_null(0).to_numpy()
features_test = test_df.select(numeric_features).fill_null(0).to_numpy()

y_train = train_df['phishing'].to_numpy()
y_test = test_df['phishing'].to_numpy()

print(f"Features shape: {features_train.shape}")

Features shape: (119121, 11)


## Generate Embeddings

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer(config["embedding_model"], device=device)

print("Encoding train set...")
X_emb_train = embedding_model.encode(
    train_df['body_subject'].to_list(), 
    show_progress_bar=True, 
    convert_to_numpy=True,
    batch_size=32
)

print("Encoding test set...")
X_emb_test = embedding_model.encode(
    test_df['body_subject'].to_list(), 
    show_progress_bar=True, 
    convert_to_numpy=True,
    batch_size=32
)

print(f"Embeddings shape: {X_emb_train.shape}")

Encoding train set...


Batches: 100%|██████████| 3723/3723 [02:24<00:00, 25.69it/s] 



Encoding test set...


Batches: 100%|██████████| 1241/1241 [00:48<00:00, 25.65it/s] 



Embeddings shape: (119121, 384)


## TF-IDF Tuning

In [None]:
def tfidf_objective(trial):
    params = {
        'max_features': trial.suggest_int('max_features', 3000, 10000, step=1000),
        'min_df': trial.suggest_int('min_df', 2, 10),
        'max_df': trial.suggest_float('max_df', 0.5, 0.95, step=0.05),
        'ngram_max': trial.suggest_int('ngram_max', 1, 3),
        'C': trial.suggest_float('C', 0.1, 10.0, log=True),
        'class_weight_ratio': trial.suggest_float('class_weight_ratio', 1.0, 10.0, step=0.5),
        'threshold': trial.suggest_float('threshold', 0.2, 0.5, step=0.05)
    }
    
    vectorizer = TfidfVectorizer(
        max_features=params['max_features'],
        ngram_range=(1, params['ngram_max']),
        min_df=params['min_df'],
        max_df=params['max_df']
    )
    
    X_tfidf_train = vectorizer.fit_transform(train_df['body_subject'].to_list())
    X_tfidf_test = vectorizer.transform(test_df['body_subject'].to_list())
    
    X_combined_train = hstack([X_tfidf_train, features_train])
    X_combined_test = hstack([X_tfidf_test, features_test])
    
    clf = LogisticRegression(
        max_iter=1000,
        C=params['C'],
        class_weight={0: 1.0, 1: params['class_weight_ratio']},
        random_state=42,
        verbose=0
    )
    clf.fit(X_combined_train, y_train)
    
    y_proba = clf.predict_proba(X_combined_test)[:, 1]
    y_pred = (y_proba >= params['threshold']).astype(int)
    
    recall = recall_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('f1_score', f1)
    trial.set_user_attr('roc_auc', roc_auc)
    
    return 0.5 * recall + 0.5 * roc_auc

In [7]:
print("Starting TF-IDF tuning...")

wandb_callback = WeightsAndBiasesCallback(
    metric_name="objective_score",
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "tfidf"]}
)

tfidf_study = optuna.create_study(
    direction="maximize",
    study_name="tfidf-tuning",
    sampler=optuna.samplers.TPESampler(seed=42)
)

tfidf_study.optimize(
    tfidf_objective,
    n_trials=config["n_trials_tfidf"],
    callbacks=[wandb_callback],
    show_progress_bar=True
)

print(f"\nBest score: {tfidf_study.best_trial.value:.4f}")
print(f"Recall: {tfidf_study.best_trial.user_attrs['recall']:.4f}")
print(f"ROC-AUC: {tfidf_study.best_trial.user_attrs['roc_auc']:.4f}")
print(f"Precision: {tfidf_study.best_trial.user_attrs['precision']:.4f}")
print(f"F1: {tfidf_study.best_trial.user_attrs['f1_score']:.4f}")
print("\nBest params:")
for k, v in tfidf_study.best_trial.params.items():
    print(f"  {k}: {v}")

Starting TF-IDF tuning...


[34m[1mwandb[0m: Currently logged in as: [33mjastrzeb-michal[0m ([33mlatandu-phishstop[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[I 2025-11-28 19:38:48,627] A new study created in memory with name: tfidf-tuning
Best trial: 0. Best value: 0.994736:   3%|▎         | 1/30 [01:15<36:29, 75.51s/it]

[I 2025-11-28 19:40:04,135] Trial 0 finished with value: 0.994736473044562 and parameters: {'max_features': 5000, 'min_df': 10, 'max_df': 0.8500000000000001, 'ngram_max': 2, 'C': 0.20513382630874505, 'class_weight_ratio': 2.0, 'threshold': 0.2}. Best is trial 0 with value: 0.994736473044562.


Best trial: 1. Best value: 0.996445:   7%|▋         | 2/30 [01:40<21:20, 45.75s/it]

[I 2025-11-28 19:40:29,052] Trial 1 finished with value: 0.9964450379799736 and parameters: {'max_features': 9000, 'min_df': 7, 'max_df': 0.8500000000000001, 'ngram_max': 1, 'C': 8.706020878304859, 'class_weight_ratio': 8.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  10%|█         | 3/30 [02:59<27:23, 60.87s/it]

[I 2025-11-28 19:41:47,917] Trial 2 finished with value: 0.993421510247612 and parameters: {'max_features': 4000, 'min_df': 3, 'max_df': 0.65, 'ngram_max': 2, 'C': 0.7309539835912913, 'class_weight_ratio': 3.5, 'threshold': 0.4}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  13%|█▎        | 4/30 [04:27<30:59, 71.53s/it]

[I 2025-11-28 19:43:15,783] Trial 3 finished with value: 0.9927690158059053 and parameters: {'max_features': 4000, 'min_df': 4, 'max_df': 0.65, 'ngram_max': 2, 'C': 3.7183641805732095, 'class_weight_ratio': 2.5, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  17%|█▋        | 5/30 [04:52<22:54, 54.96s/it]

[I 2025-11-28 19:43:41,374] Trial 4 finished with value: 0.9956788858267935 and parameters: {'max_features': 7000, 'min_df': 2, 'max_df': 0.8, 'ngram_max': 1, 'C': 0.1349283426801325, 'class_weight_ratio': 10.0, 'threshold': 0.5}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  20%|██        | 6/30 [07:40<37:15, 93.15s/it]

[I 2025-11-28 19:46:28,650] Trial 5 finished with value: 0.9937040170083618 and parameters: {'max_features': 9000, 'min_df': 4, 'max_df': 0.5, 'ngram_max': 3, 'C': 0.7591104805282696, 'class_weight_ratio': 2.0, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  23%|██▎       | 7/30 [08:51<32:58, 86.04s/it]

[I 2025-11-28 19:47:40,038] Trial 6 finished with value: 0.9944951515893883 and parameters: {'max_features': 3000, 'min_df': 10, 'max_df': 0.6, 'ngram_max': 2, 'C': 0.420167205437253, 'class_weight_ratio': 5.5, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  27%|██▋       | 8/30 [11:22<39:09, 106.82s/it]

[I 2025-11-28 19:50:11,354] Trial 7 finished with value: 0.9925939652040557 and parameters: {'max_features': 4000, 'min_df': 10, 'max_df': 0.8500000000000001, 'ngram_max': 3, 'C': 6.161049539380966, 'class_weight_ratio': 6.5, 'threshold': 0.5}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  30%|███       | 9/30 [11:44<28:03, 80.15s/it] 

[I 2025-11-28 19:50:32,876] Trial 8 finished with value: 0.9915856979640528 and parameters: {'max_features': 3000, 'min_df': 3, 'max_df': 0.5, 'ngram_max': 1, 'C': 0.59890036722543, 'class_weight_ratio': 3.5, 'threshold': 0.45}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 1. Best value: 0.996445:  33%|███▎      | 10/30 [12:07<20:48, 62.43s/it]

[I 2025-11-28 19:50:55,630] Trial 9 finished with value: 0.9882342672184913 and parameters: {'max_features': 5000, 'min_df': 4, 'max_df': 0.75, 'ngram_max': 1, 'C': 4.02155452669029, 'class_weight_ratio': 1.5, 'threshold': 0.5}. Best is trial 1 with value: 0.9964450379799736.


Best trial: 10. Best value: 0.997433:  37%|███▋      | 11/30 [12:30<15:57, 50.39s/it]

[I 2025-11-28 19:51:18,729] Trial 10 finished with value: 0.9974328587624923 and parameters: {'max_features': 10000, 'min_df': 7, 'max_df': 0.95, 'ngram_max': 1, 'C': 2.012487082975094, 'class_weight_ratio': 9.0, 'threshold': 0.2}. Best is trial 10 with value: 0.9974328587624923.


Best trial: 11. Best value: 0.997513:  40%|████      | 12/30 [12:53<12:38, 42.16s/it]

[I 2025-11-28 19:51:42,039] Trial 11 finished with value: 0.9975129685676964 and parameters: {'max_features': 10000, 'min_df': 7, 'max_df': 0.95, 'ngram_max': 1, 'C': 2.1458782520594686, 'class_weight_ratio': 9.0, 'threshold': 0.2}. Best is trial 11 with value: 0.9975129685676964.


Best trial: 11. Best value: 0.997513:  43%|████▎     | 13/30 [13:16<10:20, 36.52s/it]

[I 2025-11-28 19:52:05,577] Trial 12 finished with value: 0.9969198881585042 and parameters: {'max_features': 10000, 'min_df': 7, 'max_df': 0.95, 'ngram_max': 1, 'C': 2.2409315737561504, 'class_weight_ratio': 8.0, 'threshold': 0.25}. Best is trial 11 with value: 0.9975129685676964.


Best trial: 13. Best value: 0.99778:  47%|████▋     | 14/30 [13:40<08:40, 32.52s/it] 

[I 2025-11-28 19:52:28,855] Trial 13 finished with value: 0.9977798663691383 and parameters: {'max_features': 8000, 'min_df': 8, 'max_df': 0.95, 'ngram_max': 1, 'C': 1.3641825102181804, 'class_weight_ratio': 10.0, 'threshold': 0.2}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 13. Best value: 0.99778:  50%|█████     | 15/30 [14:52<11:07, 44.48s/it]

[I 2025-11-28 19:53:41,045] Trial 14 finished with value: 0.9972883094612066 and parameters: {'max_features': 8000, 'min_df': 8, 'max_df': 0.95, 'ngram_max': 2, 'C': 1.5192022573669297, 'class_weight_ratio': 10.0, 'threshold': 0.25}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 13. Best value: 0.99778:  53%|█████▎    | 16/30 [15:17<09:01, 38.69s/it]

[I 2025-11-28 19:54:06,287] Trial 15 finished with value: 0.9968470933397493 and parameters: {'max_features': 7000, 'min_df': 8, 'max_df': 0.9, 'ngram_max': 1, 'C': 1.1729056439972787, 'class_weight_ratio': 7.0, 'threshold': 0.30000000000000004}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 13. Best value: 0.99778:  57%|█████▋    | 17/30 [15:40<07:20, 33.89s/it]

[I 2025-11-28 19:54:29,026] Trial 16 finished with value: 0.997583891564437 and parameters: {'max_features': 8000, 'min_df': 6, 'max_df': 0.9, 'ngram_max': 1, 'C': 0.35350398321438675, 'class_weight_ratio': 5.5, 'threshold': 0.2}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 13. Best value: 0.99778:  60%|██████    | 18/30 [18:09<13:43, 68.66s/it]

[I 2025-11-28 19:56:58,615] Trial 17 finished with value: 0.9965261319928477 and parameters: {'max_features': 8000, 'min_df': 5, 'max_df': 0.75, 'ngram_max': 3, 'C': 0.2748057753914886, 'class_weight_ratio': 5.0, 'threshold': 0.30000000000000004}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 13. Best value: 0.99778:  63%|██████▎   | 19/30 [18:35<10:13, 55.76s/it]

[I 2025-11-28 19:57:24,334] Trial 18 finished with value: 0.9955835978163017 and parameters: {'max_features': 6000, 'min_df': 6, 'max_df': 0.9, 'ngram_max': 1, 'C': 0.10066897143884218, 'class_weight_ratio': 4.5, 'threshold': 0.30000000000000004}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 13. Best value: 0.99778:  67%|██████▋   | 20/30 [19:49<10:11, 61.20s/it]

[I 2025-11-28 19:58:38,198] Trial 19 finished with value: 0.9971816756508902 and parameters: {'max_features': 8000, 'min_df': 9, 'max_df': 0.8, 'ngram_max': 2, 'C': 0.35786662217765414, 'class_weight_ratio': 7.0, 'threshold': 0.25}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 13. Best value: 0.99778:  70%|███████   | 21/30 [21:02<09:41, 64.64s/it]

[I 2025-11-28 19:59:50,868] Trial 20 finished with value: 0.9971948854408378 and parameters: {'max_features': 9000, 'min_df': 6, 'max_df': 0.9, 'ngram_max': 2, 'C': 0.4663915875553889, 'class_weight_ratio': 4.0, 'threshold': 0.2}. Best is trial 13 with value: 0.9977798663691383.


Best trial: 21. Best value: 0.997956:  73%|███████▎  | 22/30 [21:25<06:58, 52.29s/it]

[I 2025-11-28 20:00:14,363] Trial 21 finished with value: 0.9979556113986807 and parameters: {'max_features': 10000, 'min_df': 8, 'max_df': 0.95, 'ngram_max': 1, 'C': 1.299304926078878, 'class_weight_ratio': 9.0, 'threshold': 0.2}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956:  77%|███████▋  | 23/30 [21:48<05:04, 43.50s/it]

[I 2025-11-28 20:00:37,352] Trial 22 finished with value: 0.9975236734577518 and parameters: {'max_features': 7000, 'min_df': 8, 'max_df': 0.9, 'ngram_max': 1, 'C': 1.0557785263494976, 'class_weight_ratio': 6.0, 'threshold': 0.2}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956:  80%|████████  | 24/30 [22:11<03:44, 37.35s/it]

[I 2025-11-28 20:01:00,373] Trial 23 finished with value: 0.9971011820830136 and parameters: {'max_features': 9000, 'min_df': 9, 'max_df': 0.95, 'ngram_max': 1, 'C': 1.4070480004646564, 'class_weight_ratio': 7.5, 'threshold': 0.25}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956:  83%|████████▎ | 25/30 [22:34<02:45, 33.11s/it]

[I 2025-11-28 20:01:23,586] Trial 24 finished with value: 0.9972862666428994 and parameters: {'max_features': 8000, 'min_df': 5, 'max_df': 0.8, 'ngram_max': 1, 'C': 2.968835783743786, 'class_weight_ratio': 9.5, 'threshold': 0.2}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956:  87%|████████▋ | 26/30 [22:57<01:59, 29.96s/it]

[I 2025-11-28 20:01:46,201] Trial 25 finished with value: 0.9968119594754972 and parameters: {'max_features': 6000, 'min_df': 6, 'max_df': 0.8500000000000001, 'ngram_max': 1, 'C': 0.8617105904678982, 'class_weight_ratio': 8.0, 'threshold': 0.30000000000000004}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956:  90%|█████████ | 27/30 [23:20<01:23, 27.91s/it]

[I 2025-11-28 20:02:09,315] Trial 26 finished with value: 0.9970735576971775 and parameters: {'max_features': 9000, 'min_df': 9, 'max_df': 0.9, 'ngram_max': 1, 'C': 0.18379621110561523, 'class_weight_ratio': 6.0, 'threshold': 0.25}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956:  93%|█████████▎| 28/30 [24:32<01:22, 41.01s/it]

[I 2025-11-28 20:03:20,896] Trial 27 finished with value: 0.996804712365439 and parameters: {'max_features': 10000, 'min_df': 8, 'max_df': 0.95, 'ngram_max': 2, 'C': 0.5194925393280307, 'class_weight_ratio': 9.0, 'threshold': 0.4}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956:  97%|█████████▋| 29/30 [24:55<00:35, 35.63s/it]

[I 2025-11-28 20:03:43,966] Trial 28 finished with value: 0.9976987201436557 and parameters: {'max_features': 8000, 'min_df': 5, 'max_df': 0.7, 'ngram_max': 1, 'C': 0.3274111199820186, 'class_weight_ratio': 8.0, 'threshold': 0.2}. Best is trial 21 with value: 0.9979556113986807.


Best trial: 21. Best value: 0.997956: 100%|██████████| 30/30 [26:05<00:00, 52.17s/it]

[I 2025-11-28 20:04:53,793] Trial 29 finished with value: 0.9973318247544363 and parameters: {'max_features': 6000, 'min_df': 5, 'max_df': 0.7, 'ngram_max': 2, 'C': 1.6003914397787575, 'class_weight_ratio': 10.0, 'threshold': 0.2}. Best is trial 21 with value: 0.9979556113986807.

Best score: 0.9980
Recall: 0.9984
ROC-AUC: 0.9975
Precision: 0.8430
F1: 0.9141

Best params:
  max_features: 10000
  min_df: 8
  max_df: 0.95
  ngram_max: 1
  C: 1.299304926078878
  class_weight_ratio: 9.0
  threshold: 0.2





In [8]:
fig = optuna.visualization.plot_optimization_history(tfidf_study)
fig.show()

fig = optuna.visualization.plot_param_importances(tfidf_study)
fig.show()

## XGBoost Tuning

In [None]:
def xgboost_objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0, step=0.5),
        'threshold': trial.suggest_float('threshold', 0.2, 0.5, step=0.05)
    }
    
    X_train = np.concatenate([X_emb_train, features_train], axis=1)
    X_test = np.concatenate([X_emb_test, features_test], axis=1)
    
    clf = xgb.XGBClassifier(
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        learning_rate=params['learning_rate'],
        min_child_weight=params['min_child_weight'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        scale_pos_weight=params['scale_pos_weight'],
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10,
        verbosity=0
    )
    
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    
    y_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= params['threshold']).astype(int)
    
    recall = recall_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('f1_score', f1)
    trial.set_user_attr('roc_auc', roc_auc)
    
    return 0.5 * recall + 0.5 * roc_auc

In [10]:
print("Starting XGBoost tuning...")

wandb_callback_xgb = WeightsAndBiasesCallback(
    metric_name="objective_score",
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "xgboost"]}
)

xgb_study = optuna.create_study(
    direction="maximize",
    study_name="xgboost-tuning",
    sampler=optuna.samplers.TPESampler(seed=42)
)

xgb_study.optimize(
    xgboost_objective,
    n_trials=config["n_trials_xgb"],
    callbacks=[wandb_callback_xgb],
    show_progress_bar=True
)

print(f"\nBest score: {xgb_study.best_trial.value:.4f}")
print(f"Recall: {xgb_study.best_trial.user_attrs['recall']:.4f}")
print(f"ROC-AUC: {xgb_study.best_trial.user_attrs['roc_auc']:.4f}")
print(f"Precision: {xgb_study.best_trial.user_attrs['precision']:.4f}")
print(f"F1: {xgb_study.best_trial.user_attrs['f1_score']:.4f}")
print("\nBest params:")
for k, v in xgb_study.best_trial.params.items():
    print(f"  {k}: {v}")

Starting XGBoost tuning...


0,1
C,▁█▂▄▁▂▁▆▁▄▃▃▃▂▂▂▁▁▁▁▁▂▂▂▃▂▁▁▁▂
class_weight_ratio,▁▇▃▂█▁▄▅▃▁▇▇▆██▆▄▄▃▆▃▇▅▆█▆▅▇▆█
max_df,▆▆▃▃▆▁▃▆▁▅█████▇▇▅▇▆▇█▇█▆▆▇█▄▄
max_features,▃▇▂▂▅▇▁▂▁▃███▆▆▅▆▆▄▆▇█▅▇▆▄▇█▆▄
min_df,█▅▂▃▁▃██▂▃▅▅▅▆▆▆▅▄▅▇▅▆▆▇▄▅▇▆▄▄
ngram_max,▅▁▅▅▁█▅█▁▁▁▁▁▁▅▁▁█▁▅▅▁▁▁▁▁▁▅▁▅
objective_score,▆▇▅▄▆▅▆▄▃▁██▇██▇█▇▆▇▇██▇█▇▇▇██
threshold,▁▂▆▄█▄▄█▇█▁▁▂▁▂▃▁▃▃▂▁▁▁▂▁▃▂▆▁▁

0,1
C,1.60039
class_weight_ratio,10.0
max_df,0.7
max_features,6000.0
min_df,5.0
ngram_max,2.0
objective_score,0.99733
threshold,0.2


[I 2025-11-28 20:05:00,001] A new study created in memory with name: xgboost-tuning
Best trial: 0. Best value: 0.98696:   3%|▎         | 1/30 [01:11<34:26, 71.26s/it]

[I 2025-11-28 20:06:11,250] Trial 0 finished with value: 0.9869602103896618 and parameters: {'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1205712628744377, 'min_child_weight': 6, 'subsample': 0.65, 'colsample_bytree': 0.65, 'scale_pos_weight': 1.5, 'threshold': 0.5}. Best is trial 0 with value: 0.9869602103896618.


Best trial: 1. Best value: 0.99346:   7%|▋         | 2/30 [03:52<57:59, 124.25s/it]

[I 2025-11-28 20:08:52,606] Trial 1 finished with value: 0.9934603055210343 and parameters: {'max_depth': 9, 'n_estimators': 400, 'learning_rate': 0.010725209743171996, 'min_child_weight': 10, 'subsample': 0.95, 'colsample_bytree': 0.65, 'scale_pos_weight': 2.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9934603055210343.


Best trial: 2. Best value: 0.993976:  10%|█         | 3/30 [04:45<41:12, 91.59s/it] 

[I 2025-11-28 20:09:45,316] Trial 2 finished with value: 0.9939762117789348 and parameters: {'max_depth': 6, 'n_estimators': 300, 'learning_rate': 0.04345454109729477, 'min_child_weight': 3, 'subsample': 0.85, 'colsample_bytree': 0.65, 'scale_pos_weight': 3.5, 'threshold': 0.30000000000000004}. Best is trial 2 with value: 0.9939762117789348.


Best trial: 3. Best value: 0.99564:  13%|█▎        | 4/30 [06:25<41:10, 95.01s/it] 

[I 2025-11-28 20:11:25,581] Trial 3 finished with value: 0.9956395769118351 and parameters: {'max_depth': 7, 'n_estimators': 450, 'learning_rate': 0.019721610970574007, 'min_child_weight': 6, 'subsample': 0.85, 'colsample_bytree': 0.6, 'scale_pos_weight': 6.5, 'threshold': 0.25}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  17%|█▋        | 5/30 [07:06<31:30, 75.61s/it]

[I 2025-11-28 20:12:06,778] Trial 4 finished with value: 0.9931190641443381 and parameters: {'max_depth': 3, 'n_estimators': 500, 'learning_rate': 0.26690431824362526, 'min_child_weight': 9, 'subsample': 0.7, 'colsample_bytree': 0.6, 'scale_pos_weight': 7.5, 'threshold': 0.35000000000000003}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  20%|██        | 6/30 [07:44<25:02, 62.61s/it]

[I 2025-11-28 20:12:44,160] Trial 5 finished with value: 0.9853849488123251 and parameters: {'max_depth': 4, 'n_estimators': 300, 'learning_rate': 0.011240768803005551, 'min_child_weight': 10, 'subsample': 0.7, 'colsample_bytree': 0.85, 'scale_pos_weight': 3.5, 'threshold': 0.35000000000000003}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  23%|██▎       | 7/30 [08:21<20:47, 54.22s/it]

[I 2025-11-28 20:13:21,101] Trial 6 finished with value: 0.9890975361287755 and parameters: {'max_depth': 8, 'n_estimators': 150, 'learning_rate': 0.27051668818999286, 'min_child_weight': 8, 'subsample': 1.0, 'colsample_bytree': 1.0, 'scale_pos_weight': 6.5, 'threshold': 0.5}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  27%|██▋       | 8/30 [08:38<15:36, 42.57s/it]

[I 2025-11-28 20:13:38,735] Trial 7 finished with value: 0.9775261158330454 and parameters: {'max_depth': 3, 'n_estimators': 150, 'learning_rate': 0.011662890273931383, 'min_child_weight': 4, 'subsample': 0.75, 'colsample_bytree': 0.7, 'scale_pos_weight': 8.5, 'threshold': 0.30000000000000004}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  30%|███       | 9/30 [09:26<15:26, 44.14s/it]

[I 2025-11-28 20:14:26,325] Trial 8 finished with value: 0.9929046589414979 and parameters: {'max_depth': 5, 'n_estimators': 300, 'learning_rate': 0.016149614799999188, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 1.0, 'scale_pos_weight': 8.0, 'threshold': 0.25}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  33%|███▎      | 10/30 [10:03<14:00, 42.02s/it]

[I 2025-11-28 20:15:03,588] Trial 9 finished with value: 0.9945656529693154 and parameters: {'max_depth': 3, 'n_estimators': 450, 'learning_rate': 0.11069143219393454, 'min_child_weight': 8, 'subsample': 0.9, 'colsample_bytree': 0.6, 'scale_pos_weight': 4.0, 'threshold': 0.2}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  37%|███▋      | 11/30 [15:41<41:58, 132.54s/it]

[I 2025-11-28 20:20:41,391] Trial 10 finished with value: 0.9898773072882222 and parameters: {'max_depth': 12, 'n_estimators': 400, 'learning_rate': 0.02847748683027988, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'scale_pos_weight': 10.0, 'threshold': 0.4}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  40%|████      | 12/30 [17:48<39:16, 130.92s/it]

[I 2025-11-28 20:22:48,615] Trial 11 finished with value: 0.9938155907840986 and parameters: {'max_depth': 10, 'n_estimators': 400, 'learning_rate': 0.09191338148828515, 'min_child_weight': 6, 'subsample': 0.9, 'colsample_bytree': 0.75, 'scale_pos_weight': 5.0, 'threshold': 0.2}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 3. Best value: 0.99564:  43%|████▎     | 13/30 [19:08<32:42, 115.47s/it]

[I 2025-11-28 20:24:08,515] Trial 12 finished with value: 0.9946007085146547 and parameters: {'max_depth': 7, 'n_estimators': 450, 'learning_rate': 0.11554733318395786, 'min_child_weight': 7, 'subsample': 0.85, 'colsample_bytree': 0.6, 'scale_pos_weight': 5.0, 'threshold': 0.2}. Best is trial 3 with value: 0.9956395769118351.


Best trial: 13. Best value: 0.996157:  47%|████▋     | 14/30 [20:47<29:25, 110.36s/it]

[I 2025-11-28 20:25:47,073] Trial 13 finished with value: 0.9961567108186926 and parameters: {'max_depth': 8, 'n_estimators': 350, 'learning_rate': 0.025120557149506666, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 6.0, 'threshold': 0.2}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  50%|█████     | 15/30 [23:01<29:23, 117.60s/it]

[I 2025-11-28 20:28:01,450] Trial 14 finished with value: 0.9945234834560694 and parameters: {'max_depth': 10, 'n_estimators': 250, 'learning_rate': 0.024353790437615694, 'min_child_weight': 4, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 6.5, 'threshold': 0.25}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  53%|█████▎    | 16/30 [24:41<26:10, 112.17s/it]

[I 2025-11-28 20:29:41,004] Trial 15 finished with value: 0.9948761430775845 and parameters: {'max_depth': 8, 'n_estimators': 350, 'learning_rate': 0.025322427778819726, 'min_child_weight': 5, 'subsample': 0.75, 'colsample_bytree': 0.9, 'scale_pos_weight': 6.5, 'threshold': 0.30000000000000004}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  57%|█████▋    | 17/30 [27:53<29:31, 136.24s/it]

[I 2025-11-28 20:32:53,238] Trial 16 finished with value: 0.9898881218247554 and parameters: {'max_depth': 12, 'n_estimators': 250, 'learning_rate': 0.0499761871245459, 'min_child_weight': 2, 'subsample': 0.85, 'colsample_bytree': 0.9, 'scale_pos_weight': 9.5, 'threshold': 0.4}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  60%|██████    | 18/30 [28:16<20:29, 102.43s/it]

[I 2025-11-28 20:33:16,949] Trial 17 finished with value: 0.9916991990377008 and parameters: {'max_depth': 7, 'n_estimators': 100, 'learning_rate': 0.017850705608912134, 'min_child_weight': 5, 'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 5.5, 'threshold': 0.25}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  63%|██████▎   | 19/30 [30:48<21:27, 117.08s/it]

[I 2025-11-28 20:35:48,157] Trial 18 finished with value: 0.9952746917080426 and parameters: {'max_depth': 10, 'n_estimators': 350, 'learning_rate': 0.035984569744578344, 'min_child_weight': 7, 'subsample': 0.9, 'colsample_bytree': 0.95, 'scale_pos_weight': 7.0, 'threshold': 0.2}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  67%|██████▋   | 20/30 [33:02<20:21, 122.12s/it]

[I 2025-11-28 20:38:02,028] Trial 19 finished with value: 0.9919598704855804 and parameters: {'max_depth': 9, 'n_estimators': 450, 'learning_rate': 0.07097783577505475, 'min_child_weight': 4, 'subsample': 0.75, 'colsample_bytree': 0.75, 'scale_pos_weight': 9.0, 'threshold': 0.4}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  70%|███████   | 21/30 [34:01<15:31, 103.46s/it]

[I 2025-11-28 20:39:01,968] Trial 20 finished with value: 0.9933932110138112 and parameters: {'max_depth': 6, 'n_estimators': 350, 'learning_rate': 0.017853715956588852, 'min_child_weight': 3, 'subsample': 0.8, 'colsample_bytree': 0.85, 'scale_pos_weight': 4.5, 'threshold': 0.30000000000000004}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  73%|███████▎  | 22/30 [36:35<15:47, 118.39s/it]

[I 2025-11-28 20:41:35,177] Trial 21 finished with value: 0.9951917637272952 and parameters: {'max_depth': 10, 'n_estimators': 350, 'learning_rate': 0.03663285847574814, 'min_child_weight': 7, 'subsample': 0.9, 'colsample_bytree': 0.95, 'scale_pos_weight': 7.0, 'threshold': 0.2}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  77%|███████▋  | 23/30 [39:11<15:07, 129.69s/it]

[I 2025-11-28 20:44:11,223] Trial 22 finished with value: 0.9947437801991201 and parameters: {'max_depth': 11, 'n_estimators': 250, 'learning_rate': 0.03337909106639233, 'min_child_weight': 7, 'subsample': 0.95, 'colsample_bytree': 0.95, 'scale_pos_weight': 6.0, 'threshold': 0.2}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 13. Best value: 0.996157:  80%|████████  | 24/30 [41:02<12:25, 124.20s/it]

[I 2025-11-28 20:46:02,627] Trial 23 finished with value: 0.993985820204212 and parameters: {'max_depth': 9, 'n_estimators': 400, 'learning_rate': 0.06543322578738332, 'min_child_weight': 6, 'subsample': 0.85, 'colsample_bytree': 0.95, 'scale_pos_weight': 8.0, 'threshold': 0.25}. Best is trial 13 with value: 0.9961567108186926.


Best trial: 24. Best value: 0.996357:  83%|████████▎ | 25/30 [42:46<09:51, 118.23s/it]

[I 2025-11-28 20:47:46,925] Trial 24 finished with value: 0.9963570532080941 and parameters: {'max_depth': 8, 'n_estimators': 350, 'learning_rate': 0.02267537876243619, 'min_child_weight': 5, 'subsample': 0.95, 'colsample_bytree': 0.85, 'scale_pos_weight': 7.5, 'threshold': 0.2}. Best is trial 24 with value: 0.9963570532080941.


Best trial: 24. Best value: 0.996357:  87%|████████▋ | 26/30 [44:57<08:07, 121.82s/it]

[I 2025-11-28 20:49:57,124] Trial 25 finished with value: 0.9951904310004636 and parameters: {'max_depth': 8, 'n_estimators': 450, 'learning_rate': 0.021451412014512835, 'min_child_weight': 5, 'subsample': 0.95, 'colsample_bytree': 0.85, 'scale_pos_weight': 5.5, 'threshold': 0.25}. Best is trial 24 with value: 0.9963570532080941.


Best trial: 24. Best value: 0.996357:  90%|█████████ | 27/30 [46:50<05:58, 119.37s/it]

[I 2025-11-28 20:51:50,786] Trial 26 finished with value: 0.9962903581273259 and parameters: {'max_depth': 7, 'n_estimators': 500, 'learning_rate': 0.014718769152251596, 'min_child_weight': 3, 'subsample': 0.8, 'colsample_bytree': 0.8, 'scale_pos_weight': 7.5, 'threshold': 0.2}. Best is trial 24 with value: 0.9963570532080941.


Best trial: 24. Best value: 0.996357:  93%|█████████▎| 28/30 [47:22<03:06, 93.06s/it] 

[I 2025-11-28 20:52:22,448] Trial 27 finished with value: 0.9885684102910632 and parameters: {'max_depth': 5, 'n_estimators': 200, 'learning_rate': 0.014299942001980303, 'min_child_weight': 3, 'subsample': 0.7, 'colsample_bytree': 0.75, 'scale_pos_weight': 8.5, 'threshold': 0.45}. Best is trial 24 with value: 0.9963570532080941.


Best trial: 24. Best value: 0.996357:  97%|█████████▋| 29/30 [49:51<01:49, 109.91s/it]

[I 2025-11-28 20:54:51,693] Trial 28 finished with value: 0.9963364364069747 and parameters: {'max_depth': 8, 'n_estimators': 500, 'learning_rate': 0.013423123506617844, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.85, 'scale_pos_weight': 8.0, 'threshold': 0.2}. Best is trial 24 with value: 0.9963570532080941.


Best trial: 24. Best value: 0.996357: 100%|██████████| 30/30 [51:20<00:00, 102.69s/it]

[I 2025-11-28 20:56:20,634] Trial 29 finished with value: 0.99581954985735 and parameters: {'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.013975013274515223, 'min_child_weight': 1, 'subsample': 0.65, 'colsample_bytree': 0.8, 'scale_pos_weight': 7.5, 'threshold': 0.2}. Best is trial 24 with value: 0.9963570532080941.

Best score: 0.9964
Recall: 0.9965
ROC-AUC: 0.9962
Precision: 0.8070
F1: 0.8918

Best params:
  max_depth: 8
  n_estimators: 350
  learning_rate: 0.02267537876243619
  min_child_weight: 5
  subsample: 0.95
  colsample_bytree: 0.85
  scale_pos_weight: 7.5
  threshold: 0.2





In [11]:
fig = optuna.visualization.plot_optimization_history(xgb_study)
fig.show()

fig = optuna.visualization.plot_param_importances(xgb_study)
fig.show()

## MLP Tuning

In [None]:
from hybrid_mlp_model import HybridMLPClassifier

def mlp_objective(trial):
    params = {
        'feature_hidden_dim': trial.suggest_categorical('feature_hidden_dim', [16, 32, 64, 128]),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5, step=0.1),
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'pos_weight': trial.suggest_float('pos_weight', 1.0, 10.0, step=0.5),
        'threshold': trial.suggest_float('threshold', 0.2, 0.5, step=0.05)
    }
    
    X_emb_train_fit, X_emb_val, X_feat_train_fit, X_feat_val, y_train_fit, y_val = train_test_split(
        X_emb_train, features_train, y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train
    )
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = HybridMLPClassifier(
        embedding_dim=config["embedding_dim"],
        num_features=len(numeric_features),
        feature_hidden_dim=params['feature_hidden_dim'],
        dropout=params['dropout']
    ).to(device)
    
    train_loader = DataLoader(
        TensorDataset(
            torch.FloatTensor(X_emb_train_fit),
            torch.FloatTensor(X_feat_train_fit),
            torch.FloatTensor(y_train_fit)
        ),
        batch_size=params['batch_size'],
        shuffle=True
    )
    
    val_loader = DataLoader(
        TensorDataset(
            torch.FloatTensor(X_emb_val),
            torch.FloatTensor(X_feat_val),
            torch.FloatTensor(y_val)
        ),
        batch_size=params['batch_size'],
        shuffle=False
    )
    
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=params['lr'],
        weight_decay=params['weight_decay']
    )
    
    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([params['pos_weight']], device=device)
    )
    
    best_val_loss = float('inf')
    patience = 5
    patience_counter = 0
    
    for epoch in range(30):
        model.train()
        for embeddings, features, labels in train_loader:
            embeddings, features, labels = embeddings.to(device), features.to(device), labels.to(device)
            
            optimizer.zero_grad()
            logits = model(embeddings, features).squeeze()
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for embeddings, features, labels in val_loader:
                embeddings, features, labels = embeddings.to(device), features.to(device), labels.to(device)
                logits = model(embeddings, features).squeeze()
                val_loss += criterion(logits, labels).item()
        
        val_loss /= len(val_loader)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
    
    test_loader = DataLoader(
        TensorDataset(
            torch.FloatTensor(X_emb_test),
            torch.FloatTensor(features_test),
            torch.FloatTensor(y_test)
        ),
        batch_size=params['batch_size'],
        shuffle=False
    )
    
    model.eval()
    test_probs = []
    test_labels = []
    
    with torch.no_grad():
        for embeddings, features, labels in test_loader:
            embeddings, features = embeddings.to(device), features.to(device)
            logits = model(embeddings, features).squeeze()
            probs = torch.sigmoid(logits)
            
            if probs.dim() == 0:
                test_probs.append(probs.cpu().item())
                test_labels.append(labels.item())
            else:
                test_probs.extend(probs.cpu().numpy())
                test_labels.extend(labels.numpy())
    
    test_probs = np.array(test_probs)
    test_labels = np.array(test_labels)
    test_preds = (test_probs >= params['threshold']).astype(int)
    
    recall = recall_score(test_labels, test_preds, zero_division=0)
    precision = precision_score(test_labels, test_preds, zero_division=0)
    f1 = f1_score(test_labels, test_preds, zero_division=0)
    roc_auc = roc_auc_score(test_labels, test_probs)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('f1_score', f1)
    trial.set_user_attr('roc_auc', roc_auc)
    
    return 0.5 * recall + 0.5 * roc_auc

In [13]:
print("Starting MLP tuning...")

wandb_callback_mlp = WeightsAndBiasesCallback(
    metric_name="objective_score",
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "mlp"]}
)

mlp_study = optuna.create_study(
    direction="maximize",
    study_name="mlp-tuning",
    sampler=optuna.samplers.TPESampler(seed=42)
)

mlp_study.optimize(
    mlp_objective,
    n_trials=config["n_trials_mlp"],
    callbacks=[wandb_callback_mlp],
    show_progress_bar=True
)

print(f"\nBest score: {mlp_study.best_trial.value:.4f}")
print(f"Recall: {mlp_study.best_trial.user_attrs['recall']:.4f}")
print(f"ROC-AUC: {mlp_study.best_trial.user_attrs['roc_auc']:.4f}")
print(f"Precision: {mlp_study.best_trial.user_attrs['precision']:.4f}")
print(f"F1: {mlp_study.best_trial.user_attrs['f1_score']:.4f}")
print("\nBest params:")
for k, v in mlp_study.best_trial.params.items():
    print(f"  {k}: {v}")

Starting MLP tuning...


0,1
colsample_bytree,▂▂▂▁▁▅█▃█▁▅▄▁▆▆▆▆▅▇▄▅▇▇▇▅▅▅▄▅▅
learning_rate,▄▁▂▁█▁█▁▁▄▁▃▄▁▁▁▂▁▂▃▁▂▂▂▁▁▁▁▁▁
max_depth,▃▆▃▄▁▂▅▁▃▁█▆▄▅▆▅█▄▆▆▃▆▇▆▅▅▄▃▅▃
min_child_weight,▅█▃▅▇█▆▃▇▆▁▅▆▄▃▄▂▄▆▃▃▆▆▅▄▄▃▃▁▁
n_estimators,█▆▅▇█▅▂▂▅▇▆▆▇▅▄▅▄▁▅▇▅▅▄▆▅▇█▃██
objective_score,▅▇▇█▇▄▅▁▇▇▆▇▇█▇▇▆▆█▆▇█▇▇███▅██
scale_pos_weight,▁▂▃▅▆▃▅▇▆▃█▄▄▅▅▅█▄▆▇▃▆▅▆▆▄▆▇▆▆
subsample,▂▇▅▅▃▃█▄▁▆▅▆▅▅▅▄▅█▆▄▅▆▇▅▇▇▅▃▅▂
threshold,█▂▃▂▄▄█▃▂▁▆▁▁▁▂▃▆▂▁▆▃▁▁▂▁▂▁▇▁▁

0,1
colsample_bytree,0.8
learning_rate,0.01398
max_depth,6.0
min_child_weight,1.0
n_estimators,500.0
objective_score,0.99582
scale_pos_weight,7.5
subsample,0.65
threshold,0.2


[I 2025-11-28 20:56:23,043] A new study created in memory with name: mlp-tuning
Best trial: 0. Best value: 0.987783:   5%|▌         | 1/20 [01:46<33:39, 106.31s/it]

[I 2025-11-28 20:58:09,350] Trial 0 finished with value: 0.9877833291093334 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.1, 'lr': 0.00020511104188433984, 'weight_decay': 1.3066739238053272e-05, 'batch_size': 32, 'pos_weight': 1.0, 'threshold': 0.5}. Best is trial 0 with value: 0.9877833291093334.


Best trial: 1. Best value: 0.994379:  10%|█         | 2/20 [02:30<20:54, 69.71s/it] 

[I 2025-11-28 20:58:53,440] Trial 1 finished with value: 0.9943791234254895 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.2, 'lr': 0.0011207606211860567, 'weight_decay': 7.309539835912905e-05, 'batch_size': 64, 'pos_weight': 3.5, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9943791234254895.


Best trial: 1. Best value: 0.994379:  15%|█▌        | 3/20 [03:42<20:03, 70.79s/it]

[I 2025-11-28 21:00:05,509] Trial 2 finished with value: 0.9935766306441438 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.30000000000000004, 'lr': 0.0001238513729886094, 'weight_decay': 0.000164092867306479, 'batch_size': 128, 'pos_weight': 10.0, 'threshold': 0.45}. Best is trial 1 with value: 0.9943791234254895.


Best trial: 1. Best value: 0.994379:  20%|██        | 4/20 [04:39<17:24, 65.26s/it]

[I 2025-11-28 21:01:02,296] Trial 3 finished with value: 0.9931151162184788 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.1, 'lr': 0.0009780337016659412, 'weight_decay': 1.1715937392307055e-05, 'batch_size': 32, 'pos_weight': 3.5, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9943791234254895.


Best trial: 1. Best value: 0.994379:  25%|██▌       | 5/20 [06:17<19:19, 77.30s/it]

[I 2025-11-28 21:02:40,954] Trial 4 finished with value: 0.9863474366898627 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.006161049539380964, 'weight_decay': 0.0001569639638866114, 'batch_size': 32, 'pos_weight': 1.0, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9943791234254895.


Best trial: 1. Best value: 0.994379:  30%|███       | 6/20 [06:41<13:48, 59.16s/it]

[I 2025-11-28 21:03:04,902] Trial 5 finished with value: 0.9932399931142013 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.2, 'lr': 0.001217284708112243, 'weight_decay': 1.913588048769229e-05, 'batch_size': 128, 'pos_weight': 8.0, 'threshold': 0.25}. Best is trial 1 with value: 0.9943791234254895.


Best trial: 1. Best value: 0.994379:  35%|███▌      | 7/20 [08:40<17:02, 78.67s/it]

[I 2025-11-28 21:05:03,737] Trial 6 finished with value: 0.9937314775771386 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.4, 'lr': 0.00014063366777718192, 'weight_decay': 5.211124595788268e-05, 'batch_size': 64, 'pos_weight': 4.0, 'threshold': 0.2}. Best is trial 1 with value: 0.9943791234254895.


Best trial: 7. Best value: 0.994647:  40%|████      | 8/20 [09:36<14:17, 71.43s/it]

[I 2025-11-28 21:05:59,669] Trial 7 finished with value: 0.994646598828913 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.0008798929749689024, 'weight_decay': 1.7345566642360933e-05, 'batch_size': 64, 'pos_weight': 8.0, 'threshold': 0.35000000000000003}. Best is trial 7 with value: 0.994646598828913.


Best trial: 7. Best value: 0.994647:  45%|████▌     | 9/20 [10:13<11:08, 60.77s/it]

[I 2025-11-28 21:06:36,990] Trial 8 finished with value: 0.9925745936736488 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.1, 'lr': 0.0018742210985555703, 'weight_decay': 4.253162363790868e-05, 'batch_size': 64, 'pos_weight': 4.5, 'threshold': 0.45}. Best is trial 7 with value: 0.994646598828913.


Best trial: 7. Best value: 0.994647:  50%|█████     | 10/20 [11:19<10:21, 62.20s/it]

[I 2025-11-28 21:07:42,386] Trial 9 finished with value: 0.9945450759803101 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.004132765459466363, 'weight_decay': 0.00018484491720988634, 'batch_size': 32, 'pos_weight': 9.0, 'threshold': 0.35000000000000003}. Best is trial 7 with value: 0.994646598828913.


Best trial: 7. Best value: 0.994647:  55%|█████▌    | 11/20 [13:11<11:36, 77.38s/it]

[I 2025-11-28 21:09:34,194] Trial 10 finished with value: 0.993400152027452 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.00041246625441929943, 'weight_decay': 0.0007321173243252594, 'batch_size': 64, 'pos_weight': 7.0, 'threshold': 0.4}. Best is trial 7 with value: 0.994646598828913.


Best trial: 7. Best value: 0.994647:  60%|██████    | 12/20 [14:30<10:24, 78.04s/it]

[I 2025-11-28 21:10:53,734] Trial 11 finished with value: 0.9873639541746644 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.009782049115231381, 'weight_decay': 0.0004245308545737427, 'batch_size': 32, 'pos_weight': 10.0, 'threshold': 0.35000000000000003}. Best is trial 7 with value: 0.994646598828913.


Best trial: 7. Best value: 0.994647:  65%|██████▌   | 13/20 [16:22<10:18, 88.34s/it]

[I 2025-11-28 21:12:45,773] Trial 12 finished with value: 0.9945401477627263 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.003415948817661854, 'weight_decay': 0.0002612846728330453, 'batch_size': 64, 'pos_weight': 8.0, 'threshold': 0.4}. Best is trial 7 with value: 0.994646598828913.


Best trial: 7. Best value: 0.994647:  70%|███████   | 14/20 [18:22<09:46, 97.69s/it]

[I 2025-11-28 21:14:45,082] Trial 13 finished with value: 0.9932002788988694 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0005090873160060656, 'weight_decay': 2.7156012496237452e-05, 'batch_size': 32, 'pos_weight': 6.5, 'threshold': 0.30000000000000004}. Best is trial 7 with value: 0.994646598828913.


Best trial: 14. Best value: 0.995172:  75%|███████▌  | 15/20 [19:16<07:03, 84.61s/it]

[I 2025-11-28 21:15:39,377] Trial 14 finished with value: 0.9951721174280361 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.0028961603572682073, 'weight_decay': 0.00011146976827416427, 'batch_size': 128, 'pos_weight': 8.5, 'threshold': 0.4}. Best is trial 14 with value: 0.9951721174280361.


Best trial: 14. Best value: 0.995172:  80%|████████  | 16/20 [19:52<04:40, 70.12s/it]

[I 2025-11-28 21:16:15,836] Trial 15 finished with value: 0.9945495231892381 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.4, 'lr': 0.0020429355772952976, 'weight_decay': 8.719857188624702e-05, 'batch_size': 128, 'pos_weight': 6.0, 'threshold': 0.4}. Best is trial 14 with value: 0.9951721174280361.


Best trial: 14. Best value: 0.995172:  85%|████████▌ | 17/20 [20:31<03:02, 60.82s/it]

[I 2025-11-28 21:16:55,031] Trial 16 finished with value: 0.9920724480487157 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.30000000000000004, 'lr': 0.00043822823657896544, 'weight_decay': 3.4178773742237304e-05, 'batch_size': 128, 'pos_weight': 8.0, 'threshold': 0.5}. Best is trial 14 with value: 0.9951721174280361.


Best trial: 14. Best value: 0.995172:  90%|█████████ | 18/20 [21:11<01:49, 54.55s/it]

[I 2025-11-28 21:17:34,983] Trial 17 finished with value: 0.9939079659836945 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.0007001957123845261, 'weight_decay': 9.163270007596327e-05, 'batch_size': 128, 'pos_weight': 9.0, 'threshold': 0.45}. Best is trial 14 with value: 0.9951721174280361.


Best trial: 14. Best value: 0.995172:  95%|█████████▌| 19/20 [22:16<00:57, 57.46s/it]

[I 2025-11-28 21:18:39,236] Trial 18 finished with value: 0.9944286242416249 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.4, 'lr': 0.002360305876179493, 'weight_decay': 2.159691137460912e-05, 'batch_size': 64, 'pos_weight': 7.0, 'threshold': 0.25}. Best is trial 14 with value: 0.9951721174280361.


Best trial: 14. Best value: 0.995172: 100%|██████████| 20/20 [22:46<00:00, 68.31s/it]

[I 2025-11-28 21:19:09,214] Trial 19 finished with value: 0.9931517903546814 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.30000000000000004, 'lr': 0.00362148327250209, 'weight_decay': 0.00032946664182343565, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.4}. Best is trial 14 with value: 0.9951721174280361.

Best score: 0.9952
Recall: 0.9929
ROC-AUC: 0.9974
Precision: 0.9054
F1: 0.9471

Best params:
  feature_hidden_dim: 64
  dropout: 0.5
  lr: 0.0028961603572682073
  weight_decay: 0.00011146976827416427
  batch_size: 128
  pos_weight: 8.5
  threshold: 0.4





In [14]:
fig = optuna.visualization.plot_optimization_history(mlp_study)
fig.show()

fig = optuna.visualization.plot_param_importances(mlp_study)
fig.show()

## Save Results

In [15]:
results = {
    'tfidf': {
        'best_score': tfidf_study.best_trial.value,
        'recall': tfidf_study.best_trial.user_attrs['recall'],
        'roc_auc': tfidf_study.best_trial.user_attrs['roc_auc'],
        'precision': tfidf_study.best_trial.user_attrs['precision'],
        'f1_score': tfidf_study.best_trial.user_attrs['f1_score'],
        'best_params': tfidf_study.best_trial.params
    },
    'xgboost': {
        'best_score': xgb_study.best_trial.value,
        'recall': xgb_study.best_trial.user_attrs['recall'],
        'roc_auc': xgb_study.best_trial.user_attrs['roc_auc'],
        'precision': xgb_study.best_trial.user_attrs['precision'],
        'f1_score': xgb_study.best_trial.user_attrs['f1_score'],
        'best_params': xgb_study.best_trial.params
    },
    'mlp': {
        'best_score': mlp_study.best_trial.value,
        'recall': mlp_study.best_trial.user_attrs['recall'],
        'roc_auc': mlp_study.best_trial.user_attrs['roc_auc'],
        'precision': mlp_study.best_trial.user_attrs['precision'],
        'f1_score': mlp_study.best_trial.user_attrs['f1_score'],
        'best_params': mlp_study.best_trial.params
    }
}

os.makedirs("../../output/tuning_results", exist_ok=True)

with open('../../output/tuning_results/optuna_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved to output/tuning_results/optuna_results.json")
print("\nSummary:")
for model, res in results.items():
    print(f"\n{model.upper()}:")
    print(f"  Score (0.5*Recall + 0.5*ROC-AUC): {res['best_score']:.4f}")
    print(f"  Recall: {res['recall']:.4f}")
    print(f"  ROC-AUC: {res['roc_auc']:.4f}")
    print(f"  Precision: {res['precision']:.4f}")
    print(f"  F1 Score: {res['f1_score']:.4f}")

Results saved to output/tuning_results/optuna_results.json

Summary:

TFIDF:
  Score (0.5*Recall + 0.5*ROC-AUC): 0.9980
  Recall: 0.9984
  ROC-AUC: 0.9975
  Precision: 0.8430
  F1 Score: 0.9141

XGBOOST:
  Score (0.5*Recall + 0.5*ROC-AUC): 0.9964
  Recall: 0.9965
  ROC-AUC: 0.9962
  Precision: 0.8070
  F1 Score: 0.8918

MLP:
  Score (0.5*Recall + 0.5*ROC-AUC): 0.9952
  Recall: 0.9929
  ROC-AUC: 0.9974
  Precision: 0.9054
  F1 Score: 0.9471
