# Hyperparameter Tuning for Phishing Detection Models

Optuna-based hyperparameter optimization for:
- TF-IDF + Logistic Regression
- XGBoost Hybrid
- MLP Hybrid

**Objective:** Maximize Recall + ROC-AUC (catch phishing + good ranking)

## Setup and Imports

In [2]:
import os
import sys
import json
import pickle
from datetime import datetime
from pathlib import Path
import numpy as np
import polars as pl
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
     precision_score, recall_score, f1_score, roc_auc_score,
)
from sentence_transformers import SentenceTransformer
import xgboost as xgb
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback

sys.path.append(str(Path.cwd().parent.parent))

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
print(f"Optuna version: {optuna.__version__}")

PyTorch version: 2.5.1+cu121
CUDA available: True
Device: cuda
Optuna version: 4.6.0


## Configuration

In [17]:
config = {
    "random_seed": 42,
    "data_source": "../../emails_v6.parquet",
    "numeric_features": [
        "sender_domain_entropy", "has_attachment",
        "spf_flag_missing", "dkim_flag_missing",
        "num_links", "subject_length", "body_length", "keyword_count", 
        "num_received_headers", "num_exclamation_marks", 
        "num_malicious_links"
    ],
    "embedding_model": "all-MiniLM-L6-v2",
    "embedding_dim": 384,
    "n_trials_tfidf": 30,
    "n_trials_xgb": 30,
    "n_trials_mlp": 30
}

In [4]:
# Load dataset
print("Loading dataset...")
df = pl.read_parquet("../../emails_v6.parquet")
pl.Config.set_tbl_rows(-1)
print(df.select("source").unique())
print(f"Total records: {len(df):,}")
print(f"Columns: {df.columns}")

# Check class distribution
phishing_count = df.filter(pl.col("phishing") == 1).shape[0]
legit_count = df.filter(pl.col("phishing") == 0).shape[0]

print(f"\nClass distribution:")
print(f"  Phishing: {phishing_count:,} ({phishing_count/len(df)*100:.2f}%)")
print(f"  Legitimate: {legit_count:,} ({legit_count/len(df)*100:.2f}%)")


from app.data_processing.preprocessing_pipeline import PreprocessingPipeline

df = PreprocessingPipeline().process_pipeline(df)
external_df = df.filter(
    (pl.col("source") == "data/csv_misc/TREC-07.csv") | (pl.col("source") == "phishing-2020")
)

train_pool_df = df.filter(
    (pl.col("source") != "data/csv_misc/TREC-07.csv") & (pl.col("source") != "phishing-2020")
)

print(f"Training pool: {len(train_pool_df):,}")
print(f"External validation: {len(external_df):,}")

train_pool_pd = train_pool_df.to_pandas()

train_df_pd, test_df_pd = train_test_split(
    train_pool_pd, 
    test_size=0.25, 
    random_state=config["random_seed"], 
    stratify=train_pool_pd['phishing']
)

train_df = pl.from_pandas(train_df_pd)
test_df = pl.from_pandas(test_df_pd)

print(f"\nFinal splits:")
print(f"  Train: {len(train_df):,} ({len(train_df)/len(df)*100:.1f}%)")
print(f"  Test: {len(test_df):,} ({len(test_df)/len(df)*100:.1f}%)")
print(f"  External: {len(external_df):,} ({len(external_df)/len(df)*100:.1f}%)")
numeric_features = config["numeric_features"]


features_train = train_df.select(numeric_features).fill_null(0).to_numpy()
features_test = test_df.select(numeric_features).fill_null(0).to_numpy()
y_train = train_df['phishing'].to_numpy()
y_test = test_df['phishing'].to_numpy()


Loading dataset...
shape: (22, 1)
┌────────────────────────────┐
│ source                     │
│ ---                        │
│ str                        │
╞════════════════════════════╡
│ phishing-2013              │
│ phishing-2024              │
│ mbox_andrunik.mbox         │
│ data/csv_misc/TREC-06.csv  │
│ phishing-2015              │
│ phishing-nigerian          │
│ phishing-2018              │
│ phishing-2014              │
│ mbox_gradzki.mbox          │
│ phishing-2016              │
│ phishing-2020              │
│ phishing-2021              │
│ data/csv_misc/Enron.csv    │
│ data/csv_misc/Assassin.csv │
│ data/csv_misc/TREC-07.csv  │
│ phishing-2023              │
│ phishing-2019              │
│ phishing-2017              │
│ data/csv_misc/CEAS-08.csv  │
│ data/csv_misc/Ling.csv     │
│ data/csv_misc/TREC-05.csv  │
│ phishing-2022              │
└────────────────────────────┘
Total records: 212,113
Columns: ['source', 'phishing', 'spf_flag', 'dkim_flag', 'd_flag', 'num_rec

## Generate Embeddings

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer(config["embedding_model"], device=device)

print("Encoding train set...")
X_emb_train = embedding_model.encode(
    train_df['body_subject'].to_list(), 
    show_progress_bar=True, 
    convert_to_numpy=True,
    batch_size=32
)

print("Encoding test set...")
X_emb_test = embedding_model.encode(
    test_df['body_subject'].to_list(), 
    show_progress_bar=True, 
    convert_to_numpy=True,
    batch_size=32
)

print(f"Embeddings shape: {X_emb_train.shape}")

Encoding train set...


Batches: 100%|██████████| 3723/3723 [02:36<00:00, 23.77it/s] 


Encoding test set...


Batches: 100%|██████████| 1241/1241 [00:56<00:00, 22.15it/s] 


Embeddings shape: (119121, 384)


## TF-IDF Tuning

In [6]:
from scipy.sparse import hstack
def tfidf_objective(trial):
    params = {
        'max_features': trial.suggest_int('max_features', 3000, 10000, step=1000),
        'min_df': trial.suggest_int('min_df', 2, 10),
        'max_df': trial.suggest_float('max_df', 0.5, 0.95, step=0.05),
        'ngram_max': trial.suggest_int('ngram_max', 1, 3),
        'C': trial.suggest_float('C', 0.1, 10.0, log=True),
        'class_weight_ratio': trial.suggest_float('class_weight_ratio', 1.0, 10.0, step=0.5),
        'threshold': trial.suggest_float('threshold', 0.2, 0.5, step=0.05)
    }
    
    vectorizer = TfidfVectorizer(
        max_features=params['max_features'],
        ngram_range=(1, params['ngram_max']),
        min_df=params['min_df'],
        max_df=params['max_df']
    )
    
    X_tfidf_train = vectorizer.fit_transform(train_df['body_subject'].to_list())
    X_tfidf_test = vectorizer.transform(test_df['body_subject'].to_list())
    
    X_combined_train = hstack([X_tfidf_train, features_train])
    X_combined_test = hstack([X_tfidf_test, features_test])
    
    clf = LogisticRegression(
        max_iter=1000,
        C=params['C'],
        class_weight={0: 1.0, 1: params['class_weight_ratio']},
        random_state=42,
        verbose=0
    )
    clf.fit(X_combined_train, y_train)
    
    y_proba = clf.predict_proba(X_combined_test)[:, 1]
    y_pred = (y_proba >= params['threshold']).astype(int)
    
    recall = recall_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('f1_score', f1)
    trial.set_user_attr('roc_auc', roc_auc)
    F2 = (5 * precision * recall) / (4 * precision + recall)
    return F2

In [7]:
print("Starting TF-IDF tuning...")

wandb_callback = WeightsAndBiasesCallback(
    metric_name="objective_score",
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "tfidf"]}
)

tfidf_study = optuna.create_study(
    direction="maximize",
    study_name="tfidf-tuning",
    sampler=optuna.samplers.TPESampler(seed=42)
)

tfidf_study.optimize(
    tfidf_objective,
    n_trials=config["n_trials_tfidf"],
    callbacks=[wandb_callback],
    show_progress_bar=True
)

print(f"\nBest score: {tfidf_study.best_trial.value:.4f}")
print(f"Recall: {tfidf_study.best_trial.user_attrs['recall']:.4f}")
print(f"ROC-AUC: {tfidf_study.best_trial.user_attrs['roc_auc']:.4f}")
print(f"Precision: {tfidf_study.best_trial.user_attrs['precision']:.4f}")
print(f"F1: {tfidf_study.best_trial.user_attrs['f1_score']:.4f}")
print("\nBest params:")
for k, v in tfidf_study.best_trial.params.items():
    print(f"  {k}: {v}")

  wandb_callback = WeightsAndBiasesCallback(


Starting TF-IDF tuning...


[34m[1mwandb[0m: Currently logged in as: [33mlatandu[0m ([33mlatandu-phishstop[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[I 2025-11-30 22:12:12,202] A new study created in memory with name: tfidf-tuning
Best trial: 0. Best value: 0.951436:   3%|▎         | 1/30 [00:58<28:14, 58.43s/it]

[I 2025-11-30 22:13:10,619] Trial 0 finished with value: 0.9514355174808982 and parameters: {'max_features': 5000, 'min_df': 10, 'max_df': 0.8500000000000001, 'ngram_max': 2, 'C': 0.20513382630874505, 'class_weight_ratio': 2.0, 'threshold': 0.2}. Best is trial 0 with value: 0.9514355174808982.


Best trial: 1. Best value: 0.9771:   7%|▋         | 2/30 [01:20<17:19, 37.11s/it]  

[I 2025-11-30 22:13:32,811] Trial 1 finished with value: 0.9770995995103806 and parameters: {'max_features': 9000, 'min_df': 7, 'max_df': 0.8500000000000001, 'ngram_max': 1, 'C': 8.706020878304859, 'class_weight_ratio': 8.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  10%|█         | 3/30 [03:14<32:24, 72.01s/it]

[I 2025-11-30 22:15:26,349] Trial 2 finished with value: 0.9731583514615504 and parameters: {'max_features': 4000, 'min_df': 3, 'max_df': 0.65, 'ngram_max': 2, 'C': 0.7309539835912913, 'class_weight_ratio': 3.5, 'threshold': 0.4}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  13%|█▎        | 4/30 [04:17<29:41, 68.52s/it]

[I 2025-11-30 22:16:29,525] Trial 3 finished with value: 0.9756607061856902 and parameters: {'max_features': 4000, 'min_df': 4, 'max_df': 0.65, 'ngram_max': 2, 'C': 3.7183641805732095, 'class_weight_ratio': 2.5, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  17%|█▋        | 5/30 [04:40<21:45, 52.21s/it]

[I 2025-11-30 22:16:52,814] Trial 4 finished with value: 0.9646911205966509 and parameters: {'max_features': 7000, 'min_df': 2, 'max_df': 0.8, 'ngram_max': 1, 'C': 0.1349283426801325, 'class_weight_ratio': 10.0, 'threshold': 0.5}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  20%|██        | 6/30 [07:57<40:31, 101.31s/it]

[I 2025-11-30 22:20:09,416] Trial 5 finished with value: 0.9765951352254555 and parameters: {'max_features': 9000, 'min_df': 4, 'max_df': 0.5, 'ngram_max': 3, 'C': 0.7591104805282696, 'class_weight_ratio': 2.0, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  23%|██▎       | 7/30 [10:43<46:54, 122.38s/it]

[I 2025-11-30 22:22:55,194] Trial 6 finished with value: 0.9626495716478959 and parameters: {'max_features': 3000, 'min_df': 10, 'max_df': 0.6, 'ngram_max': 2, 'C': 0.420167205437253, 'class_weight_ratio': 5.5, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  27%|██▋       | 8/30 [14:32<57:26, 156.64s/it]

[I 2025-11-30 22:26:45,194] Trial 7 finished with value: 0.9753842115311189 and parameters: {'max_features': 4000, 'min_df': 10, 'max_df': 0.8500000000000001, 'ngram_max': 3, 'C': 6.161049539380966, 'class_weight_ratio': 6.5, 'threshold': 0.5}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  30%|███       | 9/30 [14:57<40:21, 115.29s/it]

[I 2025-11-30 22:27:09,556] Trial 8 finished with value: 0.971863353297691 and parameters: {'max_features': 3000, 'min_df': 3, 'max_df': 0.5, 'ngram_max': 1, 'C': 0.59890036722543, 'class_weight_ratio': 3.5, 'threshold': 0.45}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  33%|███▎      | 10/30 [15:21<29:04, 87.21s/it]

[I 2025-11-30 22:27:33,892] Trial 9 finished with value: 0.9759819532908705 and parameters: {'max_features': 5000, 'min_df': 4, 'max_df': 0.75, 'ngram_max': 1, 'C': 4.02155452669029, 'class_weight_ratio': 1.5, 'threshold': 0.5}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  37%|███▋      | 11/30 [15:46<21:33, 68.08s/it]

[I 2025-11-30 22:27:58,589] Trial 10 finished with value: 0.9669866679185053 and parameters: {'max_features': 10000, 'min_df': 7, 'max_df': 0.95, 'ngram_max': 1, 'C': 2.012487082975094, 'class_weight_ratio': 9.0, 'threshold': 0.2}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  40%|████      | 12/30 [21:17<44:27, 148.18s/it]

[I 2025-11-30 22:33:29,989] Trial 11 finished with value: 0.9702459479834152 and parameters: {'max_features': 9000, 'min_df': 7, 'max_df': 0.5, 'ngram_max': 3, 'C': 1.6132581779288595, 'class_weight_ratio': 7.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 1. Best value: 0.9771:  43%|████▎     | 13/30 [24:22<45:09, 159.39s/it]

[I 2025-11-30 22:36:35,155] Trial 12 finished with value: 0.9734602317790643 and parameters: {'max_features': 8000, 'min_df': 6, 'max_df': 0.95, 'ngram_max': 3, 'C': 1.328280701133037, 'class_weight_ratio': 5.0, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9770995995103806.


Best trial: 13. Best value: 0.977659:  47%|████▋     | 14/30 [26:42<40:52, 153.28s/it]

[I 2025-11-30 22:38:54,312] Trial 13 finished with value: 0.9776592832402966 and parameters: {'max_features': 10000, 'min_df': 8, 'max_df': 0.7, 'ngram_max': 3, 'C': 8.538648795388465, 'class_weight_ratio': 8.0, 'threshold': 0.30000000000000004}. Best is trial 13 with value: 0.9776592832402966.


Best trial: 13. Best value: 0.977659:  50%|█████     | 15/30 [27:50<31:56, 127.75s/it]

[I 2025-11-30 22:40:02,889] Trial 14 finished with value: 0.9774011299435028 and parameters: {'max_features': 10000, 'min_df': 8, 'max_df': 0.75, 'ngram_max': 2, 'C': 7.057572469422886, 'class_weight_ratio': 8.0, 'threshold': 0.25}. Best is trial 13 with value: 0.9776592832402966.


Best trial: 15. Best value: 0.979447:  53%|█████▎    | 16/30 [29:46<28:58, 124.19s/it]

[I 2025-11-30 22:41:58,825] Trial 15 finished with value: 0.9794466591869374 and parameters: {'max_features': 10000, 'min_df': 8, 'max_df': 0.7, 'ngram_max': 2, 'C': 9.818247569037462, 'class_weight_ratio': 7.0, 'threshold': 0.30000000000000004}. Best is trial 15 with value: 0.9794466591869374.


Best trial: 15. Best value: 0.979447:  57%|█████▋    | 17/30 [35:33<41:25, 191.23s/it]

[I 2025-11-30 22:47:45,935] Trial 16 finished with value: 0.973958271625416 and parameters: {'max_features': 7000, 'min_df': 9, 'max_df': 0.65, 'ngram_max': 3, 'C': 2.963198436306426, 'class_weight_ratio': 6.5, 'threshold': 0.30000000000000004}. Best is trial 15 with value: 0.9794466591869374.


Best trial: 15. Best value: 0.979447:  60%|██████    | 18/30 [37:40<34:21, 171.80s/it]

[I 2025-11-30 22:49:52,536] Trial 17 finished with value: 0.97805889079474 and parameters: {'max_features': 8000, 'min_df': 8, 'max_df': 0.7, 'ngram_max': 2, 'C': 9.016863555711494, 'class_weight_ratio': 7.0, 'threshold': 0.30000000000000004}. Best is trial 15 with value: 0.9794466591869374.


Best trial: 18. Best value: 0.979474:  63%|██████▎   | 19/30 [38:53<26:04, 142.25s/it]

[I 2025-11-30 22:51:05,953] Trial 18 finished with value: 0.9794743817836631 and parameters: {'max_features': 8000, 'min_df': 6, 'max_df': 0.6, 'ngram_max': 2, 'C': 4.757821948952286, 'class_weight_ratio': 6.5, 'threshold': 0.4}. Best is trial 18 with value: 0.9794743817836631.


Best trial: 19. Best value: 0.979904:  67%|██████▋   | 20/30 [39:58<19:48, 118.86s/it]

[I 2025-11-30 22:52:10,302] Trial 19 finished with value: 0.9799043062200957 and parameters: {'max_features': 8000, 'min_df': 6, 'max_df': 0.6, 'ngram_max': 2, 'C': 4.842855693488871, 'class_weight_ratio': 4.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  70%|███████   | 21/30 [41:04<15:26, 102.98s/it]

[I 2025-11-30 22:53:16,252] Trial 20 finished with value: 0.9781402508143712 and parameters: {'max_features': 8000, 'min_df': 5, 'max_df': 0.55, 'ngram_max': 2, 'C': 2.4772468728975627, 'class_weight_ratio': 4.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  73%|███████▎  | 22/30 [42:18<12:35, 94.43s/it] 

[I 2025-11-30 22:54:30,731] Trial 21 finished with value: 0.9777125889886596 and parameters: {'max_features': 6000, 'min_df': 6, 'max_df': 0.6, 'ngram_max': 2, 'C': 4.716082379360104, 'class_weight_ratio': 6.0, 'threshold': 0.4}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  77%|███████▋  | 23/30 [43:23<10:00, 85.72s/it]

[I 2025-11-30 22:55:36,124] Trial 22 finished with value: 0.979526559168124 and parameters: {'max_features': 7000, 'min_df': 5, 'max_df': 0.6, 'ngram_max': 2, 'C': 5.382288479831059, 'class_weight_ratio': 4.0, 'threshold': 0.45}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  80%|████████  | 24/30 [44:29<07:58, 79.81s/it]

[I 2025-11-30 22:56:42,164] Trial 23 finished with value: 0.9797082689103831 and parameters: {'max_features': 7000, 'min_df': 5, 'max_df': 0.6, 'ngram_max': 2, 'C': 5.181361715887068, 'class_weight_ratio': 4.0, 'threshold': 0.45}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  83%|████████▎ | 25/30 [45:49<06:38, 79.70s/it]

[I 2025-11-30 22:58:01,611] Trial 24 finished with value: 0.9788109737859082 and parameters: {'max_features': 6000, 'min_df': 5, 'max_df': 0.55, 'ngram_max': 2, 'C': 2.9308958135228496, 'class_weight_ratio': 4.0, 'threshold': 0.45}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  87%|████████▋ | 26/30 [46:54<05:00, 75.24s/it]

[I 2025-11-30 22:59:06,437] Trial 25 finished with value: 0.9788397783442844 and parameters: {'max_features': 7000, 'min_df': 5, 'max_df': 0.55, 'ngram_max': 2, 'C': 1.1633870846893464, 'class_weight_ratio': 3.0, 'threshold': 0.45}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  90%|█████████ | 27/30 [48:00<03:37, 72.40s/it]

[I 2025-11-30 23:00:12,203] Trial 26 finished with value: 0.9785261394903695 and parameters: {'max_features': 6000, 'min_df': 5, 'max_df': 0.6, 'ngram_max': 2, 'C': 5.430301981461964, 'class_weight_ratio': 5.0, 'threshold': 0.45}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  93%|█████████▎| 28/30 [48:22<01:54, 57.48s/it]

[I 2025-11-30 23:00:34,864] Trial 27 finished with value: 0.97542122108823 and parameters: {'max_features': 7000, 'min_df': 3, 'max_df': 0.65, 'ngram_max': 1, 'C': 2.1450357089870633, 'class_weight_ratio': 1.0, 'threshold': 0.45}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904:  97%|█████████▋| 29/30 [49:29<01:00, 60.21s/it]

[I 2025-11-30 23:01:41,460] Trial 28 finished with value: 0.9792705232578989 and parameters: {'max_features': 8000, 'min_df': 6, 'max_df': 0.55, 'ngram_max': 2, 'C': 3.226677116219035, 'class_weight_ratio': 4.0, 'threshold': 0.4}. Best is trial 19 with value: 0.9799043062200957.


Best trial: 19. Best value: 0.979904: 100%|██████████| 30/30 [50:38<00:00, 101.29s/it]

[I 2025-11-30 23:02:50,927] Trial 29 finished with value: 0.9769762548842803 and parameters: {'max_features': 5000, 'min_df': 4, 'max_df': 0.65, 'ngram_max': 2, 'C': 5.9857926374935815, 'class_weight_ratio': 2.5, 'threshold': 0.5}. Best is trial 19 with value: 0.9799043062200957.

Best score: 0.9799
Recall: 0.9918
ROC-AUC: 0.9974
Precision: 0.9352
F1: 0.9626

Best params:
  max_features: 8000
  min_df: 6
  max_df: 0.6
  ngram_max: 2
  C: 4.842855693488871
  class_weight_ratio: 4.5
  threshold: 0.4





In [14]:
fig = optuna.visualization.plot_optimization_history(tfidf_study)
fig.update_layout(width=800, height=500) 
fig.show()

fig = optuna.visualization.plot_param_importances(tfidf_study)
fig.update_layout(width=800, height=500) 
fig.show()

## XGBoost Tuning

In [9]:
def xgboost_objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0, step=0.5),
        'threshold': trial.suggest_float('threshold', 0.2, 0.5, step=0.05)
    }
    
    X_train = np.concatenate([X_emb_train, features_train], axis=1)
    X_test = np.concatenate([X_emb_test, features_test], axis=1)
    
    clf = xgb.XGBClassifier(
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        learning_rate=params['learning_rate'],
        min_child_weight=params['min_child_weight'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        scale_pos_weight=params['scale_pos_weight'],
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10,
        verbosity=0
    )
    
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    
    y_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= params['threshold']).astype(int)
    
    recall = recall_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('f1_score', f1)
    trial.set_user_attr('roc_auc', roc_auc)
    
    F2 = (5 * precision * recall) / (4 * precision + recall)
    return F2   

In [10]:
print("Starting XGBoost tuning...")

wandb_callback_xgb = WeightsAndBiasesCallback(
    metric_name="objective_score",
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "xgboost"]}
)

xgb_study = optuna.create_study(
    direction="maximize",
    study_name="xgboost-tuning",
    sampler=optuna.samplers.TPESampler(seed=42)
)

xgb_study.optimize(
    xgboost_objective,
    n_trials=config["n_trials_xgb"],
    callbacks=[wandb_callback_xgb],
    show_progress_bar=True
)

print(f"\nBest score: {xgb_study.best_trial.value:.4f}")
print(f"Recall: {xgb_study.best_trial.user_attrs['recall']:.4f}")
print(f"ROC-AUC: {xgb_study.best_trial.user_attrs['roc_auc']:.4f}")
print(f"Precision: {xgb_study.best_trial.user_attrs['precision']:.4f}")
print(f"F1: {xgb_study.best_trial.user_attrs['f1_score']:.4f}")
print("\nBest params:")
for k, v in xgb_study.best_trial.params.items():
    print(f"  {k}: {v}")

Starting XGBoost tuning...



WeightsAndBiasesCallback is experimental (supported from v2.9.0). The interface can change in the future.



0,1
C,▁▇▁▄▁▁▁▅▁▄▂▂▂▇▆█▃▇▄▄▃▄▅▅▃▂▅▂▃▅
class_weight_ratio,▂▇▃▂█▂▅▅▃▁▇▆▄▆▆▆▅▆▅▄▄▅▃▃▃▃▄▁▃▂
max_df,▆▆▃▃▆▁▃▆▁▅█▁█▄▅▄▃▄▃▃▂▃▃▃▂▂▃▃▂▃
max_features,▃▇▂▂▅▇▁▂▁▃█▇▆███▅▆▆▆▆▄▅▅▄▅▄▅▆▃
min_df,█▅▂▃▁▃██▂▃▅▅▅▆▆▆▇▆▅▅▄▅▄▄▄▄▄▂▅▃
ngram_max,▅▁▅▅▁█▅█▁▁▁███▅▅█▅▅▅▅▅▅▅▅▅▅▁▅▅
objective_score,▁▇▆▇▄▇▄▇▆▇▅▆▆▇▇█▇████▇█████▇█▇
threshold,▁▂▆▄█▄▄█▇█▁▂▃▃▂▃▃▃▆▆▆▆▇▇▇▇▇▇▆█

0,1
C,5.98579
class_weight_ratio,2.5
max_df,0.65
max_features,5000.0
min_df,4.0
ngram_max,2.0
objective_score,0.97698
threshold,0.5


[I 2025-11-30 23:02:55,336] A new study created in memory with name: xgboost-tuning
Best trial: 0. Best value: 0.977166:   3%|▎         | 1/30 [00:44<21:44, 44.98s/it]

[I 2025-11-30 23:03:40,308] Trial 0 finished with value: 0.9771659644534842 and parameters: {'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1205712628744377, 'min_child_weight': 6, 'subsample': 0.65, 'colsample_bytree': 0.65, 'scale_pos_weight': 1.5, 'threshold': 0.5}. Best is trial 0 with value: 0.9771659644534842.


Best trial: 0. Best value: 0.977166:   7%|▋         | 2/30 [02:04<30:21, 65.07s/it]

[I 2025-11-30 23:04:59,441] Trial 1 finished with value: 0.9559029234858024 and parameters: {'max_depth': 9, 'n_estimators': 400, 'learning_rate': 0.010725209743171996, 'min_child_weight': 10, 'subsample': 0.95, 'colsample_bytree': 0.65, 'scale_pos_weight': 2.5, 'threshold': 0.25}. Best is trial 0 with value: 0.9771659644534842.


Best trial: 0. Best value: 0.977166:  10%|█         | 3/30 [02:33<21:51, 48.58s/it]

[I 2025-11-30 23:05:28,399] Trial 2 finished with value: 0.9665124840704206 and parameters: {'max_depth': 6, 'n_estimators': 300, 'learning_rate': 0.04345454109729477, 'min_child_weight': 3, 'subsample': 0.85, 'colsample_bytree': 0.65, 'scale_pos_weight': 3.5, 'threshold': 0.30000000000000004}. Best is trial 0 with value: 0.9771659644534842.


Best trial: 0. Best value: 0.977166:  13%|█▎        | 4/30 [03:28<22:09, 51.15s/it]

[I 2025-11-30 23:06:23,494] Trial 3 finished with value: 0.9532032264045986 and parameters: {'max_depth': 7, 'n_estimators': 450, 'learning_rate': 0.019721610970574007, 'min_child_weight': 6, 'subsample': 0.85, 'colsample_bytree': 0.6, 'scale_pos_weight': 6.5, 'threshold': 0.25}. Best is trial 0 with value: 0.9771659644534842.


Best trial: 0. Best value: 0.977166:  17%|█▋        | 5/30 [03:54<17:30, 42.04s/it]

[I 2025-11-30 23:06:49,374] Trial 4 finished with value: 0.9728065796667379 and parameters: {'max_depth': 3, 'n_estimators': 500, 'learning_rate': 0.26690431824362526, 'min_child_weight': 9, 'subsample': 0.7, 'colsample_bytree': 0.6, 'scale_pos_weight': 7.5, 'threshold': 0.35000000000000003}. Best is trial 0 with value: 0.9771659644534842.


Best trial: 0. Best value: 0.977166:  20%|██        | 6/30 [04:18<14:27, 36.14s/it]

[I 2025-11-30 23:07:14,071] Trial 5 finished with value: 0.8984451987298807 and parameters: {'max_depth': 4, 'n_estimators': 300, 'learning_rate': 0.011240768803005551, 'min_child_weight': 10, 'subsample': 0.7, 'colsample_bytree': 0.85, 'scale_pos_weight': 3.5, 'threshold': 0.35000000000000003}. Best is trial 0 with value: 0.9771659644534842.


Best trial: 6. Best value: 0.978346:  23%|██▎       | 7/30 [04:43<12:26, 32.46s/it]

[I 2025-11-30 23:07:38,949] Trial 6 finished with value: 0.978346147811078 and parameters: {'max_depth': 8, 'n_estimators': 150, 'learning_rate': 0.27051668818999286, 'min_child_weight': 8, 'subsample': 1.0, 'colsample_bytree': 1.0, 'scale_pos_weight': 6.5, 'threshold': 0.5}. Best is trial 6 with value: 0.978346147811078.


Best trial: 6. Best value: 0.978346:  27%|██▋       | 8/30 [04:54<09:23, 25.61s/it]

[I 2025-11-30 23:07:49,888] Trial 7 finished with value: 0.7858434021501286 and parameters: {'max_depth': 3, 'n_estimators': 150, 'learning_rate': 0.011662890273931383, 'min_child_weight': 4, 'subsample': 0.75, 'colsample_bytree': 0.7, 'scale_pos_weight': 8.5, 'threshold': 0.30000000000000004}. Best is trial 6 with value: 0.978346147811078.


Best trial: 6. Best value: 0.978346:  30%|███       | 9/30 [05:23<09:19, 26.63s/it]

[I 2025-11-30 23:08:18,777] Trial 8 finished with value: 0.8853372559967359 and parameters: {'max_depth': 5, 'n_estimators': 300, 'learning_rate': 0.016149614799999188, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 1.0, 'scale_pos_weight': 8.0, 'threshold': 0.25}. Best is trial 6 with value: 0.978346147811078.


Best trial: 6. Best value: 0.978346:  33%|███▎      | 10/30 [05:48<08:41, 26.10s/it]

[I 2025-11-30 23:08:43,667] Trial 9 finished with value: 0.9525961984237364 and parameters: {'max_depth': 3, 'n_estimators': 450, 'learning_rate': 0.11069143219393454, 'min_child_weight': 8, 'subsample': 0.9, 'colsample_bytree': 0.6, 'scale_pos_weight': 4.0, 'threshold': 0.2}. Best is trial 6 with value: 0.978346147811078.


Best trial: 6. Best value: 0.978346:  37%|███▋      | 11/30 [06:31<09:52, 31.17s/it]

[I 2025-11-30 23:09:26,332] Trial 10 finished with value: 0.9741130425328872 and parameters: {'max_depth': 12, 'n_estimators': 100, 'learning_rate': 0.2704729722717776, 'min_child_weight': 1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'scale_pos_weight': 10.0, 'threshold': 0.5}. Best is trial 6 with value: 0.978346147811078.


Best trial: 6. Best value: 0.978346:  40%|████      | 12/30 [07:05<09:37, 32.06s/it]

[I 2025-11-30 23:10:00,430] Trial 11 finished with value: 0.9747764774296721 and parameters: {'max_depth': 9, 'n_estimators': 200, 'learning_rate': 0.12773505139998823, 'min_child_weight': 6, 'subsample': 0.6, 'colsample_bytree': 0.85, 'scale_pos_weight': 1.5, 'threshold': 0.5}. Best is trial 6 with value: 0.978346147811078.


Best trial: 6. Best value: 0.978346:  43%|████▎     | 13/30 [07:36<09:03, 31.94s/it]

[I 2025-11-30 23:10:32,112] Trial 12 finished with value: 0.9779685747705815 and parameters: {'max_depth': 9, 'n_estimators': 200, 'learning_rate': 0.1208402987479609, 'min_child_weight': 7, 'subsample': 1.0, 'colsample_bytree': 0.75, 'scale_pos_weight': 5.5, 'threshold': 0.45}. Best is trial 6 with value: 0.978346147811078.


Best trial: 6. Best value: 0.978346:  47%|████▋     | 14/30 [08:20<09:29, 35.58s/it]

[I 2025-11-30 23:11:16,086] Trial 13 finished with value: 0.9768598824323022 and parameters: {'max_depth': 10, 'n_estimators': 200, 'learning_rate': 0.06582226994923604, 'min_child_weight': 7, 'subsample': 1.0, 'colsample_bytree': 0.75, 'scale_pos_weight': 5.0, 'threshold': 0.45}. Best is trial 6 with value: 0.978346147811078.


Best trial: 14. Best value: 0.978678:  50%|█████     | 15/30 [09:04<09:30, 38.04s/it]

[I 2025-11-30 23:11:59,818] Trial 14 finished with value: 0.97867772980535 and parameters: {'max_depth': 11, 'n_estimators': 200, 'learning_rate': 0.18696193737064473, 'min_child_weight': 4, 'subsample': 0.95, 'colsample_bytree': 0.9, 'scale_pos_weight': 6.5, 'threshold': 0.4}. Best is trial 14 with value: 0.97867772980535.


Best trial: 14. Best value: 0.978678:  53%|█████▎    | 16/30 [09:41<08:48, 37.72s/it]

[I 2025-11-30 23:12:36,791] Trial 15 finished with value: 0.9752689337814557 and parameters: {'max_depth': 12, 'n_estimators': 100, 'learning_rate': 0.18188215906458052, 'min_child_weight': 4, 'subsample': 0.9, 'colsample_bytree': 0.95, 'scale_pos_weight': 6.5, 'threshold': 0.4}. Best is trial 14 with value: 0.97867772980535.


Best trial: 14. Best value: 0.978678:  57%|█████▋    | 17/30 [11:09<11:27, 52.85s/it]

[I 2025-11-30 23:14:04,826] Trial 16 finished with value: 0.9774449657163481 and parameters: {'max_depth': 11, 'n_estimators': 250, 'learning_rate': 0.06464647826498157, 'min_child_weight': 2, 'subsample': 0.95, 'colsample_bytree': 0.9, 'scale_pos_weight': 9.5, 'threshold': 0.4}. Best is trial 14 with value: 0.97867772980535.


Best trial: 14. Best value: 0.978678:  60%|██████    | 18/30 [11:35<08:57, 44.78s/it]

[I 2025-11-30 23:14:30,840] Trial 17 finished with value: 0.9781076424449077 and parameters: {'max_depth': 8, 'n_estimators': 150, 'learning_rate': 0.19554225148686344, 'min_child_weight': 4, 'subsample': 0.8, 'colsample_bytree': 0.95, 'scale_pos_weight': 6.5, 'threshold': 0.45}. Best is trial 14 with value: 0.97867772980535.


Best trial: 14. Best value: 0.978678:  63%|██████▎   | 19/30 [12:39<09:14, 50.41s/it]

[I 2025-11-30 23:15:34,358] Trial 18 finished with value: 0.9711286089238843 and parameters: {'max_depth': 11, 'n_estimators': 150, 'learning_rate': 0.04185197959534649, 'min_child_weight': 5, 'subsample': 0.95, 'colsample_bytree': 0.9, 'scale_pos_weight': 7.5, 'threshold': 0.4}. Best is trial 14 with value: 0.97867772980535.


Best trial: 19. Best value: 0.980139:  67%|██████▋   | 20/30 [13:13<07:34, 45.49s/it]

[I 2025-11-30 23:16:08,366] Trial 19 finished with value: 0.9801388469664957 and parameters: {'max_depth': 8, 'n_estimators': 250, 'learning_rate': 0.185571745314094, 'min_child_weight': 8, 'subsample': 0.9, 'colsample_bytree': 0.95, 'scale_pos_weight': 4.5, 'threshold': 0.45}. Best is trial 19 with value: 0.9801388469664957.


Best trial: 19. Best value: 0.980139:  70%|███████   | 21/30 [14:32<08:20, 55.63s/it]

[I 2025-11-30 23:17:27,649] Trial 20 finished with value: 0.979213442456725 and parameters: {'max_depth': 10, 'n_estimators': 350, 'learning_rate': 0.0805302591492868, 'min_child_weight': 2, 'subsample': 0.85, 'colsample_bytree': 0.85, 'scale_pos_weight': 4.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9801388469664957.


Best trial: 19. Best value: 0.980139:  73%|███████▎  | 22/30 [15:55<08:31, 63.92s/it]

[I 2025-11-30 23:18:50,887] Trial 21 finished with value: 0.9794747962571688 and parameters: {'max_depth': 10, 'n_estimators': 350, 'learning_rate': 0.08422524892255819, 'min_child_weight': 1, 'subsample': 0.85, 'colsample_bytree': 0.85, 'scale_pos_weight': 4.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9801388469664957.


Best trial: 19. Best value: 0.980139:  77%|███████▋  | 23/30 [17:22<08:16, 70.93s/it]

[I 2025-11-30 23:20:18,174] Trial 22 finished with value: 0.9791147961033624 and parameters: {'max_depth': 10, 'n_estimators': 350, 'learning_rate': 0.0804332542728112, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'scale_pos_weight': 4.5, 'threshold': 0.45}. Best is trial 19 with value: 0.9801388469664957.


Best trial: 19. Best value: 0.980139:  80%|████████  | 24/30 [19:08<08:07, 81.20s/it]

[I 2025-11-30 23:22:03,344] Trial 23 finished with value: 0.975534910587499 and parameters: {'max_depth': 10, 'n_estimators': 350, 'learning_rate': 0.030147309977275962, 'min_child_weight': 2, 'subsample': 0.85, 'colsample_bytree': 0.85, 'scale_pos_weight': 2.5, 'threshold': 0.35000000000000003}. Best is trial 19 with value: 0.9801388469664957.


Best trial: 19. Best value: 0.980139:  83%|████████▎ | 25/30 [19:52<05:50, 70.07s/it]

[I 2025-11-30 23:22:47,453] Trial 24 finished with value: 0.9798620623362893 and parameters: {'max_depth': 7, 'n_estimators': 350, 'learning_rate': 0.08128520962252302, 'min_child_weight': 2, 'subsample': 0.9, 'colsample_bytree': 0.8, 'scale_pos_weight': 5.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9801388469664957.


Best trial: 25. Best value: 0.981351:  87%|████████▋ | 26/30 [20:40<04:14, 63.58s/it]

[I 2025-11-30 23:23:35,872] Trial 25 finished with value: 0.9813506606230109 and parameters: {'max_depth': 7, 'n_estimators': 400, 'learning_rate': 0.08981654015231944, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.8, 'scale_pos_weight': 5.5, 'threshold': 0.45}. Best is trial 25 with value: 0.9813506606230109.


Best trial: 25. Best value: 0.981351:  90%|█████████ | 27/30 [21:23<02:52, 57.35s/it]

[I 2025-11-30 23:24:18,701] Trial 26 finished with value: 0.9812621488162073 and parameters: {'max_depth': 7, 'n_estimators': 400, 'learning_rate': 0.15728721822716504, 'min_child_weight': 3, 'subsample': 0.9, 'colsample_bytree': 0.8, 'scale_pos_weight': 5.5, 'threshold': 0.45}. Best is trial 25 with value: 0.9813506606230109.


Best trial: 25. Best value: 0.981351:  93%|█████████▎| 28/30 [22:01<01:43, 51.73s/it]

[I 2025-11-30 23:24:57,313] Trial 27 finished with value: 0.979115582988078 and parameters: {'max_depth': 6, 'n_estimators': 400, 'learning_rate': 0.15183422653050652, 'min_child_weight': 3, 'subsample': 0.9, 'colsample_bytree': 0.75, 'scale_pos_weight': 3.0, 'threshold': 0.45}. Best is trial 25 with value: 0.9813506606230109.


Best trial: 25. Best value: 0.981351:  97%|█████████▋| 29/30 [22:39<00:47, 47.50s/it]

[I 2025-11-30 23:25:34,948] Trial 28 finished with value: 0.979746040942449 and parameters: {'max_depth': 7, 'n_estimators': 400, 'learning_rate': 0.2102934635913294, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'scale_pos_weight': 5.5, 'threshold': 0.45}. Best is trial 25 with value: 0.9813506606230109.


Best trial: 25. Best value: 0.981351: 100%|██████████| 30/30 [23:17<00:00, 46.59s/it]

[I 2025-11-30 23:26:13,105] Trial 29 finished with value: 0.9787431017277242 and parameters: {'max_depth': 5, 'n_estimators': 500, 'learning_rate': 0.1022268832486803, 'min_child_weight': 3, 'subsample': 0.75, 'colsample_bytree': 0.7, 'scale_pos_weight': 6.0, 'threshold': 0.5}. Best is trial 25 with value: 0.9813506606230109.

Best score: 0.9814
Recall: 0.9855
ROC-AUC: 0.9979
Precision: 0.9650
F1: 0.9751

Best params:
  max_depth: 7
  n_estimators: 400
  learning_rate: 0.08981654015231944
  min_child_weight: 1
  subsample: 0.9
  colsample_bytree: 0.8
  scale_pos_weight: 5.5
  threshold: 0.45





In [11]:
fig = optuna.visualization.plot_optimization_history(xgb_study)
fig.update_layout(width=800, height=500) 
fig.show()

fig = optuna.visualization.plot_param_importances(xgb_study)
fig.update_layout(width=800, height=500) 
fig.show()

## MLP Tuning

In [18]:
from hybrid_mlp_model import HybridMLPClassifier

def mlp_objective(trial):
    params = {
        'feature_hidden_dim': trial.suggest_categorical('feature_hidden_dim', [16, 32, 64, 128]),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5, step=0.1),
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'pos_weight': trial.suggest_float('pos_weight', 1.0, 10.0, step=0.5),
        'threshold': trial.suggest_float('threshold', 0.2, 0.5, step=0.05)
    }
    
    X_emb_train_fit, X_emb_val, X_feat_train_fit, X_feat_val, y_train_fit, y_val = train_test_split(
        X_emb_train, features_train, y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train
    )
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = HybridMLPClassifier(
        embedding_dim=config["embedding_dim"],
        num_features=len(numeric_features),
        feature_hidden_dim=params['feature_hidden_dim'],
        dropout=params['dropout']
    ).to(device)
    
    train_loader = DataLoader(
        TensorDataset(
            torch.FloatTensor(X_emb_train_fit),
            torch.FloatTensor(X_feat_train_fit),
            torch.FloatTensor(y_train_fit)
        ),
        batch_size=params['batch_size'],
        shuffle=True
    )
    
    val_loader = DataLoader(
        TensorDataset(
            torch.FloatTensor(X_emb_val),
            torch.FloatTensor(X_feat_val),
            torch.FloatTensor(y_val)
        ),
        batch_size=params['batch_size'],
        shuffle=False
    )
    
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=params['lr'],
        weight_decay=params['weight_decay']
    )
    
    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([params['pos_weight']], device=device)
    )
    
    best_val_loss = float('inf')
    patience = 5
    patience_counter = 0
    
    for epoch in range(30):
        model.train()
        for embeddings, features, labels in train_loader:
            embeddings, features, labels = embeddings.to(device), features.to(device), labels.to(device)
            
            optimizer.zero_grad()
            logits = model(embeddings, features).squeeze()
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for embeddings, features, labels in val_loader:
                embeddings, features, labels = embeddings.to(device), features.to(device), labels.to(device)
                logits = model(embeddings, features).squeeze()
                val_loss += criterion(logits, labels).item()
        
        val_loss /= len(val_loader)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
    
    test_loader = DataLoader(
        TensorDataset(
            torch.FloatTensor(X_emb_test),
            torch.FloatTensor(features_test),
            torch.FloatTensor(y_test)
        ),
        batch_size=params['batch_size'],
        shuffle=False
    )
    
    model.eval()
    test_probs = []
    test_labels = []
    
    with torch.no_grad():
        for embeddings, features, labels in test_loader:
            embeddings, features = embeddings.to(device), features.to(device)
            logits = model(embeddings, features).squeeze()
            probs = torch.sigmoid(logits)
            
            if probs.dim() == 0:
                test_probs.append(probs.cpu().item())
                test_labels.append(labels.item())
            else:
                test_probs.extend(probs.cpu().numpy())
                test_labels.extend(labels.numpy())
    
    test_probs = np.array(test_probs)
    test_labels = np.array(test_labels)
    test_preds = (test_probs >= params['threshold']).astype(int)
    
    recall = recall_score(test_labels, test_preds, zero_division=0)
    precision = precision_score(test_labels, test_preds, zero_division=0)
    f1 = f1_score(test_labels, test_preds, zero_division=0)
    roc_auc = roc_auc_score(test_labels, test_probs)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('f1_score', f1)
    trial.set_user_attr('roc_auc', roc_auc)
    
    F2 = (5 * precision * recall) / (4 * precision + recall)
    return F2

In [19]:
print("Starting MLP tuning...")

wandb_callback_mlp = WeightsAndBiasesCallback(
    metric_name="objective_score",
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "mlp"]}
)

mlp_study = optuna.create_study(
    direction="maximize",
    study_name="mlp-tuning",
    sampler=optuna.samplers.TPESampler(seed=42)
)

mlp_study.optimize(
    mlp_objective,
    n_trials=config["n_trials_mlp"],
    callbacks=[wandb_callback_mlp],
    show_progress_bar=True
)

print(f"\nBest score: {mlp_study.best_trial.value:.4f}")
print(f"Recall: {mlp_study.best_trial.user_attrs['recall']:.4f}")
print(f"ROC-AUC: {mlp_study.best_trial.user_attrs['roc_auc']:.4f}")
print(f"Precision: {mlp_study.best_trial.user_attrs['precision']:.4f}")
print(f"F1: {mlp_study.best_trial.user_attrs['f1_score']:.4f}")
print("\nBest params:")
for k, v in mlp_study.best_trial.params.items():
    print(f"  {k}: {v}")

Starting MLP tuning...



WeightsAndBiasesCallback is experimental (supported from v2.9.0). The interface can change in the future.



[I 2025-11-30 23:31:23,044] A new study created in memory with name: mlp-tuning
Best trial: 0. Best value: 0.979567:   3%|▎         | 1/30 [01:33<45:02, 93.17s/it]

[I 2025-11-30 23:32:56,212] Trial 0 finished with value: 0.9795673804364932 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.1, 'lr': 0.00020511104188433984, 'weight_decay': 1.3066739238053272e-05, 'batch_size': 32, 'pos_weight': 1.0, 'threshold': 0.5}. Best is trial 0 with value: 0.9795673804364932.


Best trial: 1. Best value: 0.98136:   7%|▋         | 2/30 [02:17<30:01, 64.35s/it] 

[I 2025-11-30 23:33:40,382] Trial 1 finished with value: 0.9813601106367627 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.2, 'lr': 0.0011207606211860567, 'weight_decay': 7.309539835912905e-05, 'batch_size': 64, 'pos_weight': 3.5, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9813601106367627.


Best trial: 1. Best value: 0.98136:  10%|█         | 3/30 [03:38<32:21, 71.90s/it]

[I 2025-11-30 23:35:01,282] Trial 2 finished with value: 0.9810149382775349 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.30000000000000004, 'lr': 0.0001238513729886094, 'weight_decay': 0.000164092867306479, 'batch_size': 128, 'pos_weight': 10.0, 'threshold': 0.45}. Best is trial 1 with value: 0.9813601106367627.


Best trial: 3. Best value: 0.982216:  13%|█▎        | 4/30 [04:54<31:51, 73.51s/it]

[I 2025-11-30 23:36:17,256] Trial 3 finished with value: 0.9822160598022668 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.1, 'lr': 0.0009780337016659412, 'weight_decay': 1.1715937392307055e-05, 'batch_size': 32, 'pos_weight': 3.5, 'threshold': 0.35000000000000003}. Best is trial 3 with value: 0.9822160598022668.


Best trial: 3. Best value: 0.982216:  17%|█▋        | 5/30 [06:30<33:59, 81.57s/it]

[I 2025-11-30 23:37:53,117] Trial 4 finished with value: 0.961823953266629 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.006161049539380964, 'weight_decay': 0.0001569639638866114, 'batch_size': 32, 'pos_weight': 1.0, 'threshold': 0.30000000000000004}. Best is trial 3 with value: 0.9822160598022668.


Best trial: 3. Best value: 0.982216:  20%|██        | 6/30 [06:54<24:53, 62.23s/it]

[I 2025-11-30 23:38:17,800] Trial 5 finished with value: 0.9809971124237669 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.2, 'lr': 0.001217284708112243, 'weight_decay': 1.913588048769229e-05, 'batch_size': 128, 'pos_weight': 8.0, 'threshold': 0.25}. Best is trial 3 with value: 0.9822160598022668.


Best trial: 6. Best value: 0.9824:  23%|██▎       | 7/30 [08:58<31:35, 82.43s/it]  

[I 2025-11-30 23:40:21,833] Trial 6 finished with value: 0.9824002883229217 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.4, 'lr': 0.00014063366777718192, 'weight_decay': 5.211124595788268e-05, 'batch_size': 64, 'pos_weight': 4.0, 'threshold': 0.2}. Best is trial 6 with value: 0.9824002883229217.


Best trial: 6. Best value: 0.9824:  27%|██▋       | 8/30 [10:15<29:30, 80.47s/it]

[I 2025-11-30 23:41:38,100] Trial 7 finished with value: 0.9808228689083879 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.0008798929749689024, 'weight_decay': 1.7345566642360933e-05, 'batch_size': 64, 'pos_weight': 8.0, 'threshold': 0.35000000000000003}. Best is trial 6 with value: 0.9824002883229217.


Best trial: 6. Best value: 0.9824:  30%|███       | 9/30 [10:50<23:13, 66.37s/it]

[I 2025-11-30 23:42:13,479] Trial 8 finished with value: 0.9803567124608339 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.1, 'lr': 0.0018742210985555703, 'weight_decay': 4.253162363790868e-05, 'batch_size': 64, 'pos_weight': 4.5, 'threshold': 0.45}. Best is trial 6 with value: 0.9824002883229217.


Best trial: 6. Best value: 0.9824:  33%|███▎      | 10/30 [12:39<26:28, 79.42s/it]

[I 2025-11-30 23:44:02,113] Trial 9 finished with value: 0.9727563911744043 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.004132765459466363, 'weight_decay': 0.00018484491720988634, 'batch_size': 32, 'pos_weight': 9.0, 'threshold': 0.35000000000000003}. Best is trial 6 with value: 0.9824002883229217.


Best trial: 6. Best value: 0.9824:  37%|███▋      | 11/30 [14:32<28:25, 89.74s/it]

[I 2025-11-30 23:45:55,263] Trial 10 finished with value: 0.9780485480835656 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.000319767716166678, 'weight_decay': 0.0007321173243252594, 'batch_size': 64, 'pos_weight': 6.0, 'threshold': 0.2}. Best is trial 6 with value: 0.9824002883229217.


Best trial: 6. Best value: 0.9824:  40%|████      | 12/30 [16:06<27:22, 91.27s/it]

[I 2025-11-30 23:47:30,022] Trial 11 finished with value: 0.9818367715162086 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.30000000000000004, 'lr': 0.0004379570480766999, 'weight_decay': 3.8932422992272074e-05, 'batch_size': 32, 'pos_weight': 3.0, 'threshold': 0.2}. Best is trial 6 with value: 0.9824002883229217.


Best trial: 12. Best value: 0.982977:  43%|████▎     | 13/30 [18:13<28:51, 101.88s/it]

[I 2025-11-30 23:49:36,318] Trial 12 finished with value: 0.9829769816045458 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.00011380536180913103, 'weight_decay': 1.0767949854680992e-05, 'batch_size': 64, 'pos_weight': 6.0, 'threshold': 0.4}. Best is trial 12 with value: 0.9829769816045458.


Best trial: 12. Best value: 0.982977:  47%|████▋     | 14/30 [20:26<29:39, 111.21s/it]

[I 2025-11-30 23:51:49,096] Trial 13 finished with value: 0.9826416000961017 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.00012325004584642538, 'weight_decay': 0.00046731986792036123, 'batch_size': 64, 'pos_weight': 6.0, 'threshold': 0.4}. Best is trial 12 with value: 0.9829769816045458.


Best trial: 12. Best value: 0.982977:  50%|█████     | 15/30 [23:03<31:17, 125.16s/it]

[I 2025-11-30 23:54:26,575] Trial 14 finished with value: 0.9800435761145431 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.00010159708810654136, 'weight_decay': 0.0006985380177719455, 'batch_size': 64, 'pos_weight': 6.0, 'threshold': 0.4}. Best is trial 12 with value: 0.9829769816045458.


Best trial: 12. Best value: 0.982977:  53%|█████▎    | 16/30 [26:52<36:30, 156.48s/it]

[I 2025-11-30 23:58:15,806] Trial 15 finished with value: 0.9799448153943549 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0004044758550598607, 'weight_decay': 0.0003809644409289987, 'batch_size': 64, 'pos_weight': 6.5, 'threshold': 0.4}. Best is trial 12 with value: 0.9829769816045458.


Best trial: 16. Best value: 0.98304:  57%|█████▋    | 17/30 [28:14<29:01, 133.96s/it] 

[I 2025-11-30 23:59:37,395] Trial 16 finished with value: 0.9830404281478268 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.0002540994280961815, 'weight_decay': 0.0003358499007747041, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.45}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  60%|██████    | 18/30 [29:20<22:43, 113.64s/it]

[I 2025-12-01 00:00:43,741] Trial 17 finished with value: 0.9819868666787155 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.2, 'lr': 0.00024368525326204198, 'weight_decay': 0.0003327528914795915, 'batch_size': 128, 'pos_weight': 7.5, 'threshold': 0.5}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  63%|██████▎   | 19/30 [30:00<16:44, 91.35s/it] 

[I 2025-12-01 00:01:23,162] Trial 18 finished with value: 0.9828416616496087 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.00045286696635910966, 'weight_decay': 2.662322888315536e-05, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.45}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  67%|██████▋   | 20/30 [31:19<14:37, 87.71s/it]

[I 2025-12-01 00:02:42,396] Trial 19 finished with value: 0.9823274142662008 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.00022670758563116434, 'weight_decay': 0.00010139427152928492, 'batch_size': 128, 'pos_weight': 2.5, 'threshold': 0.45}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  70%|███████   | 21/30 [32:37<12:43, 84.79s/it]

[I 2025-12-01 00:04:00,370] Trial 20 finished with value: 0.9822437546584599 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0006574882851596078, 'weight_decay': 0.00022082265552359086, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.5}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  73%|███████▎  | 22/30 [33:29<09:59, 74.98s/it]

[I 2025-12-01 00:04:52,460] Trial 21 finished with value: 0.981990898885898 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.0005866838732336063, 'weight_decay': 2.4856896322112527e-05, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.45}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  77%|███████▋  | 23/30 [34:30<08:15, 70.76s/it]

[I 2025-12-01 00:05:53,386] Trial 22 finished with value: 0.9812697727926372 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.2, 'lr': 0.0001874145882135894, 'weight_decay': 2.5607428732849214e-05, 'batch_size': 128, 'pos_weight': 7.0, 'threshold': 0.4}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  80%|████████  | 24/30 [35:15<06:17, 63.00s/it]

[I 2025-12-01 00:06:38,282] Trial 23 finished with value: 0.9822859552931251 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.0003128849267047166, 'weight_decay': 1.0716239797269517e-05, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.45}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  83%|████████▎ | 25/30 [36:34<05:38, 67.76s/it]

[I 2025-12-01 00:07:57,163] Trial 24 finished with value: 0.9825511940244475 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.30000000000000004, 'lr': 0.00016388226304143655, 'weight_decay': 8.72084858504081e-05, 'batch_size': 128, 'pos_weight': 2.0, 'threshold': 0.4}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  87%|████████▋ | 26/30 [37:14<03:58, 59.56s/it]

[I 2025-12-01 00:08:37,595] Trial 25 finished with value: 0.9825529663564438 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.2, 'lr': 0.0005222161302790296, 'weight_decay': 2.3824794654298213e-05, 'batch_size': 128, 'pos_weight': 4.5, 'threshold': 0.5}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  90%|█████████ | 27/30 [38:34<03:17, 65.78s/it]

[I 2025-12-01 00:09:57,892] Trial 26 finished with value: 0.9804650493470362 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.00030696448445063863, 'weight_decay': 0.0009884157497089987, 'batch_size': 128, 'pos_weight': 5.5, 'threshold': 0.45}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  93%|█████████▎| 28/30 [39:08<01:52, 56.23s/it]

[I 2025-12-01 00:10:31,844] Trial 27 finished with value: 0.9823456597243085 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.001525543829793603, 'weight_decay': 1.5698388730328185e-05, 'batch_size': 128, 'pos_weight': 7.0, 'threshold': 0.4}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304:  97%|█████████▋| 29/30 [40:26<01:02, 62.65s/it]

[I 2025-12-01 00:11:49,449] Trial 28 finished with value: 0.9805543566872873 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.00010288381788614386, 'weight_decay': 3.378313289367936e-05, 'batch_size': 128, 'pos_weight': 4.0, 'threshold': 0.30000000000000004}. Best is trial 16 with value: 0.9830404281478268.


Best trial: 16. Best value: 0.98304: 100%|██████████| 30/30 [41:33<00:00, 83.12s/it]

[I 2025-12-01 00:12:56,763] Trial 29 finished with value: 0.9813573397200591 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.4, 'lr': 0.00017920349900006513, 'weight_decay': 1.0197122484200126e-05, 'batch_size': 64, 'pos_weight': 5.5, 'threshold': 0.5}. Best is trial 16 with value: 0.9830404281478268.

Best score: 0.9830
Recall: 0.9873
ROC-AUC: 0.9981
Precision: 0.9662
F1: 0.9766

Best params:
  feature_hidden_dim: 128
  dropout: 0.30000000000000004
  lr: 0.0002540994280961815
  weight_decay: 0.0003358499007747041
  batch_size: 128
  pos_weight: 5.0
  threshold: 0.45





In [20]:
fig = optuna.visualization.plot_optimization_history(mlp_study)
fig.update_layout(width=800, height=500) 
fig.show()

fig = optuna.visualization.plot_param_importances(mlp_study)
fig.update_layout(width=800, height=500) 
fig.show()

## Save Results

In [21]:
results = {
    'tfidf': {
        'best_score': tfidf_study.best_trial.value,
        'recall': tfidf_study.best_trial.user_attrs['recall'],
        'roc_auc': tfidf_study.best_trial.user_attrs['roc_auc'],
        'precision': tfidf_study.best_trial.user_attrs['precision'],
        'f1_score': tfidf_study.best_trial.user_attrs['f1_score'],
        'best_params': tfidf_study.best_trial.params
    },
    'xgboost': {
        'best_score': xgb_study.best_trial.value,
        'recall': xgb_study.best_trial.user_attrs['recall'],
        'roc_auc': xgb_study.best_trial.user_attrs['roc_auc'],
        'precision': xgb_study.best_trial.user_attrs['precision'],
        'f1_score': xgb_study.best_trial.user_attrs['f1_score'],
        'best_params': xgb_study.best_trial.params
    },
    'mlp': {
        'best_score': mlp_study.best_trial.value,
        'recall': mlp_study.best_trial.user_attrs['recall'],
        'roc_auc': mlp_study.best_trial.user_attrs['roc_auc'],
        'precision': mlp_study.best_trial.user_attrs['precision'],
        'f1_score': mlp_study.best_trial.user_attrs['f1_score'],
        'best_params': mlp_study.best_trial.params
    }
}

os.makedirs("../../output/tuning_results", exist_ok=True)

with open('../../output/tuning_results/optuna_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved to output/tuning_results/optuna_results.json")
print("\nSummary:")
for model, res in results.items():
    print(f"\n{model.upper()}:")
    print(f"  Score F2 = (5 * precision * recall) / (4 * precision + recall): {res['best_score']:.4f}")
    print(f"  Recall: {res['recall']:.4f}")
    print(f"  ROC-AUC: {res['roc_auc']:.4f}")
    print(f"  Precision: {res['precision']:.4f}")
    print(f"  F1 Score: {res['f1_score']:.4f}")

Results saved to output/tuning_results/optuna_results.json

Summary:

TFIDF:
  Score F2 = (5 * precision * recall) / (4 * precision + recall): 0.9799
  Recall: 0.9918
  ROC-AUC: 0.9974
  Precision: 0.9352
  F1 Score: 0.9626

XGBOOST:
  Score F2 = (5 * precision * recall) / (4 * precision + recall): 0.9814
  Recall: 0.9855
  ROC-AUC: 0.9979
  Precision: 0.9650
  F1 Score: 0.9751

MLP:
  Score F2 = (5 * precision * recall) / (4 * precision + recall): 0.9830
  Recall: 0.9873
  ROC-AUC: 0.9981
  Precision: 0.9662
  F1 Score: 0.9766
