# Hyperparameter Tuning for Phishing Detection Models

Optuna-based hyperparameter optimization for:
- TF-IDF + Logistic Regression
- XGBoost Hybrid
- MLP Hybrid

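**Objective:** Catch phishing (high recall) while keeping a good ranking (ROC-AUC). The TF-IDF and XGBoost objectives below maximize the F2 score (a recall-weighted F-measure) on the validation split, and record recall, precision, F1 and ROC-AUC for every trial.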

## Setup and Imports

In [28]:
import os
import sys
import json
import pickle
from datetime import datetime
from pathlib import Path
import numpy as np
import polars as pl
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
     precision_score, recall_score, f1_score, roc_auc_score,
)
from sentence_transformers import SentenceTransformer
import xgboost as xgb
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback

# Make the directory two levels above the working directory importable; the
# project-local `app` package imported by a later cell is expected to live there.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
_project_root = str(Path.cwd().parent.parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Record the runtime environment next to the results.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
print(f"Optuna version: {optuna.__version__}")

PyTorch version: 2.5.1+cu121
CUDA available: True
Device: cuda
Optuna version: 4.6.0


## Configuration

In [29]:
# Experiment-wide settings read by the cells below.
config = {
    # Seed handed to the train/validation/test splits.
    "random_seed": 42,
    # Parquet file holding the e-mail corpus.
    "data_source": "../../emails_v6.parquet",
    # Numeric columns that are combined with the text representation.
    "numeric_features": [
        "sender_domain_entropy",
        "has_attachment",
        "spf_flag_missing",
        "dkim_flag_missing",
        "num_links",
        "subject_length",
        "body_length",
        "keyword_count",
        "num_received_headers",
        "num_exclamation_marks",
        "num_malicious_links",
    ],
    # Sentence-transformer checkpoint and the width of its output vectors.
    "embedding_model": "all-MiniLM-L6-v2",
    "embedding_dim": 384,
    # Number of Optuna trials per model family.
    "n_trials_tfidf": 30,
    "n_trials_xgb": 30,
    "n_trials_mlp": 30,
}

In [30]:

# Project-local import; it relies on the sys.path entry added in the setup cell.
from app.data_processing.preprocessing_pipeline import PreprocessingPipeline

# Read the corpus from the configured path (rather than repeating the literal)
# and run it through the project's preprocessing pipeline. Chaining the two
# steps keeps the cell idempotent: `df` is always the processed frame.
df = PreprocessingPipeline().process_pipeline(pl.read_parquet(config["data_source"]))

# Two sources are held out entirely as an external evaluation set ...
external_df = df.filter(
    (pl.col("source") == "data/csv_misc/TREC-07.csv") | (pl.col("source") == "phishing-2020")
)

# ... and every other source forms the pool that is split below.
train_pool_df = df.filter(
    (pl.col("source") != "data/csv_misc/TREC-07.csv") & (pl.col("source") != "phishing-2020")
)

# scikit-learn's splitter is applied to a pandas copy of the pool.
train_pool_pd = train_pool_df.to_pandas()

# First carve off 20% of the pool as the test split, stratified on the label.
train_val_df_pd, test_df_pd = train_test_split(
    train_pool_pd,
    test_size=0.2,
    random_state=config["random_seed"],
    stratify=train_pool_pd['phishing']
)
# Then take 20% of the remainder as the validation split, again stratified.
train_df_pd, val_df_pd = train_test_split(
    train_val_df_pd,
    test_size=0.2,
    random_state=config["random_seed"],
    stratify=train_val_df_pd['phishing']
)

train_df = pl.from_pandas(train_df_pd)
val_df = pl.from_pandas(val_df_pd)
test_df = pl.from_pandas(test_df_pd)

# Sanity check: the three splits must partition the training pool exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(train_pool_df), \
    "train/validation/test sizes do not add up to the training pool"

# Split sizes, each as a share of the full processed corpus.
print("\nFinal splits:")
print(f"  Train: {len(train_df):,} ({len(train_df)/len(df)*100:.1f}%)")
print(f"  Validation: {len(val_df):,} ({len(val_df)/len(df)*100:.1f}%) ")
print(f"  Test: {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")
# The closing parenthesis was missing from this line's output.
print(f"  External: {len(external_df)} ({len(external_df)/len(df)*100:.1f}%)")

# Class balance per split; an empty split reports 0.0% instead of dividing by zero.
for name, split_df in [("Train", train_df), ("Validation", val_df), ("Test", test_df), ("External", external_df)]:
    phish = split_df.filter(pl.col("phishing") == 1).shape[0]
    total = len(split_df)
    phish_pct = phish / total * 100 if total else 0.0
    print(f"  {name}: {phish}/{total} phishing ({phish_pct:.1f}%)")

numeric_features = config["numeric_features"]

# Numeric feature matrices with nulls replaced by 0.
# NOTE(review): fill_null only replaces nulls; confirm the pipeline leaves no
# NaN values in these columns.
features_train = train_df.select(numeric_features).fill_null(0).to_numpy()
features_val = val_df.select(numeric_features).fill_null(0).to_numpy()
features_test = test_df.select(numeric_features).fill_null(0).to_numpy()

# Binary targets taken from the `phishing` column.
y_train = train_df['phishing'].to_numpy()
y_val = val_df['phishing'].to_numpy()
y_test = test_df['phishing'].to_numpy()


Final splits:
  Train: 101,649 (47.9%)
  Validation: 25,413 (12.0%) 
  Test: 31766 (15.0%)
  External: 53285 (25.1%
  Train: 42291/101649 phishing (41.6%)
  Validation: 10573/25413 phishing (41.6%)
  Test: 13216/31766 phishing (41.6%)
  External: 28983/53285 phishing (54.4%)


## Generate Embeddings

In [31]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
embedding_model = SentenceTransformer(config["embedding_model"], device=device)


def _encode_split(split_df):
    """Embed the `body_subject` column of one split with `embedding_model`."""
    texts = split_df['body_subject'].to_list()
    return embedding_model.encode(
        texts,
        show_progress_bar=True,
        convert_to_numpy=True,
        batch_size=32,
    )


# One embedding matrix per split, encoded in train -> validation -> test order.
X_emb_train = _encode_split(train_df)
X_emb_val = _encode_split(val_df)
X_emb_test = _encode_split(test_df)

# Report the resulting array shapes.
for split_label, split_emb in (
    ("Train", X_emb_train),
    ("Validation", X_emb_val),
    ("Test", X_emb_test),
):
    print(f"  {split_label}: {split_emb.shape}")

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: ae26560a-196c-4d96-8562-28ff56dd866c)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
Retrying in 1s [Retry 1/5].


Batches: 100%|██████████| 3177/3177 [02:52<00:00, 18.41it/s] 

Batches: 100%|██████████| 795/795 [00:47<00:00, 16.64it/s] 

Batches: 100%|██████████| 993/993 [00:57<00:00, 17.21it/s] 



  Train: (101649, 384)
  Validation: (25413, 384)
  Test: (31766, 384)


## TF-IDF Tuning

In [32]:
from scipy.sparse import hstack

def tfidf_objective(trial):
    """Optuna objective for the TF-IDF + logistic-regression model.

    Draws vectorizer, classifier and decision-threshold hyperparameters from
    ``trial``, fits on the training split and scores on the validation split.
    Recall, precision, F1 and ROC-AUC are stored on the trial as user attributes.

    Args:
        trial: Optuna trial used to suggest hyperparameters and to store metrics.

    Returns:
        The validation F2 score, or 0 when precision and recall are both zero.
    """
    # Hyperparameters are drawn in a fixed order.
    max_features = trial.suggest_int('max_features', 3000, 10000, step=1000)
    min_df = trial.suggest_int('min_df', 2, 10)
    max_df = trial.suggest_float('max_df', 0.5, 0.95, step=0.05)
    ngram_max = trial.suggest_int('ngram_max', 1, 3)
    inverse_reg = trial.suggest_float('C', 0.1, 10.0, log=True)
    positive_weight = trial.suggest_float('class_weight_ratio', 1.0, 10.0, step=0.5)
    cutoff = trial.suggest_float('threshold', 0.2, 0.5, step=0.05)

    train_texts = train_df['body_subject'].to_list()
    val_texts = val_df['body_subject'].to_list()

    # The vocabulary is learned from the training texts only.
    tfidf = TfidfVectorizer(
        max_features=max_features,
        ngram_range=(1, ngram_max),
        min_df=min_df,
        max_df=max_df,
    )
    # Sparse text features are stacked side by side with the numeric features.
    train_matrix = hstack([tfidf.fit_transform(train_texts), features_train])
    val_matrix = hstack([tfidf.transform(val_texts), features_val])

    model = LogisticRegression(
        max_iter=1000,
        C=inverse_reg,
        class_weight={0: 1.0, 1: positive_weight},
        random_state=42,
        verbose=0,
    )
    model.fit(train_matrix, y_train)

    # Probability of the positive class, then a hard label at the sampled cutoff.
    phishing_proba = model.predict_proba(val_matrix)[:, 1]
    predicted = (phishing_proba >= cutoff).astype(int)

    metrics = {
        'recall': recall_score(y_val, predicted, zero_division=0),
        'precision': precision_score(y_val, predicted, zero_division=0),
        'f1_score': f1_score(y_val, predicted, zero_division=0),
        'roc_auc': roc_auc_score(y_val, phishing_proba),
    }
    for metric_key, metric_value in metrics.items():
        trial.set_user_attr(metric_key, metric_value)

    precision = metrics['precision']
    recall = metrics['recall']
    # F-beta with beta = 2; guard against a zero denominator.
    if not (precision + recall > 0):
        return 0
    return (5 * precision * recall) / (4 * precision + recall)

In [33]:
# W&B callback; the objective value is logged under the name "objective_score".
wandb_callback = WeightsAndBiasesCallback(
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "tfidf"]},
    metric_name="objective_score",
)

# In-memory study with a seeded TPE sampler; higher objective values are better.
tfidf_study = optuna.create_study(
    study_name="tfidf-tuning",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
tfidf_study.optimize(
    tfidf_objective,
    n_trials=config["n_trials_tfidf"],
    callbacks=[wandb_callback],
    show_progress_bar=True,
)

# Summarize the best trial: objective value, stored metrics, then parameters.
tfidf_best = tfidf_study.best_trial
print(f"Best score: {tfidf_best.value:.4f}")
for metric_label, attr_key in (
    ("Recall", "recall"),
    ("ROC-AUC", "roc_auc"),
    ("Precision", "precision"),
    ("F1", "f1_score"),
):
    print(f"{metric_label}: {tfidf_best.user_attrs[attr_key]:.4f}")
print("Best params:")
for param_name, param_value in tfidf_best.params.items():
    print(f"  {param_name}: {param_value}")


WeightsAndBiasesCallback is experimental (supported from v2.9.0). The interface can change in the future.



[I 2025-12-01 00:52:52,190] A new study created in memory with name: tfidf-tuning
Best trial: 0. Best value: 0.948664:   3%|▎         | 1/30 [01:00<29:02, 60.07s/it]

[I 2025-12-01 00:53:52,258] Trial 0 finished with value: 0.9486643354122354 and parameters: {'max_features': 5000, 'min_df': 10, 'max_df': 0.8500000000000001, 'ngram_max': 2, 'C': 0.20513382630874505, 'class_weight_ratio': 2.0, 'threshold': 0.2}. Best is trial 0 with value: 0.9486643354122354.


Best trial: 1. Best value: 0.97625:   7%|▋         | 2/30 [01:20<17:13, 36.92s/it] 

[I 2025-12-01 00:54:12,979] Trial 1 finished with value: 0.9762501159662305 and parameters: {'max_features': 9000, 'min_df': 7, 'max_df': 0.8500000000000001, 'ngram_max': 1, 'C': 8.706020878304859, 'class_weight_ratio': 8.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  10%|█         | 3/30 [02:14<20:05, 44.64s/it]

[I 2025-12-01 00:55:06,794] Trial 2 finished with value: 0.9708341831531716 and parameters: {'max_features': 4000, 'min_df': 3, 'max_df': 0.65, 'ngram_max': 2, 'C': 0.7309539835912913, 'class_weight_ratio': 3.5, 'threshold': 0.4}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  13%|█▎        | 4/30 [03:07<20:49, 48.06s/it]

[I 2025-12-01 00:56:00,107] Trial 3 finished with value: 0.9743733109682229 and parameters: {'max_features': 4000, 'min_df': 4, 'max_df': 0.65, 'ngram_max': 2, 'C': 3.7183641805732095, 'class_weight_ratio': 2.5, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  17%|█▋        | 5/30 [03:25<15:28, 37.13s/it]

[I 2025-12-01 00:56:17,859] Trial 4 finished with value: 0.9604854900529293 and parameters: {'max_features': 7000, 'min_df': 2, 'max_df': 0.8, 'ngram_max': 1, 'C': 0.1349283426801325, 'class_weight_ratio': 10.0, 'threshold': 0.5}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  20%|██        | 6/30 [05:18<25:07, 62.83s/it]

[I 2025-12-01 00:58:10,565] Trial 5 finished with value: 0.9737307871448533 and parameters: {'max_features': 9000, 'min_df': 4, 'max_df': 0.5, 'ngram_max': 3, 'C': 0.7591104805282696, 'class_weight_ratio': 2.0, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  23%|██▎       | 7/30 [06:10<22:46, 59.41s/it]

[I 2025-12-01 00:59:02,927] Trial 6 finished with value: 0.9597692757009346 and parameters: {'max_features': 3000, 'min_df': 10, 'max_df': 0.6, 'ngram_max': 2, 'C': 0.420167205437253, 'class_weight_ratio': 5.5, 'threshold': 0.35000000000000003}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  27%|██▋       | 8/30 [07:56<27:08, 74.02s/it]

[I 2025-12-01 01:00:48,238] Trial 7 finished with value: 0.9736062733995196 and parameters: {'max_features': 4000, 'min_df': 10, 'max_df': 0.8500000000000001, 'ngram_max': 3, 'C': 6.161049539380966, 'class_weight_ratio': 6.5, 'threshold': 0.5}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  30%|███       | 9/30 [08:15<19:59, 57.10s/it]

[I 2025-12-01 01:01:08,127] Trial 8 finished with value: 0.9699732600460521 and parameters: {'max_features': 3000, 'min_df': 3, 'max_df': 0.5, 'ngram_max': 1, 'C': 0.59890036722543, 'class_weight_ratio': 3.5, 'threshold': 0.45}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  33%|███▎      | 10/30 [08:36<15:14, 45.71s/it]

[I 2025-12-01 01:01:28,327] Trial 9 finished with value: 0.9735339026183341 and parameters: {'max_features': 5000, 'min_df': 4, 'max_df': 0.75, 'ngram_max': 1, 'C': 4.02155452669029, 'class_weight_ratio': 1.5, 'threshold': 0.5}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  37%|███▋      | 11/30 [08:58<12:15, 38.70s/it]

[I 2025-12-01 01:01:51,127] Trial 10 finished with value: 0.9646813644839314 and parameters: {'max_features': 10000, 'min_df': 7, 'max_df': 0.95, 'ngram_max': 1, 'C': 2.012487082975094, 'class_weight_ratio': 9.0, 'threshold': 0.2}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  40%|████      | 12/30 [09:56<13:17, 44.32s/it]

[I 2025-12-01 01:02:48,306] Trial 11 finished with value: 0.975028734566757 and parameters: {'max_features': 8000, 'min_df': 7, 'max_df': 0.65, 'ngram_max': 2, 'C': 7.791470666172767, 'class_weight_ratio': 7.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  43%|████▎     | 13/30 [11:40<17:43, 62.57s/it]

[I 2025-12-01 01:04:32,887] Trial 12 finished with value: 0.9750129754578483 and parameters: {'max_features': 8000, 'min_df': 7, 'max_df': 0.95, 'ngram_max': 3, 'C': 7.956692282233659, 'class_weight_ratio': 7.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  47%|████▋     | 14/30 [11:58<13:06, 49.15s/it]

[I 2025-12-01 01:04:51,027] Trial 13 finished with value: 0.9689245810055865 and parameters: {'max_features': 8000, 'min_df': 8, 'max_df': 0.7, 'ngram_max': 1, 'C': 2.012962831034315, 'class_weight_ratio': 8.0, 'threshold': 0.25}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  50%|█████     | 15/30 [12:55<12:51, 51.41s/it]

[I 2025-12-01 01:05:47,680] Trial 14 finished with value: 0.9674960976953447 and parameters: {'max_features': 10000, 'min_df': 6, 'max_df': 0.8500000000000001, 'ngram_max': 2, 'C': 1.8766870441105825, 'class_weight_ratio': 7.5, 'threshold': 0.25}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  53%|█████▎    | 16/30 [13:50<12:13, 52.39s/it]

[I 2025-12-01 01:06:42,322] Trial 15 finished with value: 0.9726662847215803 and parameters: {'max_features': 7000, 'min_df': 8, 'max_df': 0.6, 'ngram_max': 2, 'C': 9.818247569037462, 'class_weight_ratio': 10.0, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  57%|█████▋    | 17/30 [14:09<09:11, 42.42s/it]

[I 2025-12-01 01:07:01,574] Trial 16 finished with value: 0.9754853783818866 and parameters: {'max_features': 9000, 'min_df': 6, 'max_df': 0.75, 'ngram_max': 1, 'C': 3.743309561243854, 'class_weight_ratio': 6.0, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  60%|██████    | 18/30 [14:27<07:02, 35.22s/it]

[I 2025-12-01 01:07:20,021] Trial 17 finished with value: 0.9753223861211615 and parameters: {'max_features': 9000, 'min_df': 6, 'max_df': 0.75, 'ngram_max': 1, 'C': 3.6063715450954663, 'class_weight_ratio': 5.0, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 1. Best value: 0.97625:  63%|██████▎   | 19/30 [14:46<05:31, 30.16s/it]

[I 2025-12-01 01:07:38,385] Trial 18 finished with value: 0.9733269209426216 and parameters: {'max_features': 9000, 'min_df': 5, 'max_df': 0.9, 'ngram_max': 1, 'C': 1.5564101474248144, 'class_weight_ratio': 4.5, 'threshold': 0.30000000000000004}. Best is trial 1 with value: 0.9762501159662305.


Best trial: 19. Best value: 0.977118:  67%|██████▋   | 20/30 [15:03<04:24, 26.44s/it]

[I 2025-12-01 01:07:56,168] Trial 19 finished with value: 0.9771176009846704 and parameters: {'max_features': 10000, 'min_df': 8, 'max_df': 0.8, 'ngram_max': 1, 'C': 5.277338988315251, 'class_weight_ratio': 6.0, 'threshold': 0.4}. Best is trial 19 with value: 0.9771176009846704.


Best trial: 20. Best value: 0.978354:  70%|███████   | 21/30 [15:22<03:37, 24.18s/it]

[I 2025-12-01 01:08:15,088] Trial 20 finished with value: 0.9783540106553408 and parameters: {'max_features': 10000, 'min_df': 9, 'max_df': 0.8, 'ngram_max': 1, 'C': 5.218690023258398, 'class_weight_ratio': 8.5, 'threshold': 0.4}. Best is trial 20 with value: 0.9783540106553408.


Best trial: 20. Best value: 0.978354:  73%|███████▎  | 22/30 [15:42<03:01, 22.74s/it]

[I 2025-12-01 01:08:34,473] Trial 21 finished with value: 0.9780786709379423 and parameters: {'max_features': 10000, 'min_df': 9, 'max_df': 0.8, 'ngram_max': 1, 'C': 5.515464262018527, 'class_weight_ratio': 8.5, 'threshold': 0.4}. Best is trial 20 with value: 0.9783540106553408.


Best trial: 22. Best value: 0.978366:  77%|███████▋  | 23/30 [16:02<02:33, 22.00s/it]

[I 2025-12-01 01:08:54,728] Trial 22 finished with value: 0.9783661908843588 and parameters: {'max_features': 10000, 'min_df': 9, 'max_df': 0.8, 'ngram_max': 1, 'C': 5.359456741812889, 'class_weight_ratio': 9.0, 'threshold': 0.4}. Best is trial 22 with value: 0.9783661908843588.


Best trial: 22. Best value: 0.978366:  80%|████████  | 24/30 [16:21<02:06, 21.07s/it]

[I 2025-12-01 01:09:13,636] Trial 23 finished with value: 0.9750941156834745 and parameters: {'max_features': 10000, 'min_df': 9, 'max_df': 0.8, 'ngram_max': 1, 'C': 2.778574904923615, 'class_weight_ratio': 9.0, 'threshold': 0.4}. Best is trial 22 with value: 0.9783661908843588.


Best trial: 22. Best value: 0.978366:  83%|████████▎ | 25/30 [16:42<01:44, 20.92s/it]

[I 2025-12-01 01:09:34,212] Trial 24 finished with value: 0.9779853111135967 and parameters: {'max_features': 10000, 'min_df': 9, 'max_df': 0.7, 'ngram_max': 1, 'C': 5.601486345957139, 'class_weight_ratio': 9.5, 'threshold': 0.45}. Best is trial 22 with value: 0.9783661908843588.


Best trial: 22. Best value: 0.978366:  87%|████████▋ | 26/30 [17:02<01:22, 20.71s/it]

[I 2025-12-01 01:09:54,415] Trial 25 finished with value: 0.9753787878787877 and parameters: {'max_features': 8000, 'min_df': 9, 'max_df': 0.9, 'ngram_max': 1, 'C': 1.1951628300309727, 'class_weight_ratio': 7.0, 'threshold': 0.45}. Best is trial 22 with value: 0.9783661908843588.


Best trial: 22. Best value: 0.978366:  90%|█████████ | 27/30 [17:41<01:19, 26.35s/it]

[I 2025-12-01 01:10:33,929] Trial 26 finished with value: 0.9741162084043578 and parameters: {'max_features': 6000, 'min_df': 9, 'max_df': 0.8, 'ngram_max': 1, 'C': 2.4796371897248615, 'class_weight_ratio': 8.5, 'threshold': 0.4}. Best is trial 22 with value: 0.9783661908843588.


Best trial: 27. Best value: 0.979028:  93%|█████████▎| 28/30 [18:48<01:16, 38.39s/it]

[I 2025-12-01 01:11:40,418] Trial 27 finished with value: 0.9790275030787028 and parameters: {'max_features': 10000, 'min_df': 8, 'max_df': 0.9, 'ngram_max': 2, 'C': 5.537277379395257, 'class_weight_ratio': 8.5, 'threshold': 0.45}. Best is trial 27 with value: 0.9790275030787028.


Best trial: 27. Best value: 0.979028:  97%|█████████▋| 29/30 [19:41<00:42, 42.71s/it]

[I 2025-12-01 01:12:33,191] Trial 28 finished with value: 0.9741210032973956 and parameters: {'max_features': 9000, 'min_df': 8, 'max_df': 0.9, 'ngram_max': 2, 'C': 1.201179592539979, 'class_weight_ratio': 9.0, 'threshold': 0.45}. Best is trial 27 with value: 0.9790275030787028.


Best trial: 27. Best value: 0.979028: 100%|██████████| 30/30 [23:03<00:00, 46.12s/it]

[I 2025-12-01 01:15:55,755] Trial 29 finished with value: 0.9741812318128741 and parameters: {'max_features': 6000, 'min_df': 10, 'max_df': 0.9, 'ngram_max': 3, 'C': 2.816469084419117, 'class_weight_ratio': 9.5, 'threshold': 0.45}. Best is trial 27 with value: 0.9790275030787028.
Best score: 0.9790
Recall: 0.9925
ROC-AUC: 0.9973
Precision: 0.9285
F1: 0.9595
Best params:
  max_features: 10000
  min_df: 8
  max_df: 0.9
  ngram_max: 2
  C: 5.537277379395257
  class_weight_ratio: 8.5
  threshold: 0.45





In [34]:
# Show the optimization history followed by the parameter-importance chart for
# the TF-IDF study, both at a fixed 800x500 size.
for make_plot in (
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_param_importances,
):
    fig = make_plot(tfidf_study)
    fig.update_layout(width=800, height=500)
    fig.show()

## XGBoost Tuning

In [35]:
def xgboost_objective(trial):
    """Optuna objective for the XGBoost hybrid model (embeddings + numeric features).

    Draws booster and decision-threshold hyperparameters from ``trial``, fits on
    the training split with early stopping against the validation split, and
    scores on that same validation split. Recall, precision, F1 and ROC-AUC are
    stored on the trial as user attributes.

    Args:
        trial: Optuna trial used to suggest hyperparameters and to store metrics.

    Returns:
        The validation F2 score, or 0 when precision and recall are both zero.
    """
    # Hyperparameters are drawn in a fixed order.
    max_depth = trial.suggest_int('max_depth', 3, 12)
    n_estimators = trial.suggest_int('n_estimators', 100, 500, step=50)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    subsample = trial.suggest_float('subsample', 0.6, 1.0, step=0.05)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05)
    scale_pos_weight = trial.suggest_float('scale_pos_weight', 1.0, 10.0, step=0.5)
    cutoff = trial.suggest_float('threshold', 0.2, 0.5, step=0.05)

    # Sentence embeddings and numeric features side by side.
    # NOTE(review): these matrices do not depend on the trial and are rebuilt
    # on every call.
    train_matrix = np.concatenate([X_emb_train, features_train], axis=1)
    val_matrix = np.concatenate([X_emb_val, features_val], axis=1)

    booster = xgb.XGBClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10,
        verbosity=0,
    )
    # The validation split is the early-stopping eval set and is also scored below.
    booster.fit(train_matrix, y_train, eval_set=[(val_matrix, y_val)], verbose=False)

    # Probability of the positive class, then a hard label at the sampled cutoff.
    phishing_proba = booster.predict_proba(val_matrix)[:, 1]
    predicted = (phishing_proba >= cutoff).astype(int)

    metrics = {
        'recall': recall_score(y_val, predicted, zero_division=0),
        'precision': precision_score(y_val, predicted, zero_division=0),
        'f1_score': f1_score(y_val, predicted, zero_division=0),
        'roc_auc': roc_auc_score(y_val, phishing_proba),
    }
    for metric_key, metric_value in metrics.items():
        trial.set_user_attr(metric_key, metric_value)

    precision = metrics['precision']
    recall = metrics['recall']
    # F-beta with beta = 2; guard against a zero denominator.
    if not (precision + recall > 0):
        return 0
    return (5 * precision * recall) / (4 * precision + recall)

In [36]:
# W&B callback; the objective value is logged under the name "objective_score".
wandb_callback_xgb = WeightsAndBiasesCallback(
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "xgboost"]},
    metric_name="objective_score",
)

# In-memory study with a seeded TPE sampler; higher objective values are better.
xgb_study = optuna.create_study(
    study_name="xgboost-tuning",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgb_study.optimize(
    xgboost_objective,
    n_trials=config["n_trials_xgb"],
    callbacks=[wandb_callback_xgb],
    show_progress_bar=True,
)

# Summarize the best trial: objective value, stored metrics, then parameters.
xgb_best = xgb_study.best_trial
print(f"Best score: {xgb_best.value:.4f}")
for metric_label, attr_key in (
    ("Recall", "recall"),
    ("ROC-AUC", "roc_auc"),
    ("Precision", "precision"),
    ("F1", "f1_score"),
):
    print(f"{metric_label}: {xgb_best.user_attrs[attr_key]:.4f}")
print("Best params:")
for param_name, param_value in xgb_best.params.items():
    print(f"  {param_name}: {param_value}")


WeightsAndBiasesCallback is experimental (supported from v2.9.0). The interface can change in the future.



0,1
C,▁▇▁▄▁▁▁▅▁▄▂▇▇▂▂█▄▄▂▅▅▅▅▃▅▂▃▅▂▃
class_weight_ratio,▁▇▃▂█▁▄▅▃▁▇▆▆▆▆█▅▄▃▅▇▇▇▇█▆▇▇▇█
max_df,▆▆▃▃▆▁▃▆▁▅█▃█▄▆▃▅▅▇▆▆▆▆▆▄▇▆▇▇▇
max_features,▃▇▂▂▅▇▁▂▁▃█▆▆▆█▅▇▇▇██████▆▄█▇▄
min_df,█▅▂▃▁▃██▂▃▅▅▅▆▅▆▅▅▄▆▇▇▇▇▇▇▇▆▆█
ngram_max,▅▁▅▅▁█▅█▁▁▁▅█▁▅▅▁▁▁▁▁▁▁▁▁▁▁▅▅█
objective_score,▁▇▆▇▄▇▄▇▆▇▅▇▇▆▅▇▇▇▇████▇█▇▇█▇▇
threshold,▁▂▆▄█▄▄█▇█▁▂▂▂▂▃▃▃▃▆▆▆▆▆▇▇▆▇▇▇

0,1
C,2.81647
class_weight_ratio,9.5
max_df,0.9
max_features,6000.0
min_df,10.0
ngram_max,3.0
objective_score,0.97418
threshold,0.45


[I 2025-12-01 01:16:18,461] A new study created in memory with name: xgboost-tuning
Best trial: 0. Best value: 0.972406:   3%|▎         | 1/30 [00:37<18:17, 37.84s/it]

[I 2025-12-01 01:16:56,299] Trial 0 finished with value: 0.9724062227942014 and parameters: {'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1205712628744377, 'min_child_weight': 6, 'subsample': 0.65, 'colsample_bytree': 0.65, 'scale_pos_weight': 1.5, 'threshold': 0.5}. Best is trial 0 with value: 0.9724062227942014.


Best trial: 0. Best value: 0.972406:   7%|▋         | 2/30 [01:55<28:34, 61.23s/it]

[I 2025-12-01 01:18:13,911] Trial 1 finished with value: 0.9542658513503676 and parameters: {'max_depth': 9, 'n_estimators': 400, 'learning_rate': 0.010725209743171996, 'min_child_weight': 10, 'subsample': 0.95, 'colsample_bytree': 0.65, 'scale_pos_weight': 2.5, 'threshold': 0.25}. Best is trial 0 with value: 0.9724062227942014.


Best trial: 0. Best value: 0.972406:  10%|█         | 3/30 [02:23<20:38, 45.88s/it]

[I 2025-12-01 01:18:41,515] Trial 2 finished with value: 0.9648218067962058 and parameters: {'max_depth': 6, 'n_estimators': 300, 'learning_rate': 0.04345454109729477, 'min_child_weight': 3, 'subsample': 0.85, 'colsample_bytree': 0.65, 'scale_pos_weight': 3.5, 'threshold': 0.30000000000000004}. Best is trial 0 with value: 0.9724062227942014.


Best trial: 0. Best value: 0.972406:  13%|█▎        | 4/30 [03:15<21:02, 48.57s/it]

[I 2025-12-01 01:19:34,220] Trial 3 finished with value: 0.9515151515151515 and parameters: {'max_depth': 7, 'n_estimators': 450, 'learning_rate': 0.019721610970574007, 'min_child_weight': 6, 'subsample': 0.85, 'colsample_bytree': 0.6, 'scale_pos_weight': 6.5, 'threshold': 0.25}. Best is trial 0 with value: 0.9724062227942014.


Best trial: 0. Best value: 0.972406:  17%|█▋        | 5/30 [03:40<16:37, 39.91s/it]

[I 2025-12-01 01:19:58,771] Trial 4 finished with value: 0.9699456087916982 and parameters: {'max_depth': 3, 'n_estimators': 500, 'learning_rate': 0.26690431824362526, 'min_child_weight': 9, 'subsample': 0.7, 'colsample_bytree': 0.6, 'scale_pos_weight': 7.5, 'threshold': 0.35000000000000003}. Best is trial 0 with value: 0.9724062227942014.


Best trial: 0. Best value: 0.972406:  20%|██        | 6/30 [04:02<13:30, 33.78s/it]

[I 2025-12-01 01:20:20,641] Trial 5 finished with value: 0.8957885396402854 and parameters: {'max_depth': 4, 'n_estimators': 300, 'learning_rate': 0.011240768803005551, 'min_child_weight': 10, 'subsample': 0.7, 'colsample_bytree': 0.85, 'scale_pos_weight': 3.5, 'threshold': 0.35000000000000003}. Best is trial 0 with value: 0.9724062227942014.


Best trial: 6. Best value: 0.974554:  23%|██▎       | 7/30 [04:25<11:36, 30.29s/it]

[I 2025-12-01 01:20:43,760] Trial 6 finished with value: 0.9745538914249067 and parameters: {'max_depth': 8, 'n_estimators': 150, 'learning_rate': 0.27051668818999286, 'min_child_weight': 8, 'subsample': 1.0, 'colsample_bytree': 1.0, 'scale_pos_weight': 6.5, 'threshold': 0.5}. Best is trial 6 with value: 0.9745538914249067.


Best trial: 6. Best value: 0.974554:  27%|██▋       | 8/30 [04:34<08:40, 23.64s/it]

[I 2025-12-01 01:20:53,149] Trial 7 finished with value: 0.785897988612544 and parameters: {'max_depth': 3, 'n_estimators': 150, 'learning_rate': 0.011662890273931383, 'min_child_weight': 4, 'subsample': 0.75, 'colsample_bytree': 0.7, 'scale_pos_weight': 8.5, 'threshold': 0.30000000000000004}. Best is trial 6 with value: 0.9745538914249067.


Best trial: 6. Best value: 0.974554:  30%|███       | 9/30 [05:00<08:33, 24.46s/it]

[I 2025-12-01 01:21:19,423] Trial 8 finished with value: 0.8847540021791971 and parameters: {'max_depth': 5, 'n_estimators': 300, 'learning_rate': 0.016149614799999188, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 1.0, 'scale_pos_weight': 8.0, 'threshold': 0.25}. Best is trial 6 with value: 0.9745538914249067.


Best trial: 6. Best value: 0.974554:  33%|███▎      | 10/30 [05:22<07:51, 23.60s/it]

[I 2025-12-01 01:21:41,078] Trial 9 finished with value: 0.9502802386548544 and parameters: {'max_depth': 3, 'n_estimators': 450, 'learning_rate': 0.11069143219393454, 'min_child_weight': 8, 'subsample': 0.9, 'colsample_bytree': 0.6, 'scale_pos_weight': 4.0, 'threshold': 0.2}. Best is trial 6 with value: 0.9745538914249067.


Best trial: 6. Best value: 0.974554:  37%|███▋      | 11/30 [06:01<08:58, 28.36s/it]

[I 2025-12-01 01:22:20,227] Trial 10 finished with value: 0.9700845027916101 and parameters: {'max_depth': 12, 'n_estimators': 100, 'learning_rate': 0.2704729722717776, 'min_child_weight': 1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'scale_pos_weight': 10.0, 'threshold': 0.5}. Best is trial 6 with value: 0.9745538914249067.


Best trial: 6. Best value: 0.974554:  40%|████      | 12/30 [06:28<08:21, 27.87s/it]

[I 2025-12-01 01:22:46,986] Trial 11 finished with value: 0.9693636036581902 and parameters: {'max_depth': 9, 'n_estimators': 200, 'learning_rate': 0.12773505139998823, 'min_child_weight': 6, 'subsample': 0.6, 'colsample_bytree': 0.85, 'scale_pos_weight': 1.5, 'threshold': 0.5}. Best is trial 6 with value: 0.9745538914249067.


Best trial: 12. Best value: 0.974741:  43%|████▎     | 13/30 [06:56<07:52, 27.81s/it]

[I 2025-12-01 01:23:14,644] Trial 12 finished with value: 0.9747414816070521 and parameters: {'max_depth': 9, 'n_estimators': 200, 'learning_rate': 0.1208402987479609, 'min_child_weight': 7, 'subsample': 1.0, 'colsample_bytree': 0.75, 'scale_pos_weight': 5.5, 'threshold': 0.45}. Best is trial 12 with value: 0.9747414816070521.


Best trial: 12. Best value: 0.974741:  47%|████▋     | 14/30 [07:36<08:24, 31.56s/it]

[I 2025-12-01 01:23:54,873] Trial 13 finished with value: 0.9724390932179475 and parameters: {'max_depth': 10, 'n_estimators': 200, 'learning_rate': 0.06582226994923604, 'min_child_weight': 7, 'subsample': 1.0, 'colsample_bytree': 0.75, 'scale_pos_weight': 5.0, 'threshold': 0.45}. Best is trial 12 with value: 0.9747414816070521.


Best trial: 12. Best value: 0.974741:  50%|█████     | 15/30 [08:19<08:46, 35.08s/it]

[I 2025-12-01 01:24:38,118] Trial 14 finished with value: 0.9737481861183875 and parameters: {'max_depth': 11, 'n_estimators': 200, 'learning_rate': 0.18696193737064473, 'min_child_weight': 4, 'subsample': 0.95, 'colsample_bytree': 0.9, 'scale_pos_weight': 6.5, 'threshold': 0.4}. Best is trial 12 with value: 0.9747414816070521.


Best trial: 12. Best value: 0.974741:  53%|█████▎    | 16/30 [08:36<06:52, 29.47s/it]

[I 2025-12-01 01:24:54,549] Trial 15 finished with value: 0.9653272484416741 and parameters: {'max_depth': 8, 'n_estimators': 100, 'learning_rate': 0.06689802001564371, 'min_child_weight': 8, 'subsample': 0.9, 'colsample_bytree': 0.8, 'scale_pos_weight': 5.5, 'threshold': 0.45}. Best is trial 12 with value: 0.9747414816070521.


Best trial: 16. Best value: 0.976968:  57%|█████▋    | 17/30 [09:09<06:40, 30.79s/it]

[I 2025-12-01 01:25:28,422] Trial 16 finished with value: 0.9769681663117025 and parameters: {'max_depth': 8, 'n_estimators': 250, 'learning_rate': 0.17197313826112898, 'min_child_weight': 8, 'subsample': 1.0, 'colsample_bytree': 0.95, 'scale_pos_weight': 6.5, 'threshold': 0.45}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 16. Best value: 0.976968:  60%|██████    | 18/30 [09:55<07:03, 35.28s/it]

[I 2025-12-01 01:26:14,162] Trial 17 finished with value: 0.9759768233722748 and parameters: {'max_depth': 10, 'n_estimators': 250, 'learning_rate': 0.1744115940116885, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 9.5, 'threshold': 0.4}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 16. Best value: 0.976968:  63%|██████▎   | 19/30 [11:47<10:40, 58.25s/it]

[I 2025-12-01 01:28:05,916] Trial 18 finished with value: 0.9734554718819077 and parameters: {'max_depth': 11, 'n_estimators': 350, 'learning_rate': 0.033450877692662105, 'min_child_weight': 4, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 10.0, 'threshold': 0.4}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 16. Best value: 0.976968:  67%|██████▋   | 20/30 [12:32<09:02, 54.22s/it]

[I 2025-12-01 01:28:50,755] Trial 19 finished with value: 0.9752951588302861 and parameters: {'max_depth': 10, 'n_estimators': 250, 'learning_rate': 0.17743720370311442, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.95, 'scale_pos_weight': 9.0, 'threshold': 0.4}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 16. Best value: 0.976968:  70%|███████   | 21/30 [13:12<07:29, 49.96s/it]

[I 2025-12-01 01:29:30,779] Trial 20 finished with value: 0.97623599085423 and parameters: {'max_depth': 7, 'n_estimators': 350, 'learning_rate': 0.08639446512220825, 'min_child_weight': 5, 'subsample': 0.75, 'colsample_bytree': 0.9, 'scale_pos_weight': 9.0, 'threshold': 0.4}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 16. Best value: 0.976968:  73%|███████▎  | 22/30 [13:54<06:22, 47.75s/it]

[I 2025-12-01 01:30:13,377] Trial 21 finished with value: 0.9765507941264608 and parameters: {'max_depth': 7, 'n_estimators': 350, 'learning_rate': 0.08422524892255819, 'min_child_weight': 5, 'subsample': 0.75, 'colsample_bytree': 0.9, 'scale_pos_weight': 9.0, 'threshold': 0.4}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 16. Best value: 0.976968:  77%|███████▋  | 23/30 [14:37<05:22, 46.14s/it]

[I 2025-12-01 01:30:55,749] Trial 22 finished with value: 0.9764803125939284 and parameters: {'max_depth': 7, 'n_estimators': 350, 'learning_rate': 0.08428404985669867, 'min_child_weight': 5, 'subsample': 0.75, 'colsample_bytree': 0.95, 'scale_pos_weight': 7.5, 'threshold': 0.45}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 16. Best value: 0.976968:  80%|████████  | 24/30 [15:12<04:17, 42.86s/it]

[I 2025-12-01 01:31:30,953] Trial 23 finished with value: 0.9699524587728421 and parameters: {'max_depth': 6, 'n_estimators': 350, 'learning_rate': 0.047387180343858935, 'min_child_weight': 3, 'subsample': 0.75, 'colsample_bytree': 0.95, 'scale_pos_weight': 7.5, 'threshold': 0.45}. Best is trial 16 with value: 0.9769681663117025.


Best trial: 24. Best value: 0.977049:  83%|████████▎ | 25/30 [15:59<03:41, 44.21s/it]

[I 2025-12-01 01:32:18,337] Trial 24 finished with value: 0.9770493035845189 and parameters: {'max_depth': 7, 'n_estimators': 400, 'learning_rate': 0.0846751074136459, 'min_child_weight': 7, 'subsample': 0.7, 'colsample_bytree': 0.95, 'scale_pos_weight': 7.0, 'threshold': 0.45}. Best is trial 24 with value: 0.9770493035845189.


Best trial: 24. Best value: 0.977049:  87%|████████▋ | 26/30 [16:32<02:42, 40.68s/it]

[I 2025-12-01 01:32:50,761] Trial 25 finished with value: 0.9537388524619941 and parameters: {'max_depth': 5, 'n_estimators': 400, 'learning_rate': 0.03193144174213447, 'min_child_weight': 7, 'subsample': 0.65, 'colsample_bytree': 0.85, 'scale_pos_weight': 6.5, 'threshold': 0.35000000000000003}. Best is trial 24 with value: 0.9770493035845189.


Best trial: 24. Best value: 0.977049:  90%|█████████ | 27/30 [17:25<02:13, 44.40s/it]

[I 2025-12-01 01:33:43,844] Trial 26 finished with value: 0.9761801482375573 and parameters: {'max_depth': 8, 'n_estimators': 400, 'learning_rate': 0.08567253865304958, 'min_child_weight': 7, 'subsample': 0.7, 'colsample_bytree': 0.95, 'scale_pos_weight': 4.5, 'threshold': 0.45}. Best is trial 24 with value: 0.9770493035845189.


Best trial: 24. Best value: 0.977049:  93%|█████████▎| 28/30 [17:44<01:13, 36.79s/it]

[I 2025-12-01 01:34:02,869] Trial 27 finished with value: 0.9714099459862172 and parameters: {'max_depth': 5, 'n_estimators': 250, 'learning_rate': 0.15183422653050652, 'min_child_weight': 9, 'subsample': 0.65, 'colsample_bytree': 0.8, 'scale_pos_weight': 7.0, 'threshold': 0.4}. Best is trial 24 with value: 0.9770493035845189.


Best trial: 24. Best value: 0.977049:  97%|█████████▋| 29/30 [18:33<00:40, 40.50s/it]

[I 2025-12-01 01:34:52,022] Trial 28 finished with value: 0.9760556513912849 and parameters: {'max_depth': 7, 'n_estimators': 450, 'learning_rate': 0.059869800338550265, 'min_child_weight': 8, 'subsample': 0.85, 'colsample_bytree': 0.95, 'scale_pos_weight': 8.5, 'threshold': 0.45}. Best is trial 24 with value: 0.9770493035845189.


Best trial: 24. Best value: 0.977049: 100%|██████████| 30/30 [19:18<00:00, 38.61s/it]

[I 2025-12-01 01:35:36,815] Trial 29 finished with value: 0.9769107529921778 and parameters: {'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1022268832486803, 'min_child_weight': 6, 'subsample': 0.7, 'colsample_bytree': 1.0, 'scale_pos_weight': 6.0, 'threshold': 0.5}. Best is trial 24 with value: 0.9770493035845189.
Best score: 0.9770
Recall: 0.9833
ROC-AUC: 0.9974
Precision: 0.9530
F1: 0.9679
Best params:
  max_depth: 7
  n_estimators: 400
  learning_rate: 0.0846751074136459
  min_child_weight: 7
  subsample: 0.7
  colsample_bytree: 0.95
  scale_pos_weight: 7.0
  threshold: 0.45





In [37]:
# Show the two Optuna diagnostics for the XGBoost study, one after the other:
# the optimization history first, then the hyperparameter importances.
# Both figures are rendered at the same fixed 800x500 size.
for plot_fn in (
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_param_importances,
):
    fig = plot_fn(xgb_study)
    fig.update_layout(width=800, height=500)
    fig.show()

## MLP Tuning

In [42]:
from re import X
from hybrid_mlp_model import HybridMLPClassifier

def mlp_objective(trial):
    """Optuna objective: train one HybridMLPClassifier and return its validation F2.

    Hyperparameters are sampled from ``trial``. The model is trained with Adam
    and a positively-weighted ``BCEWithLogitsLoss`` for up to 30 epochs, with
    early stopping on the mean validation batch loss (patience 5). The weights
    of the best epoch are restored before scoring, and predictions are
    thresholded at the sampled ``threshold``.

    Reads these notebook-level names: ``config``, ``numeric_features``,
    ``X_emb_train``, ``features_train``, ``y_train``, ``X_emb_val``,
    ``features_val``, ``y_val``.

    NOTE(review): the same validation split drives early stopping and the
    returned score, so the reported value is optimistic; confirm on a
    held-out test split.

    Args:
        trial: Optuna trial used for ``suggest_*`` sampling and for storing
            the per-trial metrics.

    Returns:
        F2 = 5 * P * R / (4 * P + R) on the validation split, or 0 when both
        precision and recall are 0.

    Side effects:
        Stores ``recall``, ``precision``, ``f1_score`` and ``roc_auc`` as user
        attributes on ``trial``; reseeds torch's global RNG.
    """
    params = {
        'feature_hidden_dim': trial.suggest_categorical('feature_hidden_dim', [16, 32, 64, 128]),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5, step=0.1),
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'pos_weight': trial.suggest_float('pos_weight', 1.0, 10.0, step=0.5),
        'threshold': trial.suggest_float('threshold', 0.2, 0.5, step=0.05)
    }

    max_epochs = 30
    patience = 5

    # Seed torch per trial so weight initialisation and batch shuffling do not
    # depend on how many cells/trials ran before this one. (GPU kernels may
    # still introduce small run-to-run differences.)
    torch.manual_seed(config["random_seed"])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = HybridMLPClassifier(
        embedding_dim=config["embedding_dim"],
        num_features=len(numeric_features),
        feature_hidden_dim=params['feature_hidden_dim'],
        dropout=params['dropout']
    ).to(device)

    def make_loader(embeddings, features, targets, shuffle):
        """Wrap three aligned arrays in a float-tensor DataLoader at the trial's batch size."""
        dataset = TensorDataset(
            torch.FloatTensor(embeddings),
            torch.FloatTensor(features),
            torch.FloatTensor(targets)
        )
        return DataLoader(dataset, batch_size=params['batch_size'], shuffle=shuffle)

    def forward_logits(embeddings, features):
        """Run the model on one batch and return its logits as a 1-D tensor.

        ``reshape(-1)`` is used instead of a bare ``squeeze()``: ``squeeze()``
        also drops the batch dimension when the last batch holds one sample,
        which makes the loss fail on a logits/labels size mismatch.
        Assumes the model emits exactly one logit per sample -- TODO confirm
        against HybridMLPClassifier.
        """
        return model(embeddings.to(device), features.to(device)).reshape(-1)

    train_loader = make_loader(X_emb_train, features_train, y_train, shuffle=True)
    # One unshuffled validation loader serves both early stopping and scoring.
    val_loader = make_loader(X_emb_val, features_val, y_val, shuffle=False)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=params['lr'],
        weight_decay=params['weight_decay']
    )

    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([params['pos_weight']], device=device)
    )

    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for _ in range(max_epochs):
        model.train()
        for embeddings, features, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(forward_logits(embeddings, features), labels.to(device))
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for embeddings, features, labels in val_loader:
                val_loss += criterion(
                    forward_logits(embeddings, features), labels.to(device)
                ).item()

        # Mean of the per-batch losses (not weighted by batch size).
        val_loss /= len(val_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Snapshot on CPU so the copy is independent of further training.
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    # best_model_state stays None if no epoch ever improved on +inf (e.g. the
    # validation loss was NaN throughout); then the last weights are scored.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        model.to(device)

    model.eval()
    prob_batches = []
    label_batches = []

    with torch.no_grad():
        for embeddings, features, labels in val_loader:
            prob_batches.append(torch.sigmoid(forward_logits(embeddings, features)).cpu())
            label_batches.append(labels.reshape(-1))

    val_probs = torch.cat(prob_batches).numpy()
    val_labels = torch.cat(label_batches).numpy()
    val_preds = (val_probs >= params['threshold']).astype(int)

    recall = recall_score(val_labels, val_preds, zero_division=0)
    precision = precision_score(val_labels, val_preds, zero_division=0)
    f1 = f1_score(val_labels, val_preds, zero_division=0)
    roc_auc = roc_auc_score(val_labels, val_probs)

    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('f1_score', f1)
    trial.set_user_attr('roc_auc', roc_auc)

    # F-beta with beta=2 weights recall more heavily than precision; the guard
    # avoids 0/0 when the model predicts no true positives.
    if precision + recall > 0:
        F2 = (5 * precision * recall) / (4 * precision + recall)
    else:
        F2 = 0
    return F2

In [43]:
# Callback that forwards each finished trial to Weights & Biases under the
# metric name "objective_score".
wandb_callback_mlp = WeightsAndBiasesCallback(
    wandb_kwargs={"project": "phishstop-detection", "tags": ["optuna", "mlp"]},
    metric_name="objective_score",
)

# In-memory study that maximizes the value returned by mlp_objective,
# sampled with a seeded TPE sampler.
mlp_study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=42),
    study_name="mlp-tuning",
    direction="maximize",
)

mlp_study.optimize(
    mlp_objective,
    callbacks=[wandb_callback_mlp],
    n_trials=config["n_trials_mlp"],
    show_progress_bar=True,
)

# Report the winning trial: objective value, stored metrics, then its parameters.
best_mlp_trial = mlp_study.best_trial
print(f"Best score: {best_mlp_trial.value:.4f}")
for metric_label, attr_key in (
    ("Recall", "recall"),
    ("ROC-AUC", "roc_auc"),
    ("Precision", "precision"),
    ("F1", "f1_score"),
):
    print(f"{metric_label}: {best_mlp_trial.user_attrs[attr_key]:.4f}")
print("Best params:")
for param_name, param_value in best_mlp_trial.params.items():
    print(f"  {param_name}: {param_value}")


WeightsAndBiasesCallback is experimental (supported from v2.9.0). The interface can change in the future.



0,1
batch_size,▁▃█▁▁█▃▃▃▁██████████████████▁▃
dropout,▁▃▅▁█▃▆█▁█▅▅▅▆▆▆▆▆▆▃█▆▆▆█▆▅▆█▅
feature_hidden_dim,▂▁▂▄▄▄▂▄▁▄██████████▁███████▁▂
lr,▁▂▁▂█▂▁▂▃▆▁▁▁▁▁▂▁▄▁▁▁▁▂▁▃▂▃▁▁▁
objective_score,▆▇▇▇▁▅▇▆▇▂▇█▇██▇█▅█▇▇█▇▇▆█▇███
pos_weight,▁▃█▃▁▆▃▆▄▇▅▅▅▅▆▆▄▆▂▅█▄▅▇▆▂▅▄▄▃
threshold,█▃▇▄▃▂▁▄▇▄▇▇▇███▆█▆▆█▆█▆▇▃█▆▆▄
weight_decay,▁▁▂▁▂▁▁▁▁▂▆▇██▄▄▄▄▅▃▃▅▅█▅▂▃▆▅▃

0,1
batch_size,64.0
dropout,0.3
feature_hidden_dim,32.0
lr,0.00025
objective_score,0.98083
pos_weight,3.0
threshold,0.35
weight_decay,0.00034


[I 2025-12-01 02:36:49,796] A new study created in memory with name: mlp-tuning
Best trial: 0. Best value: 0.978797:   3%|▎         | 1/30 [02:06<1:00:54, 126.01s/it]

[I 2025-12-01 02:38:55,803] Trial 0 finished with value: 0.9787969839560067 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.1, 'lr': 0.00020511104188433984, 'weight_decay': 1.3066739238053272e-05, 'batch_size': 32, 'pos_weight': 1.0, 'threshold': 0.5}. Best is trial 0 with value: 0.9787969839560067.


Best trial: 0. Best value: 0.978797:   7%|▋         | 2/30 [02:48<35:58, 77.10s/it]   

[I 2025-12-01 02:39:38,666] Trial 1 finished with value: 0.977984837733876 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.2, 'lr': 0.0011207606211860567, 'weight_decay': 7.309539835912905e-05, 'batch_size': 64, 'pos_weight': 3.5, 'threshold': 0.30000000000000004}. Best is trial 0 with value: 0.9787969839560067.


Best trial: 2. Best value: 0.978883:  10%|█         | 3/30 [04:11<35:55, 79.83s/it]

[I 2025-12-01 02:41:01,738] Trial 2 finished with value: 0.9788829866826277 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.30000000000000004, 'lr': 0.0001238513729886094, 'weight_decay': 0.000164092867306479, 'batch_size': 128, 'pos_weight': 10.0, 'threshold': 0.45}. Best is trial 2 with value: 0.9788829866826277.


Best trial: 2. Best value: 0.978883:  13%|█▎        | 4/30 [05:36<35:25, 81.76s/it]

[I 2025-12-01 02:42:26,476] Trial 3 finished with value: 0.9785274350862083 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.1, 'lr': 0.0009780337016659412, 'weight_decay': 1.1715937392307055e-05, 'batch_size': 32, 'pos_weight': 3.5, 'threshold': 0.35000000000000003}. Best is trial 2 with value: 0.9788829866826277.


Best trial: 2. Best value: 0.978883:  17%|█▋        | 5/30 [06:46<32:19, 77.57s/it]

[I 2025-12-01 02:43:36,616] Trial 4 finished with value: 0.9621899859616284 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.006161049539380964, 'weight_decay': 0.0001569639638866114, 'batch_size': 32, 'pos_weight': 1.0, 'threshold': 0.30000000000000004}. Best is trial 2 with value: 0.9788829866826277.


Best trial: 2. Best value: 0.978883:  20%|██        | 6/30 [07:13<24:04, 60.17s/it]

[I 2025-12-01 02:44:03,018] Trial 5 finished with value: 0.974053673102246 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.2, 'lr': 0.001217284708112243, 'weight_decay': 1.913588048769229e-05, 'batch_size': 128, 'pos_weight': 8.0, 'threshold': 0.25}. Best is trial 2 with value: 0.9788829866826277.


Best trial: 2. Best value: 0.978883:  23%|██▎       | 7/30 [09:06<29:45, 77.61s/it]

[I 2025-12-01 02:45:56,525] Trial 6 finished with value: 0.9777989039257357 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.4, 'lr': 0.00014063366777718192, 'weight_decay': 5.211124595788268e-05, 'batch_size': 64, 'pos_weight': 4.0, 'threshold': 0.2}. Best is trial 2 with value: 0.9788829866826277.


Best trial: 2. Best value: 0.978883:  27%|██▋       | 8/30 [10:29<29:02, 79.21s/it]

[I 2025-12-01 02:47:19,155] Trial 7 finished with value: 0.9778614991513885 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.0008798929749689024, 'weight_decay': 1.7345566642360933e-05, 'batch_size': 64, 'pos_weight': 8.0, 'threshold': 0.35000000000000003}. Best is trial 2 with value: 0.9788829866826277.


Best trial: 8. Best value: 0.979459:  30%|███       | 9/30 [11:26<25:15, 72.17s/it]

[I 2025-12-01 02:48:15,861] Trial 8 finished with value: 0.979458730039733 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.1, 'lr': 0.0018742210985555703, 'weight_decay': 4.253162363790868e-05, 'batch_size': 64, 'pos_weight': 4.5, 'threshold': 0.45}. Best is trial 8 with value: 0.979458730039733.


Best trial: 8. Best value: 0.979459:  33%|███▎      | 10/30 [13:25<28:53, 86.67s/it]

[I 2025-12-01 02:50:15,001] Trial 9 finished with value: 0.9642444366985269 and parameters: {'feature_hidden_dim': 64, 'dropout': 0.5, 'lr': 0.004132765459466363, 'weight_decay': 0.00018484491720988634, 'batch_size': 32, 'pos_weight': 9.0, 'threshold': 0.35000000000000003}. Best is trial 8 with value: 0.979458730039733.


Best trial: 8. Best value: 0.979459:  37%|███▋      | 11/30 [14:33<25:38, 80.95s/it]

[I 2025-12-01 02:51:22,974] Trial 10 finished with value: 0.9734151329243352 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.2, 'lr': 0.002467377537662446, 'weight_decay': 0.0007321173243252594, 'batch_size': 64, 'pos_weight': 6.0, 'threshold': 0.45}. Best is trial 8 with value: 0.979458730039733.


Best trial: 11. Best value: 0.980598:  40%|████      | 12/30 [15:40<23:00, 76.68s/it]

[I 2025-12-01 02:52:29,879] Trial 11 finished with value: 0.9805977975878343 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.0003528543537143545, 'weight_decay': 0.0003766868457380084, 'batch_size': 128, 'pos_weight': 6.0, 'threshold': 0.45}. Best is trial 11 with value: 0.9805977975878343.


Best trial: 11. Best value: 0.980598:  43%|████▎     | 13/30 [17:10<22:55, 80.92s/it]

[I 2025-12-01 02:54:00,576] Trial 12 finished with value: 0.9790190175215513 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.30000000000000004, 'lr': 0.000435494315175931, 'weight_decay': 0.0007300593718853121, 'batch_size': 128, 'pos_weight': 6.0, 'threshold': 0.45}. Best is trial 11 with value: 0.9805977975878343.


Best trial: 13. Best value: 0.980616:  47%|████▋     | 14/30 [18:09<19:49, 74.33s/it]

[I 2025-12-01 02:54:59,657] Trial 13 finished with value: 0.9806163767246551 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.00035911238326165643, 'weight_decay': 0.00036566554029085154, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.5}. Best is trial 13 with value: 0.9806163767246551.


Best trial: 14. Best value: 0.981737:  50%|█████     | 15/30 [19:39<19:44, 78.99s/it]

[I 2025-12-01 02:56:29,465] Trial 14 finished with value: 0.9817373024512933 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0003441353757551813, 'weight_decay': 0.00039841599348177986, 'batch_size': 128, 'pos_weight': 7.0, 'threshold': 0.5}. Best is trial 14 with value: 0.9817373024512933.


Best trial: 14. Best value: 0.981737:  53%|█████▎    | 16/30 [20:30<16:26, 70.46s/it]

[I 2025-12-01 02:57:20,093] Trial 15 finished with value: 0.9788028760855357 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0004044758550598607, 'weight_decay': 0.00040902917446293687, 'batch_size': 128, 'pos_weight': 7.5, 'threshold': 0.5}. Best is trial 14 with value: 0.9817373024512933.


Best trial: 14. Best value: 0.981737:  57%|█████▋    | 17/30 [21:39<15:12, 70.17s/it]

[I 2025-12-01 02:58:29,601] Trial 16 finished with value: 0.9788898014036136 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0002798444336325415, 'weight_decay': 0.0003284333378375951, 'batch_size': 128, 'pos_weight': 7.0, 'threshold': 0.4}. Best is trial 14 with value: 0.9817373024512933.


Best trial: 14. Best value: 0.981737:  60%|██████    | 18/30 [23:06<15:01, 75.13s/it]

[I 2025-12-01 02:59:56,292] Trial 17 finished with value: 0.9793775778027746 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0005295508228149763, 'weight_decay': 0.0009925953633787147, 'batch_size': 128, 'pos_weight': 5.0, 'threshold': 0.5}. Best is trial 14 with value: 0.9817373024512933.


Best trial: 14. Best value: 0.981737:  63%|██████▎   | 19/30 [24:16<13:30, 73.68s/it]

[I 2025-12-01 03:01:06,594] Trial 18 finished with value: 0.9816187537627936 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0006300375551641955, 'weight_decay': 0.00025458030290882575, 'batch_size': 128, 'pos_weight': 2.5, 'threshold': 0.4}. Best is trial 14 with value: 0.9817373024512933.


Best trial: 19. Best value: 0.981946:  67%|██████▋   | 20/30 [25:52<13:24, 80.40s/it]

[I 2025-12-01 03:02:42,650] Trial 19 finished with value: 0.9819457807459393 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.0005957412292147004, 'weight_decay': 0.00023686574806450268, 'batch_size': 128, 'pos_weight': 2.0, 'threshold': 0.4}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  70%|███████   | 21/30 [27:24<12:34, 83.86s/it]

[I 2025-12-01 03:04:14,561] Trial 20 finished with value: 0.9819209039548022 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.00022965237156074138, 'weight_decay': 0.00010083823242381911, 'batch_size': 128, 'pos_weight': 2.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  73%|███████▎  | 22/30 [28:56<11:28, 86.12s/it]

[I 2025-12-01 03:05:45,960] Trial 21 finished with value: 0.9815985549785504 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.00020519670353929582, 'weight_decay': 0.00011485095144920027, 'batch_size': 128, 'pos_weight': 3.0, 'threshold': 0.4}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  77%|███████▋  | 23/30 [30:10<09:38, 82.61s/it]

[I 2025-12-01 03:07:00,391] Trial 22 finished with value: 0.9807146482208983 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.0002171190925073822, 'weight_decay': 9.59748818172746e-05, 'batch_size': 128, 'pos_weight': 2.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  80%|████████  | 24/30 [31:41<08:30, 85.13s/it]

[I 2025-12-01 03:08:31,394] Trial 23 finished with value: 0.9791463323072301 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.00010183214974222891, 'weight_decay': 0.00022469870960372876, 'batch_size': 128, 'pos_weight': 2.0, 'threshold': 0.30000000000000004}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  83%|████████▎ | 25/30 [32:59<06:55, 83.04s/it]

[I 2025-12-01 03:09:49,558] Trial 24 finished with value: 0.9773720702389027 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.0007847950900043854, 'weight_decay': 0.0005676076056081949, 'batch_size': 128, 'pos_weight': 1.5, 'threshold': 0.4}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  87%|████████▋ | 26/30 [34:17<05:25, 81.32s/it]

[I 2025-12-01 03:11:06,870] Trial 25 finished with value: 0.9789485471138235 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.00016269164303798984, 'weight_decay': 0.00011130178370965616, 'batch_size': 128, 'pos_weight': 7.0, 'threshold': 0.35000000000000003}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  90%|█████████ | 27/30 [35:28<03:54, 78.22s/it]

[I 2025-12-01 03:12:17,863] Trial 26 finished with value: 0.9810668020380919 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.5, 'lr': 0.000277225614076926, 'weight_decay': 3.3720942817476375e-05, 'batch_size': 128, 'pos_weight': 2.0, 'threshold': 0.25}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  93%|█████████▎| 28/30 [36:12<02:16, 68.16s/it]

[I 2025-12-01 03:13:02,545] Trial 27 finished with value: 0.980467721923525 and parameters: {'feature_hidden_dim': 128, 'dropout': 0.4, 'lr': 0.0005959942777273677, 'weight_decay': 8.100407637142324e-05, 'batch_size': 128, 'pos_weight': 4.0, 'threshold': 0.45}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946:  97%|█████████▋| 29/30 [37:07<01:04, 64.24s/it]

[I 2025-12-01 03:13:57,646] Trial 28 finished with value: 0.9792118447618336 and parameters: {'feature_hidden_dim': 16, 'dropout': 0.5, 'lr': 0.0018836929731922672, 'weight_decay': 0.0002626045912808322, 'batch_size': 128, 'pos_weight': 3.0, 'threshold': 0.4}. Best is trial 19 with value: 0.9819457807459393.


Best trial: 19. Best value: 0.981946: 100%|██████████| 30/30 [40:24<00:00, 80.82s/it] 

[I 2025-12-01 03:17:14,329] Trial 29 finished with value: 0.9777106157914619 and parameters: {'feature_hidden_dim': 32, 'dropout': 0.4, 'lr': 0.0002452510912989698, 'weight_decay': 0.0005475202356640898, 'batch_size': 32, 'pos_weight': 1.5, 'threshold': 0.5}. Best is trial 19 with value: 0.9819457807459393.
Best score: 0.9819
Recall: 0.9846
ROC-AUC: 0.9982
Precision: 0.9715
F1: 0.9780
Best params:
  feature_hidden_dim: 128
  dropout: 0.5
  lr: 0.0005957412292147004
  weight_decay: 0.00023686574806450268
  batch_size: 128
  pos_weight: 2.0
  threshold: 0.4





In [44]:
# Show the two Optuna diagnostics for the MLP study at a fixed 800x500 size:
# the optimization history first, then the hyperparameter importances.
mlp_plotters = [
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_param_importances,
]
for make_figure in mlp_plotters:
    fig = make_figure(mlp_study)
    fig.update_layout(width=800, height=500)
    fig.show()

## Save Results

In [45]:
def _best_trial_summary(study):
    """Collect a study's best-trial objective value, stored metrics and parameters in a dict."""
    best = study.best_trial
    summary = {'best_score': best.value}
    for metric_key in ('recall', 'roc_auc', 'precision', 'f1_score'):
        summary[metric_key] = best.user_attrs[metric_key]
    summary['best_params'] = best.params
    return summary


# One summary per tuned model, keyed by the model's short name.
results = {
    model_key: _best_trial_summary(study)
    for model_key, study in (
        ('tfidf', tfidf_study),
        ('xgboost', xgb_study),
        ('mlp', mlp_study),
    )
}

os.makedirs("../../output/tuning_results", exist_ok=True)

# Persist the summaries as JSON next to the other tuning artefacts.
with open('../../output/tuning_results/optuna_results.json', 'w') as f:
    json.dump(results, f, indent=2)

# Print the same figures for a quick side-by-side comparison in the notebook.
for model_name, summary in results.items():
    print(f"{model_name.upper()}:")
    for line_label, summary_key in (
        ("Score F2 = (5 * precision * recall) / (4 * precision + recall)", 'best_score'),
        ("Recall", 'recall'),
        ("ROC-AUC", 'roc_auc'),
        ("Precision", 'precision'),
        ("F1 Score", 'f1_score'),
    ):
        print(f"{line_label}: {summary[summary_key]:.4f}")

TFIDF:
Score F2 = (5 * precision * recall) / (4 * precision + recall): 0.9790
Recall: 0.9925
ROC-AUC: 0.9973
Precision: 0.9285
F1 Score: 0.9595
XGBOOST:
Score F2 = (5 * precision * recall) / (4 * precision + recall): 0.9770
Recall: 0.9833
ROC-AUC: 0.9974
Precision: 0.9530
F1 Score: 0.9679
MLP:
Score F2 = (5 * precision * recall) / (4 * precision + recall): 0.9819
Recall: 0.9846
ROC-AUC: 0.9982
Precision: 0.9715
F1 Score: 0.9780
