# Phishing URL Tree-Based Model Experiments

This notebook explores various tree-based models using the Kaggle phishing URL dataset.

For the tree-based models, we will be experimenting with:

1. Random Forest
2. XGBoost
3. LightGBM
4. CatBoost

## Setup and Imports

In [8]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Tree-based models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Import ModelSaver
import sys
import os
sys.path.append(os.path.abspath('.'))
from save_model import ModelSaver

# Display settings
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [9]:
# Configuration
SAVE_MODELS = True
SEED = 42
np.random.seed(SEED)

# Check for Google Drive (if running in Colab)
use_drive = False
try:
    from google.colab import drive
    drive.mount('/content/drive')
    use_drive = True
    drive_root = '/content/drive/MyDrive/fraud-grp-proj/'
except ImportError:
    pass

In [10]:
# Load train and test datasets
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

print(f"Train with features shape: {train_w_features_df.shape}")
print(f"Test with features shape: {test_w_features_df.shape}")

# Prepare text data for TF-IDF
X_text = train_df['url'].values
X_text_test = test_df['url'].values

Train shape: (9143, 2)
Test shape: (2286, 2)
Train with features shape: (9143, 78)
Test with features shape: (2286, 78)


Due to the robust nature of tree-based models, we will just be using the full feature set including originals and transformed features, unlike our approach for linear and neural network models.

In [11]:
# Prepare X and y
non_text_cols = train_w_features_df.select_dtypes(exclude=[object]).columns.tolist()
if 'target' in non_text_cols:
    non_text_cols.remove('target')

# We will use the DataFrames directly
X_train_df = train_w_features_df.copy()
y_train = train_w_features_df['target'].values

X_test_df = test_w_features_df.copy()
if 'target' in test_w_features_df.columns:
    y_test = test_w_features_df['target'].values
else:
    y_test = np.zeros(len(test_w_features_df))

print(f"Numeric features: {len(non_text_cols)}")
print(f"Total features (including url): {len(non_text_cols) + 1}")


Numeric features: 72
Total features (including url): 73


## Training Models

Now lets move on to training the models. We use the `ModelSaver` utility to help us standardize the storing of metrics and models for evaluation later on.

Since we found that combined features worked best for linear models, we will focus on combined features (TF-IDF + Numeric) for tree-based models as well. That said, to help with performance, we will perform SVD on the TF-IDF features to reduce dimensionality before combining with numeric features.

Ultimately, we will be experimenting with:
1. Numeric features only
2. Combined features (TF-IDF + SVD + Numeric)

In [None]:
def run_tree_experiment(model_class, model_name, model_params, experiment_name, X_train, y_train, X_test, numeric_features, text_feature=None, save_model=True, n_svd_components=100, tfidf_max_features=5000, tfidf_ngram_range=(3,5), **kwargs):
    print(f"\n=== Running Experiment: {experiment_name} ({model_name}) ===")
    print(f"Saving Model: {save_model}")

    saver = None
    if save_model:
        if use_drive:
            base_path = drive_root + "experiments"
        else:
            base_path = "experiments"
        saver = ModelSaver(base_path=base_path)
        saver.start_experiment(
            experiment_name=experiment_name,
            model_type=model_name,
            vectorizer="Tfidf+SVD" if text_feature else "Numeric",
            vectorizer_params={'max_features': tfidf_max_features, 'ngram_range': tfidf_ngram_range, 'n_components': n_svd_components} if text_feature else {},
            model_params=model_params,
            n_folds=5,
            save_format="pickle"
        )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    fold_test_preds = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
        print(f"\n--- Fold {fold}/5 ---")
        
        # Split data
        X_train_fold = X_train.iloc[train_idx]
        y_train_fold = y_train[train_idx]
        X_val_fold = X_train.iloc[val_idx]
        y_val_fold = y_train[val_idx]

        transformers = []
        feature_names_out = []

        # 1. Text Pipeline (TF-IDF + SVD)
        if text_feature:
            text_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=tfidf_max_features, analyzer='char', ngram_range=tfidf_ngram_range)),
                ('svd', TruncatedSVD(n_components=n_svd_components, random_state=SEED))
            ])
            transformers.append(('text', text_pipeline, text_feature))
            # Generate feature names for SVD components
            feature_names_out.extend([f'svd_{i}' for i in range(n_svd_components)])

        # 2. Numeric Pipeline
        if numeric_features:
            transformers.append(('numeric', 'passthrough', numeric_features))
            feature_names_out.extend(numeric_features)

        # Column Transformer
        preprocessor = ColumnTransformer(transformers)

        # 3. Full Pipeline
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model_class(**model_params))
        ])

        # Train
        pipeline.fit(X_train_fold, y_train_fold)

        # Validation predictions
        val_probs = pipeline.predict_proba(X_val_fold)[:, 1]
        val_preds = (val_probs > 0.5).astype(int)

        # Calculate metrics
        tn, fp, fn, tp = confusion_matrix(y_val_fold, val_preds).ravel()
        
        metrics = {
            'fold': fold,
            'accuracy': accuracy_score(y_val_fold, val_preds),
            'precision': precision_score(y_val_fold, val_preds, zero_division=0),
            'recall': recall_score(y_val_fold, val_preds, zero_division=0),
            'f1': f1_score(y_val_fold, val_preds, zero_division=0),
            'roc_auc': roc_auc_score(y_val_fold, val_probs),
            'TP': int(tp),
            'FP': int(fp),
            'TN': int(tn),
            'FN': int(fn),
            'train_size': len(train_idx),
            'val_size': len(val_idx)
        }
        
        print(f"Fold {fold} Val AUC: {metrics['roc_auc']:.4f}")

        # Test predictions
        test_probs = pipeline.predict_proba(X_test)[:, 1]
        fold_test_preds.append(test_probs)

        if save_model and saver:
            saver.add_fold(
                fold_model=pipeline,
                fold_metric=metrics,
                test_predictions=test_probs,
                feature_names=feature_names_out
            )

    if save_model and saver:
        saver.finalize_experiment(**kwargs)
        print(f"Experiment saved to {saver._exp_dir}")

    return pipeline


### 1. Random Forest

#### 1.1. Numeric Features

In [13]:
# Default params
rf_params = {
    'random_state': SEED,
    'verbose': 0
}

run_tree_experiment(
    RandomForestClassifier, 
    "RandomForest", 
    rf_params, 
    "exp_2_random_forest_numeric", 
    X_train=X_train_df, 
    y_train=y_train, 
    X_test=X_test_df, 
    numeric_features=non_text_cols,
    text_feature=None, 
    save_model=SAVE_MODELS
)


=== Running Experiment: exp_2_random_forest_numeric (RandomForest) ===
Saving Model: True
Experiment 'exp_2_random_forest_numeric' initialized at: experiments/exp_2_random_forest_numeric
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9739
  Fold 1/5 saved | ROC AUC: 0.9739

--- Fold 2/5 ---
Fold 1 Val AUC: 0.9739
  Fold 1/5 saved | ROC AUC: 0.9739

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9716
  Fold 2/5 saved | ROC AUC: 0.9716

--- Fold 3/5 ---
Fold 2 Val AUC: 0.9716
  Fold 2/5 saved | ROC AUC: 0.9716

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9705
  Fold 3/5 saved | ROC AUC: 0.9705

--- Fold 4/5 ---
Fold 3 Val AUC: 0.9705
  Fold 3/5 saved | ROC AUC: 0.9705

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9700
  Fold 4/5 saved | ROC AUC: 0.9700

--- Fold 5/5 ---
Fold 4 Val AUC: 0.9700
  Fold 4/5 saved | ROC AUC: 0.9700

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9707
  Fold 5/5 saved | ROC AUC: 0.9707

Finalizing experiment...
  Predictions saved to experiments/exp_2_random_forest_numeric/exp_

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### 1.2. TF-IDF + SVD + Engineered Features

In [14]:
run_tree_experiment(
    RandomForestClassifier, 
    "RandomForest", 
    rf_params, 
    "exp_2_random_forest_all", 
    X_train=X_train_df, 
    y_train=y_train, 
    X_test=X_test_df, 
    numeric_features=non_text_cols, 
    text_feature='url', 
    save_model=SAVE_MODELS
)


=== Running Experiment: exp_2_random_forest_all (RandomForest) ===
Saving Model: True
Experiment 'exp_2_random_forest_all' initialized at: experiments/exp_2_random_forest_all
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9822
Fold 1 Val AUC: 0.9822
  Fold 1/5 saved | ROC AUC: 0.9822

--- Fold 2/5 ---
  Fold 1/5 saved | ROC AUC: 0.9822

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9794
  Fold 2/5 saved | ROC AUC: 0.9794

--- Fold 3/5 ---
Fold 2 Val AUC: 0.9794
  Fold 2/5 saved | ROC AUC: 0.9794

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9809
  Fold 3/5 saved | ROC AUC: 0.9809

--- Fold 4/5 ---
Fold 3 Val AUC: 0.9809
  Fold 3/5 saved | ROC AUC: 0.9809

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9838
  Fold 4/5 saved | ROC AUC: 0.9838

--- Fold 5/5 ---
Fold 4 Val AUC: 0.9838
  Fold 4/5 saved | ROC AUC: 0.9838

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9784
  Fold 5/5 saved | ROC AUC: 0.9784

Finalizing experiment...
  Predictions saved to experiments/exp_2_random_forest_all/exp_2_random_forest_

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,100
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Since using combined features worked best for our baseline random forest, we will try using it for the rest of the tree-based models as well.

### 2. XGBoost

In [15]:
# Default XGBoost params
xgb_params = {
    'random_state': SEED,
    'verbosity': 0
}

run_tree_experiment(
    XGBClassifier, 
    "XGBoost", 
    xgb_params, 
    "exp_2_xgboost_all", 
    X_train=X_train_df, 
    y_train=y_train, 
    X_test=X_test_df, 
    numeric_features=non_text_cols, 
    text_feature='url', 
    save_model=SAVE_MODELS
)


=== Running Experiment: exp_2_xgboost_all (XGBoost) ===
Saving Model: True
Experiment 'exp_2_xgboost_all' initialized at: experiments/exp_2_xgboost_all
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9870
  Fold 1/5 saved | ROC AUC: 0.9870

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9837
  Fold 2/5 saved | ROC AUC: 0.9837

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9833
  Fold 3/5 saved | ROC AUC: 0.9833

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9858
  Fold 4/5 saved | ROC AUC: 0.9858

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9818
  Fold 5/5 saved | ROC AUC: 0.9818

Finalizing experiment...
  Predictions saved to experiments/exp_2_xgboost_all/exp_2_xgboost_all_prediction.csv

✓ Experiment 'exp_2_xgboost_all' finalized!
  Location: experiments/exp_2_xgboost_all
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9870)
  Average ROC AUC: 0.9843 ± 0.0019
Experiment saved to experiments/exp_2_xgboost_all


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,100
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


### 3. LightGBM

In [16]:
# Default LightGBM params
lgbm_params = {
    'random_state': SEED,
    'verbose': -1
}

run_tree_experiment(
    LGBMClassifier, 
    "LightGBM", 
    lgbm_params, 
    "exp_2_lgbm_all", 
    X_train=X_train_df, 
    y_train=y_train, 
    X_test=X_test_df, 
    numeric_features=non_text_cols, 
    text_feature='url', 
    save_model=SAVE_MODELS
)


=== Running Experiment: exp_2_lgbm_all (LightGBM) ===
Saving Model: True
Experiment 'exp_2_lgbm_all' initialized at: experiments/exp_2_lgbm_all
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9855
  Fold 1/5 saved | ROC AUC: 0.9855

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9840
  Fold 2/5 saved | ROC AUC: 0.9840

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9820
  Fold 3/5 saved | ROC AUC: 0.9820

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9841
  Fold 4/5 saved | ROC AUC: 0.9841

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9817
  Fold 5/5 saved | ROC AUC: 0.9817

Finalizing experiment...
  Predictions saved to experiments/exp_2_lgbm_all/exp_2_lgbm_all_prediction.csv

✓ Experiment 'exp_2_lgbm_all' finalized!
  Location: experiments/exp_2_lgbm_all
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9855)
  Average ROC AUC: 0.9834 ± 0.0014
Experiment saved to experiments/exp_2_lgbm_all


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,100
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


### 4. CatBoost

In [17]:
# Default CatBoost params
catboost_params = {
    'random_state': SEED,
    'verbose': 0
}

run_tree_experiment(
    CatBoostClassifier, 
    "CatBoost", 
    catboost_params, 
    "exp_2_catboost_all", 
    X_train=X_train_df, 
    y_train=y_train, 
    X_test=X_test_df, 
    numeric_features=non_text_cols, 
    text_feature='url', 
    save_model=SAVE_MODELS
)


=== Running Experiment: exp_2_catboost_all (CatBoost) ===
Saving Model: True
Experiment 'exp_2_catboost_all' initialized at: experiments/exp_2_catboost_all
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9862
  Fold 1/5 saved | ROC AUC: 0.9862

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9857
  Fold 2/5 saved | ROC AUC: 0.9857

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9843
  Fold 3/5 saved | ROC AUC: 0.9843

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9878
  Fold 4/5 saved | ROC AUC: 0.9878

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9829
  Fold 5/5 saved | ROC AUC: 0.9829

Finalizing experiment...
  Predictions saved to experiments/exp_2_catboost_all/exp_2_catboost_all_prediction.csv

✓ Experiment 'exp_2_catboost_all' finalized!
  Location: experiments/exp_2_catboost_all
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9878)
  Average ROC AUC: 0.9854 ± 0.0017
Experiment saved to experiments/exp_2_catboost_all


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,100
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0


In [18]:
# Pre-calculate features for Optuna to speed up tuning
print("Pre-calculating features for Optuna...")

# 1. Text Features (TF-IDF + SVD)
tfidf = TfidfVectorizer(max_features=5000, analyzer='char', ngram_range=(3, 5))
svd = TruncatedSVD(n_components=100, random_state=SEED)

X_text_tfidf = tfidf.fit_transform(X_train_df['url'])
X_text_svd = svd.fit_transform(X_text_tfidf)

# 2. Numeric Features
X_numeric = X_train_df[non_text_cols].values

# 3. Combine
X_combined = np.hstack([X_text_svd, X_numeric])
y = y_train

print(f"Combined features shape: {X_combined.shape}")

Pre-calculating features for Optuna...
Combined features shape: (9143, 172)


## Optuna Hyperparameter Tuning

Now we can perform hyperparameter tuning using Optuna for the best tree-based model, CatBoost (on numeric features).

In [None]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.exceptions import TrialPruned

# Caches for TF-IDF and SVD transforms so repeated combinations don't recompute
# across Optuna trials. This is an in-memory cache and will persist while the
# kernel/notebook session is active.
tfidf_cache = {}
svd_cache = {}

print(f"Optuna version: {optuna.__version__}")


Optuna version: 4.6.0


In [None]:
def objective(trial):
    # -------------------------
    # Hyperparameter Search Space
    # -------------------------
    params = {
        'iterations': trial.suggest_int('iterations', 300, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),

        # Depth (CatBoost supports depth 1–16 but 4–12 is typically optimal)
        'depth': trial.suggest_int('depth', 3, 12),

        # L2 regularization
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-6, 200.0, log=True),

        # Bootstrap
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', 
            ['Bayesian', 'Bernoulli', 'MVS']
        ),

        # Feature bagging / randomness
        'random_strength': trial.suggest_float('random_strength', 1e-8, 50.0, log=True),

        # Leaf estimation
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),

        # Growing policy
        'grow_policy': trial.suggest_categorical(
            'grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']
        ),

        # Other CatBoost settings
        'task_type': 'CPU',
        'eval_metric': 'AUC',
        'use_best_model': True,
        'random_seed': SEED,
        'verbose': False
    }

    # If Bayesian bootstrap → bagging_temperature is meaningful
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 10.0)
    elif params['bootstrap_type'] == 'Bernoulli':
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)

    # -------------------------
    # TF-IDF & SVD Search Space
    # -------------------------
    # Number of TF-IDF features (chars) and n-gram range
    max_features = trial.suggest_categorical("max_features", [1000, 5000, 10000])

    # Choose min ngram size then choose max ngram size >= min
    ngram_min = trial.suggest_int("ngram_min", 1, 3)
    ngram_max = trial.suggest_int("ngram_max", ngram_min, 5)
    ngram_range = (ngram_min, ngram_max)

    # TruncatedSVD components for dimensionality reduction
    n_svd_components = trial.suggest_int("n_svd_components", 25, 200)

    # -------------------------
    # Cross-Validation
    # -------------------------
    # Use caching for TF-IDF and SVD transforms to avoid recomputation across
    # trials of the same hyperparameter combinations.
    tfidf_key = (max_features, ngram_range)
    if tfidf_key in tfidf_cache:
        X_text_tfidf_trial = tfidf_cache[tfidf_key]
    else:
        tfidf_trial = TfidfVectorizer(max_features=max_features, analyzer='char', ngram_range=ngram_range)
        X_text_tfidf_trial = tfidf_trial.fit_transform(X_train_df['url'])
        tfidf_cache[tfidf_key] = X_text_tfidf_trial

    svd_key = (max_features, ngram_range, n_svd_components)
    if svd_key in svd_cache:
        X_text_svd_trial = svd_cache[svd_key]
    else:
        svd_trial = TruncatedSVD(n_components=n_svd_components, random_state=SEED)
        X_text_svd_trial = svd_trial.fit_transform(X_text_tfidf_trial)
        svd_cache[svd_key] = X_text_svd_trial

    X_numeric_trial = X_train_df[non_text_cols].values

    X_combined_trial = np.hstack([X_text_svd_trial, X_numeric_trial])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    cv_scores = []

    for train_idx, val_idx in skf.split(X_combined_trial, y):
        X_train, X_val = X_combined_trial[train_idx], X_combined_trial[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = CatBoostClassifier(**params)

        model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            early_stopping_rounds=100,
            verbose=False
        )

        val_probs = model.predict_proba(X_val)[:, 1]
        roc_auc = roc_auc_score(y_val, val_probs)
        cv_scores.append(roc_auc)

        # Tell Optuna the fold's intermediate value for pruning
        trial.report(roc_auc, step=len(cv_scores))

        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(cv_scores)


In [25]:
study = optuna.create_study(direction='maximize',
                            sampler=TPESampler(seed=SEED),
                            pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=5))
study.optimize(objective, n_trials=60, show_progress_bar=True)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2025-11-21 13:07:55,427] A new study created in memory with name: no-name-64b09527-0c60-4e8b-a898-e64a471808ad
Best trial: 0. Best value: 0.979636:   2%|▏         | 1/60 [01:09<1:08:34, 69.74s/it]

[I 2025-11-21 13:09:05,175] Trial 0 finished with value: 0.9796362098304767 and parameters: {'iterations': 937, 'learning_rate': 0.20218499516556737, 'depth': 10, 'l2_leaf_reg': 0.09321419094969498, 'bootstrap_type': 'Bayesian', 'random_strength': 2.517772329704955, 'leaf_estimation_iterations': 7, 'grow_policy': 'Lossguide', 'bagging_temperature': 8.324426408004218}. Best is trial 0 with value: 0.9796362098304767.


Best trial: 0. Best value: 0.979636:   3%|▎         | 2/60 [01:17<32:22, 33.50s/it]  

[I 2025-11-21 13:09:13,307] Trial 1 finished with value: 0.9485928838120212 and parameters: {'iterations': 661, 'learning_rate': 0.00042877302789935055, 'depth': 4, 'l2_leaf_reg': 0.000335369825388289, 'bootstrap_type': 'Bayesian', 'random_strength': 0.008596919754886988, 'leaf_estimation_iterations': 2, 'grow_policy': 'Lossguide', 'bagging_temperature': 7.851759613930136}. Best is trial 0 with value: 0.9796362098304767.


Best trial: 0. Best value: 0.979636:   5%|▌         | 3/60 [01:53<32:52, 34.60s/it]

[I 2025-11-21 13:09:49,217] Trial 2 finished with value: 0.9684219089761481 and parameters: {'iterations': 639, 'learning_rate': 0.006138404389007993, 'depth': 8, 'l2_leaf_reg': 2.4298880728901746e-06, 'bootstrap_type': 'Bayesian', 'random_strength': 15.966596985879704, 'leaf_estimation_iterations': 10, 'grow_policy': 'SymmetricTree', 'bagging_temperature': 6.842330265121569}. Best is trial 0 with value: 0.9796362098304767.


Best trial: 0. Best value: 0.979636:   7%|▋         | 4/60 [02:18<28:35, 30.63s/it]

[I 2025-11-21 13:10:13,766] Trial 3 finished with value: 0.9572752115439276 and parameters: {'iterations': 1048, 'learning_rate': 0.0002656695256115113, 'depth': 7, 'l2_leaf_reg': 1.9295682537564475e-06, 'bootstrap_type': 'Bayesian', 'random_strength': 1.0550581023438787e-05, 'leaf_estimation_iterations': 6, 'grow_policy': 'Lossguide', 'bagging_temperature': 7.7513282336111455}. Best is trial 0 with value: 0.9796362098304767.


Best trial: 4. Best value: 0.984826:   8%|▊         | 5/60 [04:55<1:09:45, 76.10s/it]

[I 2025-11-21 13:12:50,472] Trial 4 finished with value: 0.9848262670075094 and parameters: {'iterations': 1898, 'learning_rate': 0.12924781191008258, 'depth': 8, 'l2_leaf_reg': 44.92681975270397, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.4301089334980203e-05, 'leaf_estimation_iterations': 4, 'grow_policy': 'Depthwise', 'subsample': 0.6404672548436904}. Best is trial 4 with value: 0.9848262670075094.


Best trial: 4. Best value: 0.984826:  10%|█         | 6/60 [05:14<51:07, 56.80s/it]  

[I 2025-11-21 13:13:09,819] Trial 5 finished with value: 0.9381186714840151 and parameters: {'iterations': 1223, 'learning_rate': 0.0003090370987350513, 'depth': 11, 'l2_leaf_reg': 4.157642367177134e-06, 'bootstrap_type': 'Bayesian', 'random_strength': 1.131250667164831e-08, 'leaf_estimation_iterations': 9, 'grow_policy': 'Lossguide', 'bagging_temperature': 0.7404465173409036}. Best is trial 4 with value: 0.9848262670075094.


Best trial: 4. Best value: 0.984826:  12%|█▏        | 7/60 [12:28<2:39:14, 180.28s/it]

[I 2025-11-21 13:20:24,327] Trial 6 finished with value: 0.9710951568088504 and parameters: {'iterations': 909, 'learning_rate': 0.0002528661927134377, 'depth': 11, 'l2_leaf_reg': 0.14928509438211213, 'bootstrap_type': 'Bayesian', 'random_strength': 1.4254214381700476e-05, 'leaf_estimation_iterations': 8, 'grow_policy': 'Depthwise', 'bagging_temperature': 1.195942459383017}. Best is trial 4 with value: 0.9848262670075094.


Best trial: 7. Best value: 0.985631:  13%|█▎        | 8/60 [14:15<2:15:57, 156.87s/it]

[I 2025-11-21 13:22:11,059] Trial 7 finished with value: 0.9856314038377455 and parameters: {'iterations': 1513, 'learning_rate': 0.0441918174422801, 'depth': 8, 'l2_leaf_reg': 2.5108553076892166, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.7641655936744645e-08, 'leaf_estimation_iterations': 2, 'grow_policy': 'Depthwise', 'subsample': 0.7542853455823514}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  15%|█▌        | 9/60 [15:29<1:51:13, 130.85s/it]

[I 2025-11-21 13:23:24,703] Trial 8 finished with value: 0.959956857867374 and parameters: {'iterations': 1843, 'learning_rate': 0.0007359008568679796, 'depth': 7, 'l2_leaf_reg': 1.870047613518446, 'bootstrap_type': 'MVS', 'random_strength': 3.6616803473194845e-07, 'leaf_estimation_iterations': 10, 'grow_policy': 'Lossguide'}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  17%|█▋        | 10/60 [16:22<1:29:10, 107.00s/it]

[I 2025-11-21 13:24:18,298] Trial 9 finished with value: 0.9463089268328279 and parameters: {'iterations': 1667, 'learning_rate': 0.00044537590438912286, 'depth': 11, 'l2_leaf_reg': 0.029998264362773617, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.1678587390897415e-07, 'leaf_estimation_iterations': 3, 'grow_policy': 'Lossguide', 'subsample': 0.5034760652655954}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  18%|█▊        | 11/60 [16:37<1:04:25, 78.88s/it] 

[I 2025-11-21 13:24:33,435] Trial 10 finished with value: 0.9769063726251668 and parameters: {'iterations': 1408, 'learning_rate': 0.027136277554984444, 'depth': 3, 'l2_leaf_reg': 68.37077190467626, 'bootstrap_type': 'Bernoulli', 'random_strength': 0.005726169739058915, 'leaf_estimation_iterations': 1, 'grow_policy': 'Depthwise', 'subsample': 0.9405851998845339}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  20%|██        | 12/60 [19:18<1:22:54, 103.64s/it]

[I 2025-11-21 13:27:13,706] Trial 11 finished with value: 0.9841299281591974 and parameters: {'iterations': 1988, 'learning_rate': 0.2206613734912918, 'depth': 9, 'l2_leaf_reg': 76.15156126929588, 'bootstrap_type': 'Bernoulli', 'random_strength': 3.505864964777076e-05, 'leaf_estimation_iterations': 4, 'grow_policy': 'Depthwise', 'subsample': 0.6530356886489423}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  22%|██▏       | 13/60 [19:56<1:05:43, 83.90s/it] 

[I 2025-11-21 13:27:52,164] Trial 12 finished with value: 0.9855683585759021 and parameters: {'iterations': 1545, 'learning_rate': 0.03971422937558073, 'depth': 5, 'l2_leaf_reg': 3.6707235887575784, 'bootstrap_type': 'Bernoulli', 'random_strength': 6.96112532729741e-07, 'leaf_estimation_iterations': 4, 'grow_policy': 'Depthwise', 'subsample': 0.7406442464830113}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  23%|██▎       | 14/60 [20:39<54:46, 71.45s/it]  

[I 2025-11-21 13:28:34,856] Trial 13 finished with value: 0.9854444067584897 and parameters: {'iterations': 1511, 'learning_rate': 0.027955577511346797, 'depth': 5, 'l2_leaf_reg': 1.1676606494590083, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.7693093957393308e-08, 'leaf_estimation_iterations': 5, 'grow_policy': 'Depthwise', 'subsample': 0.8223368426340427}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  25%|██▌       | 15/60 [20:46<39:01, 52.03s/it]

[I 2025-11-21 13:28:41,882] Trial 14 finished with value: 0.9824046625366663 and parameters: {'iterations': 322, 'learning_rate': 0.04439543983489987, 'depth': 6, 'l2_leaf_reg': 0.0012543787935539235, 'bootstrap_type': 'MVS', 'random_strength': 4.803228806453797e-07, 'leaf_estimation_iterations': 1, 'grow_policy': 'SymmetricTree'}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  27%|██▋       | 16/60 [21:28<36:00, 49.10s/it]

[I 2025-11-21 13:29:24,174] Trial 15 finished with value: 0.9834767069794748 and parameters: {'iterations': 1332, 'learning_rate': 0.00850439943104333, 'depth': 5, 'l2_leaf_reg': 3.182143753455071, 'bootstrap_type': 'Bernoulli', 'random_strength': 0.0002241063432607491, 'leaf_estimation_iterations': 3, 'grow_policy': 'Depthwise', 'subsample': 0.8018147353828683}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  28%|██▊       | 17/60 [22:22<36:16, 50.62s/it]

[I 2025-11-21 13:30:18,328] Trial 16 finished with value: 0.985445946553746 and parameters: {'iterations': 1618, 'learning_rate': 0.0675467187221607, 'depth': 6, 'l2_leaf_reg': 9.23510798359542, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.1524480998090995e-06, 'leaf_estimation_iterations': 5, 'grow_policy': 'Depthwise', 'subsample': 0.7057550447166807}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  30%|███       | 18/60 [22:44<29:15, 41.80s/it]

[I 2025-11-21 13:30:39,590] Trial 17 pruned. 


Best trial: 7. Best value: 0.985631:  32%|███▏      | 19/60 [25:02<48:27, 70.91s/it]

[I 2025-11-21 13:32:58,326] Trial 18 finished with value: 0.9853657733263871 and parameters: {'iterations': 1500, 'learning_rate': 0.014157749726535995, 'depth': 9, 'l2_leaf_reg': 0.3494339961622941, 'bootstrap_type': 'Bernoulli', 'random_strength': 6.698871009073363e-08, 'leaf_estimation_iterations': 2, 'grow_policy': 'SymmetricTree', 'subsample': 0.8925815139053519}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  33%|███▎      | 20/60 [25:47<41:54, 62.87s/it]

[I 2025-11-21 13:33:42,457] Trial 19 finished with value: 0.9772721803581937 and parameters: {'iterations': 1273, 'learning_rate': 0.0024456590317430897, 'depth': 5, 'l2_leaf_reg': 9.893351718088447e-05, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.4590807765944266e-06, 'leaf_estimation_iterations': 6, 'grow_policy': 'Depthwise', 'subsample': 0.7491025122658856}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 7. Best value: 0.985631:  35%|███▌      | 21/60 [30:35<1:24:52, 130.58s/it]

[I 2025-11-21 13:38:30,890] Trial 20 finished with value: 0.9818618280621969 and parameters: {'iterations': 1071, 'learning_rate': 0.08476330633758122, 'depth': 12, 'l2_leaf_reg': 0.011708700771385227, 'bootstrap_type': 'Bernoulli', 'random_strength': 0.0002542411583008301, 'leaf_estimation_iterations': 4, 'grow_policy': 'Depthwise', 'subsample': 0.5717890721066277}. Best is trial 7 with value: 0.9856314038377455.


Best trial: 21. Best value: 0.98569:  37%|███▋      | 22/60 [31:37<1:09:44, 110.13s/it]

[I 2025-11-21 13:39:33,321] Trial 21 finished with value: 0.9856901965435482 and parameters: {'iterations': 1633, 'learning_rate': 0.07605130664806087, 'depth': 6, 'l2_leaf_reg': 15.723687325926248, 'bootstrap_type': 'Bernoulli', 'random_strength': 2.647973339582821e-06, 'leaf_estimation_iterations': 5, 'grow_policy': 'Depthwise', 'subsample': 0.7369741805748464}. Best is trial 21 with value: 0.9856901965435482.


Best trial: 21. Best value: 0.98569:  37%|███▋      | 22/60 [32:45<56:34, 89.34s/it]   

[W 2025-11-21 13:40:40,874] Trial 22 failed with parameters: {'iterations': 1552, 'learning_rate': 0.016314083354165232, 'depth': 6, 'l2_leaf_reg': 18.668039997526158, 'bootstrap_type': 'Bernoulli', 'random_strength': 2.5143262143155683e-06, 'leaf_estimation_iterations': 2, 'grow_policy': 'Depthwise', 'subsample': 0.8132142570632755} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/Users/winston/Documents/School/Y3S1/BT4012/Group Project/.venv/lib/python3.13/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/tg/0n0rjzh11yd7zhlhtlf2g84r0000gn/T/ipykernel_5131/2823602544.py", line 59, in objective
    model.fit(
    ~~~~~~~~~^
        X_train, y_train,
        ^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        verbose=False
        ^^^^^^^^^^^^^
    )
    ^
  File "/Users/winston/Documents/School/Y3S1/BT4012/Group Project/.venv/lib/python3.13/site-packages/catboost/core.py",




KeyboardInterrupt: 

In [None]:
best_params = study.best_params
best_params['random_seed'] = SEED
best_params['verbose'] = 0
best_params['task_type'] = 'CPU'

# Derive TF-IDF and SVD best params from Optuna study
best_tfidf_max = best_params.get('max_features', 5000)
best_ngram_min = best_params.get('ngram_min', 3)
best_ngram_max = best_params.get('ngram_max', 5)
best_ngram_range = (best_ngram_min, best_ngram_max)

best_n_svd_components = best_params.get('n_svd_components', 100)

# Filter out text processing params so we only pass model params to CatBoost
model_param_keys = [k for k in best_params.keys() if k not in ['max_features', 'ngram_min', 'ngram_max', 'n_svd_components']]
catboost_model_params = {k: best_params[k] for k in model_param_keys}

optuna_info = {
    "n_trials": 60,
    "best_params": study.best_params,
    "best_value": study.best_value,
    "study_path": "optuna_study.pkl",
    "tfidf_max_features": best_tfidf_max,
    "tfidf_ngram_range": best_ngram_range,
    "n_svd_components": best_n_svd_components
}

print("Running final experiment with best parameters...")
run_tree_experiment(
    CatBoostClassifier, 
    "CatBoost_Optuna", 
    catboost_model_params, 
    "exp_2_catboost_optuna", 
    X_train=X_train_df, 
    y_train=y_train, 
    X_test=X_test_df, 
    numeric_features=non_text_cols, 
    text_feature='url', 
    save_model=SAVE_MODELS,
    optuna_study=study,
    optuna_params=optuna_info,
    n_svd_components=best_n_svd_components,
    tfidf_max_features=best_tfidf_max,
    tfidf_ngram_range=best_ngram_range
)
