# Phishing URL Tree-Based Model Experiments

This notebook explores various tree-based models using the Kaggle phishing URL dataset.

For the tree-based models, we will be experimenting with:

1. Random Forest
2. XGBoost
3. LightGBM
4. CatBoost

## Setup and Imports

In [1]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report)
from sklearn.ensemble import RandomForestClassifier

# Tree-based models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Import ModelSaver
import sys
import os
sys.path.append(os.path.abspath('.'))
from save_model import ModelSaver

# Display settings
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [2]:
# Configuration
SAVE_MODELS = True
SEED = 42
np.random.seed(SEED)

# Check for Google Drive (if running in Colab)
use_drive = False
try:
    from google.colab import drive
    drive.mount('/content/drive')
    use_drive = True
    drive_root = '/content/drive/MyDrive/fraud-grp-proj/'
except ImportError:
    pass

In [3]:
# Load train and test datasets
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

print(f"Train with features shape: {train_w_features_df.shape}")
print(f"Test with features shape: {test_w_features_df.shape}")

Train shape: (9143, 2)
Test shape: (2286, 2)
Train with features shape: (9143, 78)
Test with features shape: (2286, 78)


Due to the robust nature of tree-based models, we will just be using the full feature set including originals and transformed features, unlike our approach for linear and neural network models.

In [4]:
# Prepare X and y
non_text_cols = train_w_features_df.select_dtypes(exclude=[object]).columns.tolist()
if 'target' in non_text_cols:
    non_text_cols.remove('target')

X = train_w_features_df[non_text_cols].values
y = train_w_features_df['target'].values

X_test = test_w_features_df[non_text_cols].values
if 'target' in test_w_features_df.columns:
    y_test = test_w_features_df['target'].values
else:
    y_test = np.zeros(len(test_w_features_df))

print(f"Features used: {len(non_text_cols)}")

Features used: 72


## Training Models

Now lets move on to training the models. We use the `ModelSaver` utility to help us standardize the storing of metrics and models for evaluation later on.

In [5]:
def run_tree_experiment(model_class, model_name, model_params, experiment_name, save_model=True):
    print(f"\n=== Running Experiment: {experiment_name} ({model_name}) ===")
    print(f"Saving Model: {save_model}")

    saver = None
    if save_model:
        if use_drive:
            base_path = drive_root + "experiments"
        else:
            base_path = "experiments"
        saver = ModelSaver(base_path=base_path)
        saver.start_experiment(
            experiment_name=experiment_name,
            model_type=model_name,
            vectorizer="None (Numeric Features)",
            vectorizer_params={},
            model_params=model_params,
            n_folds=5,
            save_format="pickle"
        )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    fold_test_preds = []
    
    # Store feature names for importance analysis
    feature_names = non_text_cols

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"\n--- Fold {fold}/5 ---")
        
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Initialize and train model
        model = model_class(**model_params)
        model.fit(X_train, y_train)

        # Validation predictions
        val_probs = model.predict_proba(X_val)[:, 1]
        val_preds = (val_probs > 0.5).astype(int)

        # Calculate metrics
        tn, fp, fn, tp = confusion_matrix(y_val, val_preds).ravel()
        
        metrics = {
            'fold': fold,
            'accuracy': accuracy_score(y_val, val_preds),
            'precision': precision_score(y_val, val_preds, zero_division=0),
            'recall': recall_score(y_val, val_preds, zero_division=0),
            'f1': f1_score(y_val, val_preds, zero_division=0),
            'roc_auc': roc_auc_score(y_val, val_probs),
            'TP': int(tp),
            'FP': int(fp),
            'TN': int(tn),
            'FN': int(fn),
            'train_size': len(train_idx),
            'val_size': len(val_idx)
        }
        
        print(f"Fold {fold} Val AUC: {metrics['roc_auc']:.4f}")

        # Test predictions
        test_probs = model.predict_proba(X_test)[:, 1]
        fold_test_preds.append(test_probs)

        if save_model and saver:
            saver.add_fold(
                fold_model=model,
                fold_metric=metrics,
                test_predictions=test_probs,
                feature_names=feature_names
            )

    if save_model and saver:
        saver.finalize_experiment()
        print(f"Experiment saved to {saver._exp_dir}")

    return model

### 1. Random Forest

In [14]:
# Default params
rf_params = {
    'random_state': SEED,
    'verbose': 0
}

run_tree_experiment(RandomForestClassifier, "RandomForest", rf_params, "exp_2_random_forest", save_model=SAVE_MODELS)


=== Running Experiment: exp_2_random_forest (RandomForest) ===
Saving Model: True
Experiment 'exp_2_random_forest' initialized at: experiments/exp_2_random_forest
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9739
  Fold 1/5 saved | ROC AUC: 0.9739

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9716
  Fold 2/5 saved | ROC AUC: 0.9716

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9705
  Fold 3/5 saved | ROC AUC: 0.9705

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9700
  Fold 4/5 saved | ROC AUC: 0.9700

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9707
  Fold 5/5 saved | ROC AUC: 0.9707

Finalizing experiment...
  Predictions saved to experiments/exp_2_random_forest/exp_2_random_forest_prediction.csv

✓ Experiment 'exp_2_random_forest' finalized!
  Location: experiments/exp_2_random_forest
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9739)
  Average ROC AUC: 0.9713 ± 0.0014
Experiment saved to experiments/exp_2_random_forest


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### 2. XGBoost

In [15]:
# Default params
xgb_params = {
    'random_state': SEED,
}

run_tree_experiment(XGBClassifier, "XGBoost", xgb_params, "exp_2_xgboost", save_model=SAVE_MODELS)


=== Running Experiment: exp_2_xgboost (XGBoost) ===
Saving Model: True
Experiment 'exp_2_xgboost' initialized at: experiments/exp_2_xgboost
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9755
  Fold 1/5 saved | ROC AUC: 0.9755

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9746
  Fold 2/5 saved | ROC AUC: 0.9746

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9745
  Fold 3/5 saved | ROC AUC: 0.9745

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9729
  Fold 4/5 saved | ROC AUC: 0.9729

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9737
  Fold 5/5 saved | ROC AUC: 0.9737

Finalizing experiment...
  Predictions saved to experiments/exp_2_xgboost/exp_2_xgboost_prediction.csv

✓ Experiment 'exp_2_xgboost' finalized!
  Location: experiments/exp_2_xgboost
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9755)
  Average ROC AUC: 0.9742 ± 0.0009
Experiment saved to experiments/exp_2_xgboost


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


### 3. LightGBM

In [16]:
# Default params
lgbm_params = {
    'random_state': SEED,
    'verbose': -1
}

run_tree_experiment(LGBMClassifier, "LightGBM", lgbm_params, "exp_2_lightgbm", save_model=SAVE_MODELS)


=== Running Experiment: exp_2_lightgbm (LightGBM) ===
Saving Model: True
Experiment 'exp_2_lightgbm' initialized at: experiments/exp_2_lightgbm
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9747
  Fold 1/5 saved | ROC AUC: 0.9747

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9753
  Fold 2/5 saved | ROC AUC: 0.9753

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9726
  Fold 3/5 saved | ROC AUC: 0.9726

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9721
  Fold 4/5 saved | ROC AUC: 0.9721

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9719
  Fold 5/5 saved | ROC AUC: 0.9719

Finalizing experiment...
  Predictions saved to experiments/exp_2_lightgbm/exp_2_lightgbm_prediction.csv

✓ Experiment 'exp_2_lightgbm' finalized!
  Location: experiments/exp_2_lightgbm
  Folds completed: 5
  Best fold: 2 (ROC AUC: 0.9753)
  Average ROC AUC: 0.9733 ± 0.0014
Experiment saved to experiments/exp_2_lightgbm


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


### 4. CatBoost

In [17]:
# Default params
catboost_params = {
    'random_seed': SEED,
    'verbose': 0,
    'task_type': 'CPU'
}

run_tree_experiment(CatBoostClassifier, "CatBoost", catboost_params, "exp_2_catboost", save_model=SAVE_MODELS)


=== Running Experiment: exp_2_catboost (CatBoost) ===
Saving Model: True
Experiment 'exp_2_catboost' initialized at: experiments/exp_2_catboost
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---
Fold 1 Val AUC: 0.9774
  Fold 1/5 saved | ROC AUC: 0.9774

--- Fold 2/5 ---
Fold 2 Val AUC: 0.9767
  Fold 2/5 saved | ROC AUC: 0.9767

--- Fold 3/5 ---
Fold 3 Val AUC: 0.9778
  Fold 3/5 saved | ROC AUC: 0.9778

--- Fold 4/5 ---
Fold 4 Val AUC: 0.9785
  Fold 4/5 saved | ROC AUC: 0.9785

--- Fold 5/5 ---
Fold 5 Val AUC: 0.9758
  Fold 5/5 saved | ROC AUC: 0.9758

Finalizing experiment...
  Predictions saved to experiments/exp_2_catboost/exp_2_catboost_prediction.csv

✓ Experiment 'exp_2_catboost' finalized!
  Location: experiments/exp_2_catboost
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9785)
  Average ROC AUC: 0.9773 ± 0.0009
Experiment saved to experiments/exp_2_catboost


<catboost.core.CatBoostClassifier at 0x151497b10>