# Phishing URL Tree-Based Model Experiments

This notebook explores various tree-based models using the Kaggle phishing URL dataset.

In increasing order of complexity, we will experiment with:

1. Random Forest
2. XGBoost
3. LightGBM
4. CatBoost

## Setup and Imports

In [None]:
# Standard libraries: data handling, numerics and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings raised after this point to keep the cell output readable.
# NOTE(review): this also hides deprecation/convergence warnings from the model
# libraries imported below - relax the filter when debugging.
warnings.filterwarnings('ignore')

# Scikit-learn: CV splitter, evaluation metrics and the Random Forest baseline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report)
from sklearn.ensemble import RandomForestClassifier

# Gradient-boosted tree models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Import ModelSaver
# The current working directory is appended to sys.path first so that the
# project-local `save_model` module can be imported.
import sys
import os
sys.path.append(os.path.abspath('.'))
from save_model import ModelSaver

# Display settings: show every DataFrame column and apply a shared plot style/palette
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [None]:
# Configuration
SAVE_MODELS = True  # passed as `save_model` to every experiment run below
SEED = 42           # shared seed for NumPy, the CV splitter and the models
np.random.seed(SEED)

# Check for Google Drive (if running in Colab).
# `drive_root` is always bound (None outside Colab) so that referencing it can
# never raise NameError; `use_drive` remains the flag callers should test.
use_drive = False
drive_root = None
try:
    from google.colab import drive
    drive.mount('/content/drive')
    use_drive = True
    drive_root = '/content/drive/MyDrive/fraud-grp-proj/'
except ImportError:
    # Not running in Colab: keep the local-filesystem defaults set above.
    pass

In [None]:
# Read the raw train/test splits, then their feature-engineered counterparts
train_df, test_df = (
    pd.read_csv('dataset/train.csv'),
    pd.read_csv('dataset/test.csv'),
)
train_w_features_df, test_w_features_df = (
    pd.read_csv('dataset/df_train_feature_engineered.csv'),
    pd.read_csv('dataset/df_test_feature_engineered.csv'),
)

# Report the shape of every frame as a quick sanity check
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)

print(
    f"Train with features shape: {train_w_features_df.shape}",
    f"Test with features shape: {test_w_features_df.shape}",
    sep="\n",
)

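Due to the robust nature of tree-based models, we use the full engineered feature set here, unlike our approach for linear and neural network models. Note that the next cell still drops the original version of every feature that has a transformed (log, squared, is_zero or bucketed) counterpart, so only the remaining original features are kept alongside the transformed ones.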

In [None]:
# Remove, in place, each raw column that has an engineered replacement.
# The groups are dropped one after another, in the order listed.
for superseded_cols in (
    # originals of log transformed features
    ['length_url', 'length_path', 'ratio_hostname_url', 'length_words_url',
     'avg_word_hostname', 'num_unique_chars_hostname'],
    # originals of squared transformed features
    ['ratio_letter_url', 'entropy_hostname'],
    # originals of is_zero transformed features
    ['num_hyphens_domain', 'length_subdomains', 'num_hyphens', 'num_at',
     'num_question_marks', 'num_and', 'num_equal', 'num_percent',
     'ratio_digits_url', 'ratio_digits_hostname', 'avg_word_path',
     'length_query'],
    # originals of bucketed transformed features
    ['num_subdomain', 'length_tld', 'path_depth'],
):
    train_w_features_df.drop(columns=superseded_cols, inplace=True)

# Model inputs: every remaining numeric training column except the label
numeric_cols = [
    col
    for col in train_w_features_df.select_dtypes(include=[np.number]).columns
    if col != 'target'
]

X = train_w_features_df[numeric_cols].values
y = train_w_features_df['target'].values

# The test matrix is built from the same column list as the training matrix;
# an all-zero placeholder stands in for the labels when the test file has none.
X_test = test_w_features_df[numeric_cols].values
y_test = (
    test_w_features_df['target'].values
    if 'target' in test_w_features_df.columns
    else np.zeros(len(test_w_features_df))
)

print(f"Features used: {len(numeric_cols)}")

## Training Models

Now let's move on to training the models. We use the `ModelSaver` utility to standardize how metrics and models are stored for later evaluation.

In [None]:
def _fold_metrics(fold, y_true, probs, train_size, val_size):
    """Build the metrics dictionary for one validation fold.

    Probabilities are thresholded at 0.5 (strictly greater) to obtain hard
    predictions; ROC AUC is computed from the raw probabilities.
    """
    preds = (probs > 0.5).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail
    # when a fold's labels and predictions happen to contain a single class.
    # Assumes the target is encoded as 0/1 - TODO confirm against the dataset.
    tn, fp, fn, tp = confusion_matrix(y_true, preds, labels=[0, 1]).ravel()

    return {
        'fold': fold,
        'accuracy': accuracy_score(y_true, preds),
        'precision': precision_score(y_true, preds, zero_division=0),
        'recall': recall_score(y_true, preds, zero_division=0),
        'f1': f1_score(y_true, preds, zero_division=0),
        'roc_auc': roc_auc_score(y_true, probs),
        'TP': int(tp),
        'FP': int(fp),
        'TN': int(tn),
        'FN': int(fn),
        'train_size': train_size,
        'val_size': val_size
    }


def run_tree_experiment(model_class, model_name, model_params, experiment_name, save_model=True, n_folds=5):
    """Cross-validate a classifier on the notebook-level data and optionally save it.

    Reads the module-level `X`, `y`, `X_test`, `numeric_cols` and `SEED`, and
    (only when saving) `use_drive` / `drive_root`.

    Args:
        model_class: Estimator class; instantiated as `model_class(**model_params)`
            once per fold and expected to provide `fit` and `predict_proba`.
        model_name: Label printed in the log and recorded as the model type.
        model_params: Keyword arguments for `model_class`.
        experiment_name: Name passed to `ModelSaver.start_experiment`.
        save_model: When true, every fold's model, metrics and test-set
            probabilities are handed to a `ModelSaver`.
        n_folds: Number of stratified CV folds (default 5, the previously
            hard-coded value).

    Returns:
        The model fitted on the last fold.
    """
    print(f"\n=== Running Experiment: {experiment_name} ({model_name}) ===")
    print(f"Saving Model: {save_model}")

    saver = None
    if save_model:
        # Store under the mounted Drive folder when available, else locally.
        base_path = (drive_root + "experiments") if use_drive else "experiments"
        saver = ModelSaver(base_path=base_path)
        saver.start_experiment(
            experiment_name=experiment_name,
            model_type=model_name,
            vectorizer="None (Numeric Features)",
            vectorizer_params={},
            model_params=model_params,
            n_folds=n_folds,
            save_format="pickle"
        )

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"\n--- Fold {fold}/{n_folds} ---")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # A fresh model per fold so that no state carries over between folds
        model = model_class(**model_params)
        model.fit(X_train, y_train)

        # Positive-class probabilities on the held-out fold
        val_probs = model.predict_proba(X_val)[:, 1]
        metrics = _fold_metrics(fold, y_val, val_probs, len(train_idx), len(val_idx))

        print(f"Fold {fold} Val AUC: {metrics['roc_auc']:.4f}")

        # Test-set probabilities are only persisted, so skip the work otherwise
        if saver is not None:
            test_probs = model.predict_proba(X_test)[:, 1]
            saver.add_fold(
                fold_model=model,
                fold_metric=metrics,
                test_predictions=test_probs,
                feature_names=numeric_cols
            )

    if saver is not None:
        saver.finalize_experiment()
        # NOTE(review): reads a private ModelSaver attribute; switch to a public
        # accessor if ModelSaver offers one.
        print(f"Experiment saved to {saver._exp_dir}")

    return model

### 1. Random Forest

We use the hyperparameters identified in previous experiments.

In [None]:
# Random Forest settings carried over from the previous experiments;
# the forest is seeded with the notebook-wide SEED.
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "max_features": "sqrt",
    "random_state": SEED,
    "n_jobs": -1,
    "verbose": 0,
}

run_tree_experiment(
    model_class=RandomForestClassifier,
    model_name="RandomForest",
    model_params=rf_params,
    experiment_name="exp_2_random_forest",
    save_model=SAVE_MODELS,
)

### 2. XGBoost

We use the optimized hyperparameters from the legacy model.

In [None]:
# XGBoost settings taken from the legacy model's optimized hyperparameters;
# the booster is seeded with the notebook-wide SEED.
xgb_params = {
    "n_estimators": 400,
    "learning_rate": 0.01818965322291987,
    "max_depth": 12,
    "subsample": 0.9089044013517584,
    "colsample_bytree": 0.5968303772495912,
    "reg_alpha": 0.34341485605720035,
    "reg_lambda": 1.7747160863049662,
    "min_child_weight": 4,
    "gamma": 0.5508393571724655,
    "random_state": SEED,
    "eval_metric": "logloss",
    "n_jobs": -1,
}

run_tree_experiment(
    model_class=XGBClassifier,
    model_name="XGBoost",
    model_params=xgb_params,
    experiment_name="exp_2_xgboost",
    save_model=SAVE_MODELS,
)

### 3. LightGBM

We use the optimized hyperparameters for LightGBM.

In [None]:
# LightGBM settings (the optimized hyperparameters for this model);
# the booster is seeded with the notebook-wide SEED.
lgbm_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 10,
    "num_leaves": 31,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_samples": 20,
    "random_state": SEED,
    "n_jobs": -1,
    "verbose": -1,
}

run_tree_experiment(
    model_class=LGBMClassifier,
    model_name="LightGBM",
    model_params=lgbm_params,
    experiment_name="exp_2_lightgbm",
    save_model=SAVE_MODELS,
)

### 4. CatBoost

We use the optimized hyperparameters for CatBoost.

In [None]:
# CatBoost settings (the optimized hyperparameters for this model);
# the booster is seeded with the notebook-wide SEED.
catboost_params = {
    "iterations": 500,
    "learning_rate": 0.05,
    "depth": 8,
    "l2_leaf_reg": 3,
    "subsample": 0.8,
    "random_seed": SEED,
    "verbose": 0,
    "eval_metric": "AUC",
    "task_type": "CPU",
}

run_tree_experiment(
    model_class=CatBoostClassifier,
    model_name="CatBoost",
    model_params=catboost_params,
    experiment_name="exp_2_catboost",
    save_model=SAVE_MODELS,
)