# Phishing URL Linear Model Experiments

This notebook explores various linear models using the Kaggle phishing URL dataset.

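In increasing order of complexity, we will experiment with:

1. Logistic Regression on the engineered numeric features
2. Logistic Regression on character n-gram TF-IDF features of the raw URL
3. Logistic Regression on the combined numeric + TF-IDF features
4. A linear-kernel SVM on the combined features
5. A calibrated LinearSVC on the combined features, tuned with Optuna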

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Optuna
import optuna

# Set seed for reproducibility
# This seeds NumPy's legacy global RNG only; the estimators and CV splitters
# in this notebook receive their own `random_state` argument.
np.random.seed(42)

# Display settings
# Render every DataFrame column instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# The matplotlib style is applied before the seaborn palette is set.
# NOTE(review): the order looks deliberate (the style sheet may set its own
# colour cycle) - confirm before reordering these two calls.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [None]:
# Read the raw URL data and the feature-engineered data for both splits
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')
train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

# Report the dimensions of each frame as a quick sanity check
for _label, _frame in (
    ("Train", train_df),
    ("Test", test_df),
    ("Train with features", train_w_features_df),
    ("Test with features", test_w_features_df),
):
    print(f"{_label} shape: {_frame.shape}")

In [None]:
# Display the column index of the feature-engineered training frame
train_w_features_df.columns

Following the EDA, we keep the transformed versions of the features (log, squared, is-zero and bucketed) and drop their original counterparts, since linear models are sensitive to feature scale and skew.

In [None]:
# Original (untransformed) columns to remove, grouped by the transformation
# whose output replaces them.
_original_cols_by_transform = {
    # Drop original versions of log transformed features
    'log': ['length_url', 'length_path', 'ratio_hostname_url', 'length_words_url',
            'avg_word_hostname', 'num_unique_chars_hostname'],
    # Drop original versions of squared transformed features
    'squared': ['ratio_letter_url', 'entropy_hostname'],
    # Drop original versions of is_zero transformed features
    'is_zero': ['num_hyphens_domain', 'length_subdomains', 'num_hyphens', 'num_at',
                'num_question_marks', 'num_and', 'num_equal', 'num_percent',
                'ratio_digits_url', 'ratio_digits_hostname', 'avg_word_path', 'length_query'],
    # Drop original versions of bucketed transformed features
    'bucketed': ['num_subdomain', 'length_tld', 'path_depth'],
}
_cols_to_drop = [col for group in _original_cols_by_transform.values() for col in group]

# Drop only the columns that are still present and rebind the name instead of
# mutating in place. The previous `drop(..., inplace=True)` calls raised a
# KeyError when the cell was executed a second time; this version is re-runnable.
_cols_present = [col for col in _cols_to_drop if col in train_w_features_df.columns]
train_w_features_df = train_w_features_df.drop(columns=_cols_present)
print(f"Dropped {len(_cols_present)} original columns "
      f"({len(_cols_to_drop) - len(_cols_present)} already absent).")

# Check final columns
train_w_features_df.columns

## Training Models

Now let's move on to training the models. We use the `ModelSaver` class to standardize how metrics and models are stored for later evaluation.

In [None]:
# Import ModelSaver
import sys
import os
sys.path.append(os.path.abspath('.'))
from save_model import ModelSaver

# Experiment-wide configuration
RANDOM_STATE = 42   # seed handed to the CV splitter and the estimators
N_FOLDS = 5         # number of stratified cross-validation folds
SAVE_MODELS = True  # NOTE(review): not referenced by the cells visible here

# Report the operating-system platform the kernel runs on. This is not a
# compute-device check; it is printed purely for the run log.
_platform_name = sys.platform
print(f"Running on: {_platform_name}")

In [None]:
# --- Data Preparation ---

# 1. Prepare Numeric Features
# Select numeric and boolean columns and exclude target
numeric_cols = train_w_features_df.select_dtypes(include=[np.number, bool]).columns.tolist()
if 'target' in numeric_cols:
    numeric_cols.remove('target')

print(f"Selected {len(numeric_cols)} numeric/boolean features.")

# Fail fast with a readable message if the test frame lacks a selected feature.
_missing_in_test = [col for col in numeric_cols if col not in test_w_features_df.columns]
assert not _missing_in_test, f"Test features are missing columns: {_missing_in_test}"

# URLs are read from the raw frames while the numeric features and the target
# come from the feature-engineered frames; the two are paired purely by row
# position. Only the row counts can be verified here (not the row order), but
# a mismatch would otherwise surface much later as an obscure error inside
# cross-validation.
assert len(train_df) == len(train_w_features_df), (
    f"Row count mismatch: train_df has {len(train_df)} rows, "
    f"train_w_features_df has {len(train_w_features_df)}"
)
assert len(test_df) == len(test_w_features_df), (
    f"Row count mismatch: test_df has {len(test_df)} rows, "
    f"test_w_features_df has {len(test_w_features_df)}"
)

# Cast everything to float so boolean columns become 0.0/1.0 for the model
X_numeric = train_w_features_df[numeric_cols].astype(float).values
y = train_w_features_df['target'].values

# Prepare Test Data for Numeric (same columns, same order as training)
X_numeric_test = test_w_features_df[numeric_cols].astype(float).values

# 2. Prepare Text Features (URLs)
X_text = train_df['url'].values
X_text_test = test_df['url'].values

# Check shapes
print(f"Numeric Train Shape: {X_numeric.shape}")
print(f"Numeric Test Shape: {X_numeric_test.shape}")
print(f"Text Train Shape: {X_text.shape}")
print(f"Text Test Shape: {X_text_test.shape}")
print(f"Target Shape: {y.shape}")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def calculate_metrics(y_true, y_pred_proba, threshold=0.5):
    """Calculate standard metrics for binary classification.

    Args:
        y_true: Array-like of ground-truth labels encoded as 0/1.
        y_pred_proba: Array-like of predicted scores/probabilities for the
            positive class (label 1).
        threshold: Scores greater than or equal to this value are predicted
            as class 1.

    Returns:
        dict with 'accuracy', 'precision', 'recall', 'f1', 'roc_auc' and the
        confusion-matrix counts 'TP', 'FP', 'TN', 'FN' as plain ints.
        'roc_auc' is NaN when `y_true` contains a single class, because the
        metric is undefined in that case.
    """
    # Accept plain lists as well as arrays; the comparison below needs an ndarray.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba, dtype=float)
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Pin the label set so the matrix is always 2x2. Without `labels`, a sample
    # in which only one class occurs yields a 1x1 matrix and the four-way
    # unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # ROC AUC is undefined with a single class in y_true; report NaN instead
    # of aborting the whole metrics computation.
    if np.unique(y_true).size > 1:
        roc_auc = roc_auc_score(y_true, y_pred_proba)
    else:
        roc_auc = float('nan')

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc,
        'TP': int(tp), 'FP': int(fp), 'TN': int(tn), 'FN': int(fn)
    }

def run_cv_experiment(X, y, X_test, pipeline_creator, experiment_name, model_name, vectorizer_name, params, feature_names_func=None):
    """
    Run a stratified cross-validation experiment and save results using ModelSaver.

    For every fold a fresh pipeline is fitted on the training part, scored on
    the validation part, and used to predict the full test set; the fitted
    pipeline, the validation metrics and the test predictions are handed to
    the saver.

    Args:
        X: Training features (NumPy array or pandas DataFrame)
        y: Training targets (array-like; converted to a NumPy array)
        X_test: Test features
        pipeline_creator: Function that returns a fresh sklearn Pipeline
        experiment_name: Name of the experiment for saving
        model_name: Name of the model type
        vectorizer_name: Name of the vectorizer/feature set
        params: Dictionary containing 'model_params' and 'vectorizer_params'
            (missing keys, or None, are recorded as empty dicts)
        feature_names_func: Optional function to extract feature names from fitted pipeline

    Returns:
        None. Results are persisted through ModelSaver.
    """
    print(f"\n=== Running Experiment: {experiment_name} ===")

    params = params or {}

    # The splitter yields positional indices. X is indexed positionally via
    # .iloc below, but `y[train_idx]` on a pandas Series is label-based and
    # would pick the wrong rows (or raise) for a non-default index, so work
    # on a plain ndarray.
    y = np.asarray(y)

    saver = ModelSaver(base_path="experiments")
    saver.start_experiment(
        experiment_name=experiment_name,
        model_type=model_name,
        vectorizer=vectorizer_name,
        vectorizer_params=params.get('vectorizer_params', {}),
        model_params=params.get('model_params', {}),
        n_folds=N_FOLDS
    )

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"Fold {fold}/{N_FOLDS}")

        # Split data (DataFrames need positional .iloc, arrays use plain indexing)
        if hasattr(X, "iloc"):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        else:
            X_train_fold, X_val_fold = X[train_idx], X[val_idx]

        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Create and fit pipeline
        pipeline = pipeline_creator()
        pipeline.fit(X_train_fold, y_train_fold)

        # Validation metrics (column 1 holds the positive-class probability)
        val_probs = pipeline.predict_proba(X_val_fold)[:, 1]
        val_metrics = calculate_metrics(y_val_fold, val_probs)
        val_metrics['fold'] = fold

        print(f"  Val AUC: {val_metrics['roc_auc']:.4f}")

        # Test predictions (for ensemble later)
        test_probs = pipeline.predict_proba(X_test)[:, 1]

        # Feature names are best-effort metadata: a failure here is reported
        # but must not abort the experiment.
        feature_names = None
        if feature_names_func:
            try:
                feature_names = feature_names_func(pipeline)
            except Exception as e:
                print(f"  Could not extract feature names: {e}")

        saver.add_fold(
            fold_model=pipeline,
            fold_metric=val_metrics,
            test_predictions=test_probs,
            feature_names=feature_names
        )

    saver.finalize_experiment()
    # NOTE(review): reads a private attribute of ModelSaver; switch to a public
    # accessor if the class offers one.
    print(f"Experiment saved to {saver._exp_dir}")

### 1. Logistic Regression (Engineered Numeric Features)

We first test a simple Logistic Regression model using only the manually engineered numeric features.


In [None]:
def create_numeric_lr_pipeline():
    """Build a fresh, unfitted pipeline: standard scaling followed by logistic regression."""
    scaling_step = ('scaler', StandardScaler())
    classifier_step = (
        'clf',
        LogisticRegression(solver='lbfgs', max_iter=1000, random_state=RANDOM_STATE),
    )
    return Pipeline([scaling_step, classifier_step])

def get_numeric_feature_names(pipeline):
    """Return the names of the numeric features fed to the numeric pipeline.

    Args:
        pipeline: Fitted pipeline. Unused; accepted so the function matches the
            `feature_names_func(pipeline)` signature expected by `run_cv_experiment`.

    Returns:
        A new list holding the entries of the module-level `numeric_cols`.
    """
    # Return a copy: `numeric_cols` is shared module state that also selects the
    # model input columns, so callers must not be able to mutate it.
    return list(numeric_cols)

# Parameters recorded with the experiment (mirrors create_numeric_lr_pipeline)
numeric_params = dict(
    model_params=dict(max_iter=1000, solver='lbfgs'),
    vectorizer_params=dict(type='StandardScaler'),
)

# Cross-validate logistic regression on the engineered numeric features
run_cv_experiment(
    X_numeric,
    y,
    X_numeric_test,
    create_numeric_lr_pipeline,
    experiment_name="exp_1_numeric_lr",
    model_name="LogisticRegression",
    vectorizer_name="NumericFeatures",
    params=numeric_params,
    feature_names_func=get_numeric_feature_names,
)

### 2. Logistic Regression (TF-IDF Features)

Next, we test Logistic Regression using TF-IDF features extracted directly from the URL strings. We use character n-grams to capture patterns in the URL structure.


In [None]:
def create_tfidf_lr_pipeline():
    """Build a fresh, unfitted pipeline: character 3-5-gram TF-IDF followed by logistic regression."""
    url_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)
    classifier = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    return Pipeline([('tfidf', url_vectorizer), ('clf', classifier)])

def get_tfidf_feature_names(pipeline):
    """Return the feature names of the pipeline's fitted 'tfidf' step as a plain list."""
    vectorizer = pipeline.named_steps['tfidf']
    feature_names = vectorizer.get_feature_names_out()
    return feature_names.tolist()

# Parameters recorded with the experiment (mirrors create_tfidf_lr_pipeline)
tfidf_params = dict(
    model_params=dict(max_iter=1000),
    vectorizer_params=dict(max_features=5000, analyzer='char', ngram_range=(3, 5)),
)

# Cross-validate logistic regression on TF-IDF features of the raw URL strings
run_cv_experiment(
    X_text,
    y,
    X_text_test,
    create_tfidf_lr_pipeline,
    experiment_name="exp_1_tfidf_lr",
    model_name="LogisticRegression",
    vectorizer_name="TfidfVectorizer",
    params=tfidf_params,
    feature_names_func=get_tfidf_feature_names,
)

### 3. Logistic Regression (Combined Features)

Since the TF-IDF features perform better than the numeric features alone, let's try combining both feature sets.

In [None]:
# Combined frames: the numeric feature columns plus the raw URL string
X_combined_df = train_w_features_df.loc[:, numeric_cols].copy()
X_combined_df['url'] = train_df['url']

X_combined_test_df = test_w_features_df.loc[:, numeric_cols].copy()
X_combined_test_df['url'] = test_df['url']

# Preprocessor: TF-IDF on the URL column, standard scaling on the numeric columns.
# The URL column is selected with the scalar name 'url' (not a list) so that
# ColumnTransformer hands TfidfVectorizer a 1-D Series of strings.
_url_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', _url_tfidf, 'url'),
        ('scaler', StandardScaler(), numeric_cols),
    ],
    remainder='drop',  # Drop any other columns if present
)

def create_combined_lr_pipeline():
    """Build a fresh, unfitted pipeline: combined preprocessor followed by logistic regression.

    Returns:
        sklearn Pipeline with steps 'preprocessor' (TF-IDF on 'url' plus scaled
        numeric columns) and 'clf' (LogisticRegression).
    """
    # Clone the module-level template so each pipeline owns its preprocessor.
    # Pipeline.fit fits its steps in place, so sharing one instance would let
    # a later fold overwrite the fitted state of every earlier fold's model.
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
    ])

def get_combined_feature_names(pipeline):
    """Return feature names of a fitted combined pipeline: TF-IDF names first, then numeric columns."""
    fitted_preprocessor = pipeline.named_steps['preprocessor']
    url_vectorizer = fitted_preprocessor.named_transformers_['tfidf']
    tfidf_names = url_vectorizer.get_feature_names_out().tolist()
    # The numeric block is scaled column-by-column, so it keeps the names and
    # order of `numeric_cols`.
    return tfidf_names + numeric_cols

# Parameters recorded with the experiment (mirrors the preprocessor and classifier settings)
combined_params = dict(
    model_params=dict(max_iter=1000),
    vectorizer_params=dict(
        tfidf=dict(max_features=5000, analyzer='char', ngram_range=(3, 5)),
        scaler='StandardScaler',
    ),
)

# Cross-validate logistic regression on TF-IDF + numeric features
run_cv_experiment(
    X_combined_df,
    y,
    X_combined_test_df,
    create_combined_lr_pipeline,
    experiment_name="exp_1_combined_lr",
    model_name="LogisticRegression",
    vectorizer_name="CombinedFeatures",
    params=combined_params,
    feature_names_func=get_combined_feature_names,
)

### 4. SVM (Combined Features)

Our combined features seem to perform better overall, suggesting that both feature sets contribute useful information. Let's try an SVM to see if accuracy improves further.

In [None]:
def create_combined_svm_pipeline():
    """Build a fresh, unfitted pipeline: combined preprocessor followed by a linear-kernel SVC.

    Returns:
        sklearn Pipeline with steps 'preprocessor' and 'clf'. The SVC is created
        with probability=True so the pipeline exposes `predict_proba`.
    """
    # Clone the module-level template so each pipeline owns its preprocessor.
    # Pipeline.fit fits its steps in place, so sharing one instance would let
    # a later fold overwrite the fitted state of every earlier fold's model.
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('clf', SVC(kernel='linear', C=1.0, random_state=RANDOM_STATE, probability=True))
    ])

# Parameters recorded with the experiment (mirrors create_combined_svm_pipeline)
combined_svm_params = dict(
    model_params=dict(kernel='linear', C=1.0, probability=True),
    vectorizer_params=dict(
        tfidf=dict(max_features=5000, analyzer='char', ngram_range=(3, 5)),
        scaler='StandardScaler',
    ),
)

# Cross-validate the linear-kernel SVM on TF-IDF + numeric features
run_cv_experiment(
    X_combined_df,
    y,
    X_combined_test_df,
    create_combined_svm_pipeline,
    experiment_name="exp_1_combined_svm",
    model_name="SVM",
    vectorizer_name="CombinedFeatures",
    params=combined_svm_params,
    feature_names_func=get_combined_feature_names,
)

### 5. Optuna Hyperparameter Tuning

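We will use Optuna to tune the hyperparameters of our best model family (the linear SVM) to see if we can improve performance further. The search tunes a `LinearSVC` wrapped in `CalibratedClassifierCV` (which supplies the probability estimates used to compute ROC AUC) rather than the `SVC(kernel='linear')` used above.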

In [None]:
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.exceptions import TrialPruned

# Seeded TPE sampler, created with the multivariate and group options enabled
sampler = TPESampler(
    multivariate=True,
    group=True,
    seed=RANDOM_STATE,
)
# Median-based pruning; n_startup_trials=10 and no warm-up steps
pruner = MedianPruner(n_warmup_steps=0, n_startup_trials=10)

def objective(trial):
    """Optuna objective to tune a calibrated LinearSVC on combined features.

    Samples LinearSVC hyperparameters and a calibration method, evaluates the
    resulting pipeline with stratified K-fold CV on `X_combined_df` / `y`, and
    reports each fold's ROC AUC to the trial so the pruner can stop weak trials.

    Args:
        trial: optuna.trial.Trial used to sample hyperparameters and report
            intermediate values.

    Returns:
        Mean validation ROC AUC across the folds (float).

    Raises:
        TrialPruned: if the pruner asks to stop, or if fitting/scoring raises a
            ValueError for the sampled configuration.
    """
    C = trial.suggest_float('C', 1e-4, 1e3, log=True)
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
    loss = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])
    # Hinge loss is always run with dual=True (LinearSVC does not support hinge
    # with the primal formulation under its default l2 penalty), so 'dual' is
    # only part of the search space for squared_hinge.
    if loss == 'hinge':
        dual = True
    else:
        dual = trial.suggest_categorical('dual', [True, False])
    tol = trial.suggest_float('tol', 1e-5, 1e-2, log=True)
    max_iter = trial.suggest_int('max_iter', 2000, 20000)
    calibration_method = trial.suggest_categorical('calibration_method', ['sigmoid', 'isotonic'])

    # Same splitter configuration as run_cv_experiment, so folds are comparable.
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    fold_aucs = []

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_combined_df, y)):
        X_train_fold, X_val_fold = X_combined_df.iloc[train_idx], X_combined_df.iloc[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        base_estimator = LinearSVC(
            C=C,
            class_weight=class_weight,
            loss=loss,
            dual=dual,
            tol=tol,
            max_iter=max_iter,
            random_state=RANDOM_STATE
        )

        # LinearSVC has no predict_proba; the calibration wrapper provides it.
        calibrated_clf = CalibratedClassifierCV(
            estimator=base_estimator,
            cv=3,
            method=calibration_method,
            n_jobs=-1
        )

        # Clone the module-level template so tuning never refits (and thereby
        # mutates) the preprocessor instance shared with other pipelines.
        model = Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('clf', calibrated_clf)
        ])

        try:
            model.fit(X_train_fold, y_train_fold)
            val_probs = model.predict_proba(X_val_fold)[:, 1]
            fold_auc = roc_auc_score(y_val_fold, val_probs)
        except ValueError as exc:
            # Treat a configuration that cannot be fitted/scored as a pruned
            # trial rather than failing the whole study.
            raise TrialPruned() from exc

        fold_aucs.append(fold_auc)
        trial.report(fold_auc, step=fold_idx)

        if trial.should_prune():
            raise TrialPruned()

    mean_auc = float(np.mean(fold_aucs))
    return mean_auc

In [None]:
# Maximise mean validation AUC over 60 trials with the sampler/pruner defined above
study = optuna.create_study(
    sampler=sampler,
    pruner=pruner,
    direction='maximize',
)
study.optimize(objective, show_progress_bar=True, n_trials=60)

# Summarise the search
_n_finished_trials = len(study.trials)
print('Number of finished trials:', _n_finished_trials)
print('Best AUC:', study.best_value)
print('Best params:', study.best_params)

In [None]:
# Start from a copy of the winning trial's parameters so the study is not mutated
best_params = dict(study.best_params)
best_params['random_state'] = RANDOM_STATE
# 'dual' is only sampled for squared_hinge trials; hinge trials always ran with
# dual=True, so that is the value to fill in when the key is absent. (The old
# `True if loss == 'hinge' else True` conditional yielded True on both branches.)
best_params.setdefault('dual', True)
best_params['max_iter'] = int(best_params['max_iter'])
# The calibration method configures the wrapper, not LinearSVC, so keep it separate
calibration_method = best_params.pop('calibration_method')

def create_tuned_linear_svc_pipeline():
    """Build a fresh, unfitted pipeline using the tuned LinearSVC hyperparameters.

    Reads the module-level `best_params` and `calibration_method` produced from
    the Optuna study.

    Returns:
        sklearn Pipeline with steps 'preprocessor' and 'clf', where 'clf' is a
        LinearSVC wrapped in CalibratedClassifierCV (3-fold) so that
        `predict_proba` is available.
    """
    base_estimator = LinearSVC(
        C=best_params['C'],
        class_weight=best_params['class_weight'],
        loss=best_params['loss'],
        dual=best_params['dual'],
        tol=best_params['tol'],
        max_iter=best_params['max_iter'],
        random_state=best_params['random_state']
    )
    calibrated_clf = CalibratedClassifierCV(
        estimator=base_estimator,
        cv=3,
        method=calibration_method,
        n_jobs=-1
    )
    # Clone the module-level template so each pipeline owns its preprocessor.
    # Pipeline.fit fits its steps in place, so sharing one instance would let
    # a later fold overwrite the fitted state of every earlier fold's model.
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('clf', calibrated_clf)
    ])

# Parameters recorded with the experiment: tuned LinearSVC settings plus calibration setup
tuned_linear_svc_params = dict(
    model_params=dict(best_params, calibration_method=calibration_method, calibration_cv=3),
    vectorizer_params=dict(
        tfidf=dict(max_features=5000, analyzer='char', ngram_range=(3, 5)),
        scaler='StandardScaler',
    ),
)

# Cross-validate the tuned, calibrated LinearSVC on TF-IDF + numeric features
print("Running final experiment with tuned LinearSVC parameters...")
run_cv_experiment(
    X_combined_df,
    y,
    X_combined_test_df,
    create_tuned_linear_svc_pipeline,
    experiment_name="exp_1_combined_linear_svc_optuna",
    model_name="LinearSVC",
    vectorizer_name="CombinedFeatures",
    params=tuned_linear_svc_params,
    feature_names_func=get_combined_feature_names,
)