# Phishing URL Linear Model Experiments

This notebook explores various linear models using the Kaggle phishing URL dataset.

In increasing order of complexity, we will experiment with:

In [37]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Set seed for reproducibility
np.random.seed(42)

# Display settings
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [11]:
# Load train and test datasets
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

print(f"Train with features shape: {train_w_features_df.shape}")
print(f"Test with features shape: {test_w_features_df.shape}")

Train shape: (9143, 2)
Test shape: (2286, 2)
Train with features shape: (9143, 78)
Test with features shape: (2286, 78)


In [12]:
train_w_features_df.columns

Index(['url', 'target', 'is_http', 'has_subdomain', 'has_tld', 'num_subdomain',
       'is_domain_ip', 'num_hyphens_domain', 'is_punycode', 'has_path',
       'path_depth', 'has_filename', 'has_file_extension', 'has_query',
       'length_url', 'length_hostname', 'length_tld', 'length_sld',
       'length_subdomains', 'length_path', 'length_query', 'num_dots',
       'num_hyphens', 'num_at', 'num_question_marks', 'num_and', 'num_equal',
       'num_percent', 'tld_in_path', 'tld_in_subdomain',
       'subdomain_longer_sld', 'ratio_digits_url', 'ratio_digits_hostname',
       'ratio_letter_url', 'ratio_path_url', 'ratio_hostname_url',
       'length_words_url', 'avg_word_hostname', 'avg_word_path',
       'num_unique_chars_hostname', 'has_shortened_hostname',
       'entropy_hostname', 'has_www_subdomain', 'has_com_tld',
       'is_http_and_many_subdomains', 'ip_and_short_tld',
       'http_and_missing_domain_info', 'subdomain_depth_x_http', 'ip_x_http',
       'domain_complexity_score',

Following the EDA, we use the transformed features and drop the original ones since linear models require normalized and scaled inputs.

In [13]:
# Drop original versions of log transformed features
train_w_features_df.drop(columns=['length_url', 'length_path',  'ratio_hostname_url', 'length_words_url', 'avg_word_hostname', 'num_unique_chars_hostname'], inplace=True)

# Drop original versions of squared transformed features
train_w_features_df.drop(columns=['ratio_letter_url', 'entropy_hostname'], inplace=True)

# Drop original versions of is_zero transformed features
train_w_features_df.drop(columns=['num_hyphens_domain', 'length_subdomains', 'num_hyphens',  'num_at', 'num_question_marks', 'num_and', 'num_equal', 'num_percent', 'ratio_digits_url', 'ratio_digits_hostname', 'avg_word_path', 'length_query'], inplace=True)

# Drop original versions of bucketed transformed features
train_w_features_df.drop(columns=['num_subdomain', 'length_tld', 'path_depth'], inplace=True)

# Check final columns
train_w_features_df.columns

Index(['url', 'target', 'is_http', 'has_subdomain', 'has_tld', 'is_domain_ip',
       'is_punycode', 'has_path', 'has_filename', 'has_file_extension',
       'has_query', 'length_hostname', 'length_sld', 'num_dots', 'tld_in_path',
       'tld_in_subdomain', 'subdomain_longer_sld', 'ratio_path_url',
       'has_shortened_hostname', 'has_www_subdomain', 'has_com_tld',
       'is_http_and_many_subdomains', 'ip_and_short_tld',
       'http_and_missing_domain_info', 'subdomain_depth_x_http', 'ip_x_http',
       'domain_complexity_score', 'suspicion_score', 'contains_brand_misspell',
       'is_homoglyph_attack', 'homoglyph_type', 'risk_score',
       'is_zero_num_hyphens_domain', 'is_zero_length_subdomains',
       'is_zero_num_hyphens', 'is_zero_num_at', 'is_zero_num_question_marks',
       'is_zero_num_and', 'is_zero_num_equal', 'is_zero_num_percent',
       'is_zero_ratio_digits_url', 'is_zero_ratio_digits_hostname',
       'is_zero_avg_word_path', 'is_zero_length_query',
       'num_sub

## Training Models

Now lets move on to training the models. We use the saver class to help us standardize the storing of metrics and models for evaluation later on.

In [14]:
# Import ModelSaver
import sys
import os
sys.path.append(os.path.abspath('.'))
from save_model import ModelSaver

# Configuration
SAVE_MODELS = True
N_FOLDS = 5
RANDOM_STATE = 42

# Check device (not strictly needed for sklearn but good for consistency)
print(f"Running on: {sys.platform}")

Running on: darwin


In [15]:
# --- Data Preparation ---

# 1. Prepare Numeric Features
# Select numeric and boolean columns and exclude target
numeric_cols = train_w_features_df.select_dtypes(include=[np.number, bool]).columns.tolist()
if 'target' in numeric_cols:
    numeric_cols.remove('target')

print(f"Selected {len(numeric_cols)} numeric/boolean features.")

# Ensure boolean columns are converted to integers (0/1) for the model
X_numeric = train_w_features_df[numeric_cols].astype(float).values
y = train_w_features_df['target'].values

# Prepare Test Data for Numeric
X_numeric_test = test_w_features_df[numeric_cols].astype(float).values

# 2. Prepare Text Features (URLs)
X_text = train_df['url'].values
X_text_test = test_df['url'].values

# Check shapes
print(f"Numeric Train Shape: {X_numeric.shape}")
print(f"Numeric Test Shape: {X_numeric_test.shape}")
print(f"Text Train Shape: {X_text.shape}")
print(f"Text Test Shape: {X_text_test.shape}")
print(f"Target Shape: {y.shape}")

Selected 49 numeric/boolean features.
Numeric Train Shape: (9143, 49)
Numeric Test Shape: (2286, 49)
Text Train Shape: (9143,)
Text Test Shape: (2286,)
Target Shape: (9143,)


In [16]:
from sklearn.pipeline import Pipeline

def calculate_metrics(y_true, y_pred_proba, threshold=0.5):
    """Calculate standard metrics for binary classification."""
    y_pred = (y_pred_proba >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_pred_proba),
        'TP': int(tp), 'FP': int(fp), 'TN': int(tn), 'FN': int(fn)
    }

def run_cv_experiment(X, y, X_test, pipeline_creator, experiment_name, model_name, vectorizer_name, params, feature_names_func=None):
    """
    Run a cross-validation experiment and save results using ModelSaver.
    
    Args:
        X: Training features
        y: Training targets
        X_test: Test features
        pipeline_creator: Function that returns a fresh sklearn Pipeline
        experiment_name: Name of the experiment for saving
        model_name: Name of the model type
        vectorizer_name: Name of the vectorizer/feature set
        params: Dictionary containing 'model_params' and 'vectorizer_params'
        feature_names_func: Optional function to extract feature names from fitted pipeline
    """
    print(f"\n=== Running Experiment: {experiment_name} ===")
    
    saver = ModelSaver(base_path="experiments")
    saver.start_experiment(
        experiment_name=experiment_name,
        model_type=model_name,
        vectorizer=vectorizer_name,
        vectorizer_params=params.get('vectorizer_params', {}),
        model_params=params.get('model_params', {}),
        n_folds=N_FOLDS
    )
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"Fold {fold}/{N_FOLDS}")
        
        # Split data
        # Handle both numpy arrays and pandas series/dataframes if necessary, 
        # but we converted to numpy arrays in preparation step.
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        
        # Create and fit pipeline
        pipeline = pipeline_creator()
        pipeline.fit(X_train_fold, y_train_fold)
        
        # Validation metrics
        val_probs = pipeline.predict_proba(X_val_fold)[:, 1]
        val_metrics = calculate_metrics(y_val_fold, val_probs)
        val_metrics['fold'] = fold
        
        print(f"  Val AUC: {val_metrics['roc_auc']:.4f}")
        
        # Test predictions (for ensemble later)
        test_probs = pipeline.predict_proba(X_test)[:, 1]
        
        # Get feature names if possible
        feature_names = None
        if feature_names_func:
            try:
                feature_names = feature_names_func(pipeline)
            except Exception as e:
                print(f"  Could not extract feature names: {e}")
            
        saver.add_fold(
            fold_model=pipeline,
            fold_metric=val_metrics,
            test_predictions=test_probs,
            feature_names=feature_names
        )
        
    saver.finalize_experiment()
    print(f"Experiment saved to {saver._exp_dir}")

### 1. Logistic Regression (Engineered Numeric Features)

We first test a simple Logistic Regression model using only the manually engineered numeric features.


In [17]:
def create_numeric_lr_pipeline():
    return Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, solver='lbfgs'))
    ])

def get_numeric_feature_names(pipeline):
    return numeric_cols

numeric_params = {
    'model_params': {'max_iter': 1000, 'solver': 'lbfgs'},
    'vectorizer_params': {'type': 'StandardScaler'}
}

run_cv_experiment(
    X=X_numeric, 
    y=y, 
    X_test=X_numeric_test,
    pipeline_creator=create_numeric_lr_pipeline,
    experiment_name="exp_1_numeric_lr",
    model_name="LogisticRegression",
    vectorizer_name="NumericFeatures",
    params=numeric_params,
    feature_names_func=get_numeric_feature_names
)


=== Running Experiment: exp_1_numeric_lr ===
Experiment 'exp_1_numeric_lr' initialized at: experiments/exp_1_numeric_lr
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9450
  Fold 1/5 saved | ROC AUC: 0.9450
Fold 2/5
  Val AUC: 0.9369
  Fold 2/5 saved | ROC AUC: 0.9369
Fold 3/5
  Val AUC: 0.9431
  Fold 3/5 saved | ROC AUC: 0.9431
Fold 4/5
  Val AUC: 0.9459
  Fold 4/5 saved | ROC AUC: 0.9459
Fold 5/5
  Val AUC: 0.9404
  Fold 5/5 saved | ROC AUC: 0.9404

Finalizing experiment...
  Predictions saved to experiments/exp_1_numeric_lr/exp_1_numeric_lr_prediction.csv

✓ Experiment 'exp_1_numeric_lr' finalized!
  Location: experiments/exp_1_numeric_lr
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9459)
  Average ROC AUC: 0.9423 ± 0.0033
Experiment saved to experiments/exp_1_numeric_lr
  Val AUC: 0.9404
  Fold 5/5 saved | ROC AUC: 0.9404

Finalizing experiment...
  Predictions saved to experiments/exp_1_numeric_lr/exp_1_numeric_lr_prediction.csv

✓ Experiment 'exp_1_numeric_lr' final

### 2. Logistic Regression (TF-IDF Features)

Next, we test Logistic Regression using TF-IDF features extracted directly from the URL strings. We use character n-grams to capture patterns in the URL structure.


In [18]:
def create_tfidf_lr_pipeline():
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, analyzer='char', ngram_range=(3, 5))),
        ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
    ])

def get_tfidf_feature_names(pipeline):
    return pipeline.named_steps['tfidf'].get_feature_names_out().tolist()

tfidf_params = {
    'model_params': {'max_iter': 1000},
    'vectorizer_params': {'max_features': 5000, 'analyzer': 'char', 'ngram_range': (3, 5)}
}

run_cv_experiment(
    X=X_text, 
    y=y, 
    X_test=X_text_test,
    pipeline_creator=create_tfidf_lr_pipeline,
    experiment_name="exp_1_tfidf_lr",
    model_name="LogisticRegression",
    vectorizer_name="TfidfVectorizer",
    params=tfidf_params,
    feature_names_func=get_tfidf_feature_names
)


=== Running Experiment: exp_1_tfidf_lr ===
Experiment 'exp_1_tfidf_lr' initialized at: experiments/exp_1_tfidf_lr
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9713
  Fold 1/5 saved | ROC AUC: 0.9713
Fold 2/5
  Val AUC: 0.9713
  Fold 1/5 saved | ROC AUC: 0.9713
Fold 2/5
  Val AUC: 0.9672
  Fold 2/5 saved | ROC AUC: 0.9672
Fold 3/5
  Val AUC: 0.9672
  Fold 2/5 saved | ROC AUC: 0.9672
Fold 3/5
  Val AUC: 0.9626
  Fold 3/5 saved | ROC AUC: 0.9626
Fold 4/5
  Val AUC: 0.9626
  Fold 3/5 saved | ROC AUC: 0.9626
Fold 4/5
  Val AUC: 0.9688
  Fold 4/5 saved | ROC AUC: 0.9688
Fold 5/5
  Val AUC: 0.9688
  Fold 4/5 saved | ROC AUC: 0.9688
Fold 5/5
  Val AUC: 0.9618
  Fold 5/5 saved | ROC AUC: 0.9618

Finalizing experiment...
  Predictions saved to experiments/exp_1_tfidf_lr/exp_1_tfidf_lr_prediction.csv

✓ Experiment 'exp_1_tfidf_lr' finalized!
  Location: experiments/exp_1_tfidf_lr
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9713)
  Average ROC AUC: 0.9664 ± 0.0036
Experiment saved

### 3. Logistic Regression (Combined Features)

Since we see that tf-idf features perform better, lets try combining both feature sets.

In [30]:
from scipy.sparse import hstack

# Pre-compute combined features
tfidf_vectorizer = TfidfVectorizer(max_features=5000, analyzer='char', ngram_range=(3, 5))
X_text_tfidf = tfidf_vectorizer.fit_transform(X_text)

scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

X_combined = hstack([X_text_tfidf, X_numeric_scaled]).tocsr()
X_combined_test = hstack([tfidf_vectorizer.transform(X_text_test), scaler.transform(X_numeric_test)]).tocsr()

def create_combined_lr_pipeline():
    return Pipeline([
        ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
    ])

def get_combined_feature_names(pipeline):
    tfidf_features = tfidf_vectorizer.get_feature_names_out().tolist()
    return tfidf_features + numeric_cols

combined_params = {
    'model_params': {'max_iter': 1000},
    'vectorizer_params': {'tfidf': {'max_features': 5000, 'analyzer': 'char', 'ngram_range': (3, 5)}, 'scaler': 'StandardScaler'}
}

run_cv_experiment(
    X=X_combined, 
    y=y, 
    X_test=X_combined_test,
    pipeline_creator=create_combined_lr_pipeline,
    experiment_name="exp_1_combined_lr",
    model_name="LogisticRegression",
    vectorizer_name="CombinedFeatures",
    params=combined_params,
    feature_names_func=get_combined_feature_names
)


=== Running Experiment: exp_1_combined_lr ===
Experiment 'exp_1_combined_lr' initialized at: experiments/exp_1_combined_lr
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9807
  Fold 1/5 saved | ROC AUC: 0.9807
Fold 2/5
  Val AUC: 0.9807
  Fold 1/5 saved | ROC AUC: 0.9807
Fold 2/5
  Val AUC: 0.9787
  Fold 2/5 saved | ROC AUC: 0.9787
Fold 3/5
  Val AUC: 0.9787
  Fold 2/5 saved | ROC AUC: 0.9787
Fold 3/5
  Val AUC: 0.9783
  Fold 3/5 saved | ROC AUC: 0.9783
Fold 4/5
  Val AUC: 0.9783
  Fold 3/5 saved | ROC AUC: 0.9783
Fold 4/5
  Val AUC: 0.9835
  Fold 4/5 saved | ROC AUC: 0.9835
Fold 5/5
  Val AUC: 0.9835
  Fold 4/5 saved | ROC AUC: 0.9835
Fold 5/5
  Val AUC: 0.9775
  Fold 5/5 saved | ROC AUC: 0.9775

Finalizing experiment...
  Predictions saved to experiments/exp_1_combined_lr/exp_1_combined_lr_prediction.csv

✓ Experiment 'exp_1_combined_lr' finalized!
  Location: experiments/exp_1_combined_lr
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9835)
  Average ROC AUC: 0.9797 ± 0.

### 4. SVM (Combined Features)

Our combined features seem to perform better overall, telling us that both feature sets contribute useful information. Lets try using SVM to see if accuracy improves further.

In [38]:
def create_combined_svm_pipeline():
    return Pipeline([
        ('clf', SVC(kernel='linear', C=1.0, random_state=RANDOM_STATE, probability=True))
    ])

combined_svm_params = {
    'model_params': {'kernel': 'linear', 'C': 1.0, 'probability': True},
    'vectorizer_params': {'tfidf': {'max_features': 5000, 'analyzer': 'char', 'ngram_range': (3, 5)}, 'scaler': 'StandardScaler'}
}

run_cv_experiment(
    X=X_combined, 
    y=y, 
    X_test=X_combined_test,
    pipeline_creator=create_combined_svm_pipeline,
    experiment_name="exp_1_combined_svm",
    model_name="SVM",
    vectorizer_name="CombinedFeatures",
    params=combined_svm_params,
    feature_names_func=get_combined_feature_names
)


=== Running Experiment: exp_1_combined_svm ===
Experiment 'exp_1_combined_svm' initialized at: experiments/exp_1_combined_svm
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9845
  Val AUC: 0.9845
  Fold 1/5 saved | ROC AUC: 0.9845
Fold 2/5
  Fold 1/5 saved | ROC AUC: 0.9845
Fold 2/5
  Val AUC: 0.9831
  Val AUC: 0.9831
  Fold 2/5 saved | ROC AUC: 0.9831
Fold 3/5
  Fold 2/5 saved | ROC AUC: 0.9831
Fold 3/5
  Val AUC: 0.9829
  Val AUC: 0.9829
  Fold 3/5 saved | ROC AUC: 0.9829
Fold 4/5
  Fold 3/5 saved | ROC AUC: 0.9829
Fold 4/5
  Val AUC: 0.9865
  Val AUC: 0.9865
  Fold 4/5 saved | ROC AUC: 0.9865
Fold 5/5
  Fold 4/5 saved | ROC AUC: 0.9865
Fold 5/5
  Val AUC: 0.9823
  Val AUC: 0.9823
  Fold 5/5 saved | ROC AUC: 0.9823

Finalizing experiment...
  Predictions saved to experiments/exp_1_combined_svm/exp_1_combined_svm_prediction.csv

✓ Experiment 'exp_1_combined_svm' finalized!
  Location: experiments/exp_1_combined_svm
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9865)
  Ave