# Phishing URL Transformer Model Experiments

This notebook explores various transformer architectures using the Kaggle phishing URL dataset.

In increasing order of complexity, we will experiment with:

1. DeBERTa on URL only 
2. DeBERTa on URL and engineered features hybrid model 

## Setup and Imports

In [None]:
import os

# Defaults for a local run: experiment artifacts go to ./experiments
use_drive = False
base_path = "experiments"

# On Google Colab, mount Drive and store experiments there instead.
# The import only succeeds inside Colab, so the same cell works unchanged
# both locally and on Colab (no manual commenting/uncommenting needed).
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: keep the local defaults set above.
    pass
else:
    drive.mount('/content/drive')
    use_drive = True
    drive_root = '/content/drive/MyDrive/fraud-grp-proj/'
    base_path = os.path.join(drive_root, "experiments")
    print(os.path.exists(drive_root)) # check path exists

Mounted at /content/drive
True


In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
import os
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as optim

# Model Tracking
from save_model import ModelSaver

# Fix the RNG state of every library in use so reruns are comparable
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# Notebook-wide presentation defaults.
# Plot theme first, then the colour palette (kept in their original order).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Never elide columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)

Using device: cuda


In [None]:
# Load train and test datasets
# Training data: the two-column CSV and its feature-engineered counterpart
train_df = pd.read_csv('dataset/train.csv')
train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')

# Held-out test data, in the same two flavours
test_df = pd.read_csv('dataset/test.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

# Report (rows, columns) for every frame as a quick sanity check
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

print(f"Train with features shape: {train_w_features_df.shape}")
print(f"Test with features shape: {test_w_features_df.shape}")

Train shape: (9143, 2)
Test shape: (2286, 2)
Train with features shape: (9143, 78)
Test with features shape: (2286, 78)


In [None]:
# Show the column names of the feature-engineered training frame
train_w_features_df.columns

Index(['url', 'target', 'is_http', 'has_subdomain', 'has_tld', 'num_subdomain',
       'is_domain_ip', 'num_hyphens_domain', 'is_punycode', 'has_path',
       'path_depth', 'has_filename', 'has_file_extension', 'has_query',
       'length_url', 'length_hostname', 'length_tld', 'length_sld',
       'length_subdomains', 'length_path', 'length_query', 'num_dots',
       'num_hyphens', 'num_at', 'num_question_marks', 'num_and', 'num_equal',
       'num_percent', 'tld_in_path', 'tld_in_subdomain',
       'subdomain_longer_sld', 'ratio_digits_url', 'ratio_digits_hostname',
       'ratio_letter_url', 'ratio_path_url', 'ratio_hostname_url',
       'length_words_url', 'avg_word_hostname', 'avg_word_path',
       'num_unique_chars_hostname', 'has_shortened_hostname',
       'entropy_hostname', 'has_www_subdomain', 'has_com_tld',
       'is_http_and_many_subdomains', 'ip_and_short_tld',
       'http_and_missing_domain_info', 'subdomain_depth_x_http', 'ip_x_http',
       'domain_complexity_score',

## DeBERTa-based experiments

Imports

In [None]:
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments


URL Only

In [None]:
class URLOnlyDataset(Dataset):
    """Dataset yielding one tokenized URL and its label per DataFrame row.

    Args:
        df: DataFrame with a "url" column (text to tokenize) and a "target"
            column (integer class id).
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length", max_length=max_len)``
            that returns a mapping with "input_ids" and "attention_mask".
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with "input_ids", "attention_mask" and "labels" tensors
        for the row at positional index ``idx``."""
        row = self.df.iloc[idx]
        # Use the tokenizer handed to this instance. Relying on a notebook-level
        # global would ignore the constructor argument and fail with NameError
        # whenever that global has not been defined yet.
        enc = self.tokenizer(
            row["url"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len
        )

        return {
            "input_ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(row["target"], dtype=torch.long)
        }


class URLWithFeaturesDataset(Dataset):
    """Dataset yielding a tokenized URL, its numeric feature vector and its label.

    Args:
        df: DataFrame with a "url" column, a "target" column and every column
            named in ``feature_cols``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length", max_length=max_len)``
            that returns a mapping with "input_ids" and "attention_mask".
        feature_cols: Names of the columns converted to the float32 feature vector.
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, feature_cols, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.feature_cols = feature_cols
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with "input_ids", "attention_mask", "features" and
        "labels" tensors for the row at positional index ``idx``."""
        row = self.df.iloc[idx]
        # Use the tokenizer handed to this instance. Relying on a notebook-level
        # global would ignore the constructor argument and fail with NameError
        # whenever that global has not been defined yet.
        enc = self.tokenizer(
            row["url"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len
        )

        # The row is cast to float32 exactly as stored in the frame.
        # NOTE(review): no scaling is applied here - confirm the engineered
        # features are already on a scale suitable for the linear projection.
        features = row[self.feature_cols].values.astype(np.float32)

        return {
            "input_ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            "features": torch.tensor(features, dtype=torch.float32),
            "labels": torch.tensor(row["target"], dtype=torch.long)
        }


def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Args:
        batch: Non-empty list of sample dicts holding "input_ids",
            "attention_mask" and "labels" tensors, and optionally "features".

    Returns:
        Dict with the same keys, each value stacked along a new leading
        dimension. "features" is included only when the first sample has it.
    """
    keys = ["input_ids", "attention_mask", "labels"]
    if "features" in batch[0]:
        keys.append("features")
    return {key: torch.stack([sample[key] for sample in batch]) for key in keys}


In [None]:
# Checkpoint identifier and the fixed token length used throughout the notebook
MODEL_NAME = "microsoft/deberta-v3-base"
MAX_LEN = 256

# Tokenizer matching the checkpoint above
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class DebertaURLOnly(nn.Module):
    """Two-class classifier: a linear head on the first-token state of the encoder."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder loaded from the MODEL_NAME checkpoint
        self.deberta = AutoModel.from_pretrained(MODEL_NAME)
        # Linear head mapping the encoder's hidden size to 2 logits
        self.classifier = nn.Linear(self.deberta.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode the batch and classify it.

        Returns:
            Dict with "logits" and "loss"; "loss" is the cross-entropy against
            ``labels`` when they are given, otherwise None.
        """
        encoded = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state serves as the sequence summary
        logits = self.classifier(encoded.last_hidden_state[:, 0])

        if labels is None:
            loss = None
        else:
            loss = nn.functional.cross_entropy(logits, labels)
        return {"loss": loss, "logits": logits}


class DebertaWithFeatures(nn.Module):
    """Two-class classifier fusing the encoder's first-token state with a
    projected numeric feature vector.

    Args:
        num_features: Length of the numeric feature vector per sample.
    """

    def __init__(self, num_features):
        super().__init__()
        proj_dim = 128

        # Pretrained encoder loaded from the MODEL_NAME checkpoint
        self.deberta = AutoModel.from_pretrained(MODEL_NAME)

        # Small MLP lifting the raw features to a proj_dim-wide embedding
        self.feature_proj = nn.Sequential(
            nn.Linear(num_features, proj_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
        )

        # Head over the concatenation [encoder state | feature embedding]
        self.classifier = nn.Linear(self.deberta.config.hidden_size + proj_dim, 2)

    def forward(self, input_ids, attention_mask, features, labels=None):
        """Encode the text, embed the features, concatenate and classify.

        Returns:
            Dict with "logits" and "loss"; "loss" is the cross-entropy against
            ``labels`` when they are given, otherwise None.
        """
        encoded = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat(
            [encoded.last_hidden_state[:, 0], self.feature_proj(features)],
            dim=1,
        )
        logits = self.classifier(fused)

        if labels is None:
            loss = None
        else:
            loss = nn.functional.cross_entropy(logits, labels)
        return {"loss": loss, "logits": logits}


In [None]:
def compute_metrics(pred):
    """Compute classification metrics from raw two-class logits.

    Args:
        pred: Pair ``(logits, labels)``; ``logits`` has one row per sample and
            one column per class, ``labels`` holds the true class ids.

    Returns:
        Dict with "accuracy", "precision", "recall", "f1" (all from the argmax
        prediction) and "roc_auc" (from the softmax probability of class 1).
    """
    logits, labels = pred

    # Hard decision: the class with the largest logit
    predicted = np.argmax(logits, axis=1)
    # Soft score: softmax probability assigned to class 1
    positive_prob = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()

    scores = {"accuracy": accuracy_score(labels, predicted)}
    thresholded = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, metric_fn in thresholded:
        # zero_division=0 reports 0 instead of warning on an empty denominator
        scores[metric_name] = metric_fn(labels, predicted, zero_division=0)
    scores["roc_auc"] = roc_auc_score(labels, positive_prob)
    return scores


In [None]:
def run_transformer_experiment(
    model_type,
    experiment_name,
    n_folds=5,
    learning_rate=2e-5,
    batch_size=16,
    epochs=2,
    seed=42,
):
    """Cross-validate a DeBERTa classifier and record every fold with ModelSaver.

    For each stratified fold a fresh model is trained with the Hugging Face
    ``Trainer``, evaluated on the validation part, used to predict the test
    set, and handed to ``ModelSaver.add_fold``.

    Args:
        model_type: "url_only" trains ``DebertaURLOnly`` on ``train_df``; any
            other value trains ``DebertaWithFeatures`` on ``train_w_features_df``.
        experiment_name: Name passed to ``ModelSaver`` and used in the
            per-fold ``output_dir``.
        n_folds: Number of stratified CV folds.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        batch_size: Per-device training batch size.
        epochs: Number of training epochs per fold.
        seed: Seed for the fold shuffling and for ``TrainingArguments``.

    Returns:
        List with one dict per fold: the ``trainer.evaluate()`` metrics with
        the "eval_" prefix removed, plus a "fold" entry holding the fold number.
    """
    import gc  # only needed for the per-fold memory cleanup below

    print(f"\n=== Running Experiment: {experiment_name} ({model_type}) ===")

    use_features = model_type != "url_only"

    # Initialize ModelSaver. The recorded hyperparameters come from the same
    # variables that configure training, so the metadata cannot drift.
    saver = ModelSaver(base_path=base_path)
    saver.start_experiment(
        experiment_name=experiment_name,
        model_type="DeBERTa Transformer",
        vectorizer="DeBERTa Tokenizer",
        vectorizer_params={"model": MODEL_NAME, "max_len": MAX_LEN},
        model_params={"learning_rate": learning_rate, "batch_size": batch_size, "epochs": epochs},
        n_folds=n_folds,
        save_format="transformers"
    )

    # Pick the frame that is actually trained on, and build the (fold-independent)
    # test dataset once instead of once per fold.
    if use_features:
        source_df = train_w_features_df
        feature_cols = [c for c in source_df.columns if c not in ["url", "target"]]
        num_features = len(feature_cols)
        test_dataset = URLWithFeaturesDataset(
            test_w_features_df, tokenizer, feature_cols, max_len=MAX_LEN
        )
        data_collator = collate_fn
    else:
        source_df = train_df
        test_dataset = URLOnlyDataset(test_df, tokenizer, max_len=MAX_LEN)
        data_collator = None

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    fold_results = []

    # Split on the same frame the indices are applied to, so stratification and
    # row selection stay consistent even if the two CSVs are ordered differently.
    splits = skf.split(source_df, source_df["target"])
    for fold_idx, (train_idx, val_idx) in enumerate(splits, start=1):
        print(f"\n--- Fold {fold_idx}/{n_folds} ---")

        train_fold = source_df.iloc[train_idx]
        val_fold = source_df.iloc[val_idx]

        if use_features:
            # URL + FEATURES MODEL
            train_dataset = URLWithFeaturesDataset(
                train_fold, tokenizer, feature_cols, max_len=MAX_LEN
            )
            val_dataset = URLWithFeaturesDataset(
                val_fold, tokenizer, feature_cols, max_len=MAX_LEN
            )
            model = DebertaWithFeatures(num_features)
        else:
            # URL-ONLY MODEL
            train_dataset = URLOnlyDataset(train_fold, tokenizer, max_len=MAX_LEN)
            val_dataset = URLOnlyDataset(val_fold, tokenizer, max_len=MAX_LEN)
            model = DebertaURLOnly()

        # TRAINING ARGUMENTS
        args = TrainingArguments(
            output_dir=f"./tmp_{experiment_name}_fold_{fold_idx}",
            eval_strategy="epoch",
            save_strategy="no",
            learning_rate=learning_rate,
            weight_decay=0.01,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=32,
            num_train_epochs=epochs,
            logging_steps=50,
            seed=seed,
            # Keep every key the datasets emit (e.g. "features") in the batch
            remove_unused_columns=False,
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=data_collator
        )

        # TRAIN
        trainer.train()

        # VALIDATION RESULTS
        metrics = trainer.evaluate()
        print(f" Fold {fold_idx} ROC AUC: {metrics['eval_roc_auc']:.4f}")

        # Prepare metrics for saver (strip 'eval_' prefix)
        fold_metrics = {k.replace("eval_", ""): v for k, v in metrics.items()}
        fold_metrics["fold"] = fold_idx

        fold_results.append(fold_metrics)

        # PREDICT ON TEST SET
        test_preds = trainer.predict(test_dataset)
        # Apply softmax to get probabilities for class 1
        test_probs = torch.softmax(torch.tensor(test_preds.predictions), dim=1)[:, 1].numpy()

        # SAVE FOLD
        saver.add_fold(
            fold_model=(model, tokenizer),
            fold_metric=fold_metrics,
            test_predictions=test_probs
        )

        # Drop this fold's model/trainer before the next fold allocates a new
        # one; otherwise several full encoders can pile up in GPU memory.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # FINALIZE EXPERIMENT
    saver.finalize_experiment()

    return fold_results


Run Models

In [None]:
# Cross-validate the URL-only DeBERTa model
results_url_only = run_transformer_experiment(
    model_type="url_only",
    experiment_name="exp_4_deberta_url_only",
)

In [None]:
# Cross-validate the hybrid model (URL text + engineered features)
results_url_features = run_transformer_experiment(
    model_type="url_features",
    experiment_name="exp_4_deberta_url_features",
)
