# Phishing URL Transformer Model Experiments

This notebook fine-tunes transformer-based classifiers (two DeBERTa variants) for phishing URL detection on the Kaggle phishing URL dataset.

In increasing order of complexity, we will experiment with:

1. DeBERTa on URL only
2. DeBERTa on URL and Engineered Features

## Setup and Imports

In [1]:
# Environment setup: mount Google Drive when running on Colab, otherwise fall
# back to the current working directory so the notebook still runs locally.
import os

try:
    from google.colab import drive
except ImportError:
    # Not on Colab: keep experiment artifacts next to the notebook.
    use_drive = False
    drive_root = os.getcwd()
else:
    drive.mount('/content/drive')
    use_drive = True
    drive_root = '/content/drive/MyDrive/fraud-grp-proj/'

print(os.path.exists(drive_root)) # check path exists

# Root folder handed to ModelSaver for every experiment in this notebook
base_path = os.path.join(drive_root, "experiments")

from save_model import ModelSaver


Mounted at /content/drive
True


In [2]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as optim

# Reproducibility: fix the PyTorch and NumPy seeds (and CUDA's when a GPU is present)
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Prefer the GPU when one is visible, otherwise run on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# Table and plot defaults (the palette is set after the style so it is not overridden)
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

Using device: cuda


In [3]:
# Training split: raw (url, target) table and its feature-engineered counterpart
train_df = pd.read_csv('dataset/train.csv')
train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')

# Test split: same two tables
test_df = pd.read_csv('dataset/test.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

# Report the shapes, raw tables first and feature tables second
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Train with features shape: {train_w_features_df.shape}")
print(f"Test with features shape: {test_w_features_df.shape}")

Train shape: (9143, 2)
Test shape: (2286, 2)
Train with features shape: (9143, 78)
Test with features shape: (2286, 78)


In [4]:
# Show the column names of the feature-engineered training table
train_w_features_df.columns

Index(['url', 'target', 'is_http', 'has_subdomain', 'has_tld', 'num_subdomain',
       'is_domain_ip', 'num_hyphens_domain', 'is_punycode', 'has_path',
       'path_depth', 'has_filename', 'has_file_extension', 'has_query',
       'length_url', 'length_hostname', 'length_tld', 'length_sld',
       'length_subdomains', 'length_path', 'length_query', 'num_dots',
       'num_hyphens', 'num_at', 'num_question_marks', 'num_and', 'num_equal',
       'num_percent', 'tld_in_path', 'tld_in_subdomain',
       'subdomain_longer_sld', 'ratio_digits_url', 'ratio_digits_hostname',
       'ratio_letter_url', 'ratio_path_url', 'ratio_hostname_url',
       'length_words_url', 'avg_word_hostname', 'avg_word_path',
       'num_unique_chars_hostname', 'has_shortened_hostname',
       'entropy_hostname', 'has_www_subdomain', 'has_com_tld',
       'is_http_and_many_subdomains', 'ip_and_short_tld',
       'http_and_missing_domain_info', 'subdomain_depth_x_http', 'ip_x_http',
       'domain_complexity_score',

## Transformer-Based Experiments


In [5]:
# Hugging Face checkpoint name; used below for the tokenizer and for every
# model built in this notebook.
MODEL_NAME = "microsoft/deberta-v3-base"
# Default `max_len` of the dataset classes: URLs are truncated and padded to
# this many tokens when they are tokenized.
MAX_LEN = 256

from transformers import AutoTokenizer
# One shared tokenizer instance, loaded from the same checkpoint as the models
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [6]:
class URLOnlyDataset(Dataset):
    """Map-style dataset that turns each row's URL into token tensors plus its label.

    Args:
        df: DataFrame with a ``url`` column (text to tokenize) and a ``target``
            column (class label).
        tokenizer: Callable tokenizer; it is invoked on one URL at a time and its
            result must expose ``input_ids`` and ``attention_mask``.
        max_len: Length every URL is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=MAX_LEN):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` long tensors for row ``idx``."""
        record = self.df.iloc[idx]
        encoding = self.tokenizer(
            record["url"],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        item = {
            field: torch.tensor(encoding[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(record["target"], dtype=torch.long)
        return item


In [7]:
class URLWithFeaturesDataset(Dataset):
    """Map-style dataset yielding tokenized URLs together with engineered features.

    The URL, label and feature columns are extracted from ``df`` once at
    construction time. Doing this per sample would build an object-dtype row
    Series over every column and re-cast the features to float32 on each
    ``__getitem__`` call, which is loop-invariant work on the data-loading path.
    As a consequence the dataset is a snapshot: changes made to ``df`` after
    construction are not reflected.

    Args:
        df: DataFrame with a ``url`` column, a ``target`` column and every column
            named in ``feature_cols``.
        tokenizer: Callable tokenizer; it is invoked on one URL at a time and its
            result must expose ``input_ids`` and ``attention_mask``.
        feature_cols: Names of the numeric feature columns, in the order they
            should appear in the feature vector.
        max_len: Length every URL is truncated/padded to.
    """

    def __init__(self, df, tokenizer, feature_cols, max_len=MAX_LEN):
        self.df = df
        self.tokenizer = tokenizer
        self.feature_cols = feature_cols
        self.max_len = max_len

        # Column-wise extraction, done once instead of once per sample.
        self._urls = df["url"].tolist()
        self._labels = df["target"].to_numpy()
        # (n_rows, n_features) float32 matrix; bool/int columns are cast here.
        self._features = df[list(feature_cols)].to_numpy(dtype="float32")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask``, ``features`` and ``labels`` for row ``idx``."""
        enc = self.tokenizer(
            self._urls[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len
        )

        return {
            "input_ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            # torch.tensor copies, so the cached matrix is never aliased by a batch
            "features": torch.tensor(self._features[idx], dtype=torch.float32),
            "labels": torch.tensor(self._labels[idx], dtype=torch.long),
        }


In [8]:
def collate_fn(batch):
    """Stack a list of per-sample tensor dicts into one batch dict.

    ``input_ids``, ``attention_mask`` and ``labels`` are always stacked;
    ``features`` is stacked as well when the first sample carries that key.
    """
    def _stack(key):
        return torch.stack([sample[key] for sample in batch])

    collated = {key: _stack(key) for key in ("input_ids", "attention_mask", "labels")}
    if "features" in batch[0]:
        collated["features"] = _stack("features")
    return collated


In [9]:
from transformers import AutoModelForSequenceClassification

def DebertaURLOnly():
    """Build a fresh two-label sequence classifier from the ``MODEL_NAME`` checkpoint."""
    classifier = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    return classifier


In [10]:
from transformers import AutoModel, AutoConfig, PreTrainedModel, DebertaV2Config

# Define our custom config by inheriting from the specific base model config
class DebertaWithFeaturesConfig(DebertaV2Config):
    """DeBERTa-v2 configuration extended with the size of the engineered-feature vector.

    Args:
        num_features: Stored as ``self.num_features``; ``DebertaWithFeatures``
            reads it to size the input of its feature projection. Defaults to 0.
        **kwargs: Forwarded unchanged to ``DebertaV2Config.__init__``.
    """

    # Custom identifier that tells this config apart from the stock one
    model_type = "deberta_with_features"

    def __init__(self, num_features: int = 0, **kwargs):
        # Let the parent populate the standard fields, then attach the extra one.
        super().__init__(**kwargs)
        self.num_features = num_features


class DebertaWithFeatures(PreTrainedModel):
    """Two-class classifier over the encoder's first-token state plus projected features.

    The encoder output at sequence position 0 is concatenated with a
    128-dimensional projection of the engineered features and passed through a
    single linear layer producing two logits.

    NOTE(review): the encoder is loaded from ``MODEL_NAME`` inside ``__init__``,
    so every construction of this class loads the base checkpoint, regardless of
    the ``config`` passed in -- confirm this is intended if saved checkpoints are
    later reloaded through this class.
    """

    config_class = DebertaWithFeaturesConfig

    def __init__(self, config):
        super().__init__(config)

        # Pretrained text encoder; its own config (not `config`) supplies the hidden size.
        self.deberta = AutoModel.from_pretrained(MODEL_NAME)
        encoder_dim = self.deberta.config.hidden_size
        feature_dim = 128

        # Engineered features -> dense embedding; input width comes from the custom config.
        self.feature_proj = nn.Sequential(
            nn.Linear(config.num_features, feature_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
        )

        # Classification head over [first-token state ; feature embedding]
        self.classifier = nn.Linear(encoder_dim + feature_dim, 2)

    def forward(self, input_ids, attention_mask, features, labels=None):
        """Return ``{"loss", "logits"}``; ``loss`` is ``None`` when no labels are given."""
        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoder_out.last_hidden_state[:, 0]
        projected = self.feature_proj(features)

        logits = self.classifier(torch.cat((first_token, projected), dim=1))

        if labels is None:
            return {"loss": None, "logits": logits}

        criterion = nn.CrossEntropyLoss()
        return {"loss": criterion(logits, labels), "logits": logits}


def create_deberta_with_features(num_features):
    """Instantiate ``DebertaWithFeatures`` for a feature vector of width ``num_features``.

    The custom config is seeded with every field of the pretrained checkpoint's
    DeBERTa-v2 config, with ``num_features`` added on top.
    """
    pretrained_fields = DebertaV2Config.from_pretrained(MODEL_NAME).to_dict()
    custom_config = DebertaWithFeaturesConfig(num_features=num_features, **pretrained_fields)
    return DebertaWithFeatures(custom_config)

In [11]:
def compute_metrics(pred):
    """Compute binary-classification metrics from an evaluation prediction.

    Args:
        pred: Pair ``(logits, labels)`` where ``logits`` has one row per sample
            and two columns (class 0, class 1) and ``labels`` holds 0/1 targets.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc`` and
        the confusion-matrix counts ``TP``, ``FP``, ``TN``, ``FN`` (plain ints).
        ``roc_auc`` is NaN when it is undefined because ``labels`` contains a
        single class.
    """
    logits, labels = pred
    probs = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()
    preds = np.argmax(logits, axis=1)

    # Pin the label set so the matrix is always 2x2, even when one class is
    # absent from both labels and predictions (otherwise it collapses to 1x1
    # and indexing the counts raises IndexError).
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()

    try:
        roc_auc = roc_auc_score(labels, probs)
    except ValueError:
        # ROC AUC is undefined with only one class present in `labels`.
        roc_auc = float("nan")

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
        "roc_auc": roc_auc,
        "TP": int(tp),
        "FP": int(fp),
        "TN": int(tn),
        "FN": int(fn)
    }


In [12]:
from transformers import TrainingArguments, Trainer

def _one_hot_encode_aligned(train_features, test_features, protected_cols=("url", "target")):
    """One-hot encode object-dtype columns and give both frames the same column layout.

    The columns to encode are the object-dtype columns of ``train_features``
    other than ``protected_cols``. A dummy column produced for only one of the
    two frames is added to the other as an all-zero column. Missing columns are
    appended in column order (not set order), so the resulting layout does not
    depend on string hashing.

    Returns:
        ``(train_encoded, test_encoded, encoded_cols)``; the inputs are returned
        unchanged when there is nothing to encode.
    """
    encoded_cols = [
        col for col in train_features.select_dtypes(include=['object']).columns
        if col not in protected_cols
    ]
    if not encoded_cols:
        return train_features, test_features, encoded_cols

    prefixes = [f'ohe_{col}' for col in encoded_cols]
    train_encoded = pd.get_dummies(train_features, columns=encoded_cols, prefix=prefixes)
    test_encoded = pd.get_dummies(test_features, columns=encoded_cols, prefix=prefixes)

    missing_in_test = [
        col for col in train_encoded.columns
        if col not in test_encoded.columns and col not in protected_cols
    ]
    missing_in_train = [
        col for col in test_encoded.columns
        if col not in train_encoded.columns and col not in protected_cols
    ]
    for col in missing_in_test:
        test_encoded[col] = 0
    for col in missing_in_train:
        train_encoded[col] = 0

    # Same column order in both frames
    test_encoded = test_encoded[train_encoded.columns]
    return train_encoded, test_encoded, encoded_cols


def run_transformer_experiment(model_type, experiment_name, n_folds=5, num_epochs=2,
                               train_batch_size=16, seed=42):
    """Run stratified k-fold fine-tuning and record every fold with ``ModelSaver``.

    For each fold a fresh model is trained, evaluated on the held-out part of the
    training data, and used to predict class-1 probabilities for the test set.

    Args:
        model_type: ``"url_only"`` trains the plain sequence classifier on
            ``train_df``; any other value trains the URL + engineered-features
            model on ``train_w_features_df``.
        experiment_name: Name passed to ``ModelSaver`` and used in the temporary
            output directory of each fold.
        n_folds: Number of stratified folds.
        num_epochs: Training epochs per fold.
        train_batch_size: Per-device training batch size.
        seed: Seed for the fold shuffling and for ``TrainingArguments``.

    Returns:
        list of per-fold metric dicts (``eval_`` prefix stripped, plus ``fold``).

    Side effects:
        Replaces the module-level ``train_w_features_df`` / ``test_w_features_df``
        with their one-hot encoded versions when they still contain object-dtype
        feature columns.

    NOTE(review): the engineered features reach the model unscaled; consider
    standardizing them per training fold -- confirm against the feature ranges.
    """
    import gc

    print(f"\n=== Running {experiment_name} ({model_type}) ===")

    saver = ModelSaver(base_path=base_path)
    saver.start_experiment(
        experiment_name=experiment_name,
        model_type="DeBERTa Transformer",
        vectorizer="DeBERTa Tokenizer",
        vectorizer_params={"model": MODEL_NAME, "max_len": MAX_LEN},
        # Recorded metadata is derived from the values actually used for training
        model_params={"epochs": num_epochs, "batch_size": train_batch_size},
        n_folds=n_folds,
        save_format="transformers"
    )

    # Preprocess categorical columns for both train and test dataframes
    global train_w_features_df, test_w_features_df
    train_w_features_df, test_w_features_df, encoded_cols = _one_hot_encode_aligned(
        train_w_features_df, test_w_features_df
    )
    if encoded_cols:
        print(f"One-hot encoding columns: {encoded_cols}")

    feature_cols = [c for c in train_w_features_df.columns if c not in ["url", "target"]]
    num_features = len(feature_cols)

    # Select the data, dataset class, model factory and collator for this variant
    if model_type == "url_only":
        cv_frame = train_df

        def build_dataset(frame):
            return URLOnlyDataset(frame, tokenizer)

        build_model = DebertaURLOnly
        data_collator = None
        test_dataset = build_dataset(test_df)
    else:
        cv_frame = train_w_features_df

        def build_dataset(frame):
            return URLWithFeaturesDataset(frame, tokenizer, feature_cols)

        def build_model():
            return create_deberta_with_features(num_features)

        data_collator = collate_fn
        test_dataset = build_dataset(test_w_features_df)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    fold_results = []

    # Split on the frame that is actually trained on, so the stratification
    # always refers to the labels of the rows being selected.
    fold_splits = skf.split(cv_frame, cv_frame["target"])
    for fold_idx, (train_idx, val_idx) in enumerate(fold_splits, start=1):
        print(f"\n--- Fold {fold_idx}/{n_folds} ---")

        train_dataset = build_dataset(cv_frame.iloc[train_idx])
        val_dataset = build_dataset(cv_frame.iloc[val_idx])
        model = build_model()

        args = TrainingArguments(
            output_dir=f"./tmp_{experiment_name}_fold_{fold_idx}",
            eval_strategy="epoch",
            save_strategy="no",
            learning_rate=2e-5,
            weight_decay=0.01,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=32,
            num_train_epochs=num_epochs,
            logging_steps=50,
            remove_unused_columns=False,
            report_to="none",
            seed=seed,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=data_collator
        )

        trainer.train()
        metrics = trainer.evaluate()
        print(f"Fold {fold_idx} ROC AUC: {metrics['eval_roc_auc']:.4f}")

        # Predict on test set
        test_preds = trainer.predict(test_dataset)
        test_logits = test_preds.predictions
        test_probs = torch.softmax(torch.tensor(test_logits), dim=1)[:, 1].numpy()

        fold_metrics = {k.replace("eval_", ""): v for k, v in metrics.items()}
        fold_metrics["fold"] = fold_idx

        saver.add_fold(
            fold_model=(model, tokenizer),
            fold_metric=fold_metrics,
            test_predictions=test_probs
        )

        fold_results.append(fold_metrics)

        # Drop this fold's model and trainer before the next fold builds its own,
        # so two models are not held in memory at the same time.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    saver.finalize_experiment()
    return fold_results

In [None]:
# URL-only DeBERTa run; `results` receives the list of per-fold metric dicts
results = run_transformer_experiment("url_only", "exp_4_deberta_url_only")



=== Running exp_4_deberta_url_only (url_only) ===
Experiment 'exp_4_deberta_url_only' initialized at: /content/drive/MyDrive/fraud-grp-proj/experiments/exp_4_deberta_url_only
Mode: Incremental saving (5 folds)

--- Fold 1/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.1863,0.149788,0.954073,0.942492,0.967213,0.954693,0.9887,885,54,860,30
2,0.1304,0.127127,0.965008,0.973304,0.956284,0.964719,0.993154,875,24,890,40


Fold 1 ROC AUC: 0.9932
  Fold 1/5 saved | ROC AUC: 0.9932

--- Fold 2/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.114,0.262239,0.938764,0.975118,0.900438,0.936291,0.986608,823,21,894,91
2,0.1384,0.201711,0.953527,0.963128,0.943107,0.953013,0.989684,862,33,882,52


Fold 2 ROC AUC: 0.9897
  Fold 2/5 saved | ROC AUC: 0.9897

--- Fold 3/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.1489,0.180292,0.946966,0.964733,0.92779,0.945901,0.989751,848,31,884,66
2,0.0868,0.195419,0.947512,0.967963,0.925602,0.946309,0.990808,846,28,887,68


Fold 3 ROC AUC: 0.9908
  Fold 3/5 saved | ROC AUC: 0.9908

--- Fold 4/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.1403,0.21712,0.943107,0.983294,0.901532,0.940639,0.99081,824,14,900,90
2,0.1272,0.143299,0.960066,0.97407,0.945295,0.959467,0.992072,864,23,891,50


Fold 4 ROC AUC: 0.9921
  Fold 4/5 saved | ROC AUC: 0.9921

--- Fold 5/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.1747,0.192277,0.945295,0.972158,0.916849,0.943694,0.988695,838,24,890,76
2,0.0934,0.15434,0.96116,0.979522,0.942013,0.960402,0.991718,861,18,896,53


Fold 5 ROC AUC: 0.9917
  Fold 5/5 saved | ROC AUC: 0.9917

Finalizing experiment...
  Predictions saved to /content/drive/MyDrive/fraud-grp-proj/experiments/exp_4_deberta_url_only/exp_4_deberta_url_only_prediction.csv

✓ Experiment 'exp_4_deberta_url_only' finalized!
  Location: /content/drive/MyDrive/fraud-grp-proj/experiments/exp_4_deberta_url_only
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9932)
  Average ROC AUC: 0.9915 ± 0.0012


In [13]:
# URL + engineered-features DeBERTa run; `results2` receives the list of per-fold metric dicts
results2 = run_transformer_experiment("url_features", "exp_4_deberta_url_features")



=== Running exp_4_deberta_url_features (url_features) ===
Experiment 'exp_4_deberta_url_features' initialized at: /content/drive/MyDrive/fraud-grp-proj/experiments/exp_4_deberta_url_features
Mode: Incremental saving (5 folds)
One-hot encoding columns: ['homoglyph_type', 'num_subdomain_bucketed', 'length_tld_bucketed', 'path_depth_bucketed']

--- Fold 1/5 ---


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.2094,0.159075,0.957354,0.970754,0.943169,0.956763,0.988407,863,26,888,52
2,0.118,0.144753,0.961728,0.967885,0.955191,0.961496,0.991385,874,29,885,41


Fold 1 ROC AUC: 0.9914
  Fold 1/5 saved | ROC AUC: 0.9914

--- Fold 2/5 ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.126,0.24226,0.944232,0.960317,0.926696,0.943207,0.98613,847,35,880,67
2,0.1474,0.238078,0.948059,0.959596,0.935449,0.947368,0.987876,855,36,879,59


Fold 2 ROC AUC: 0.9879
  Fold 2/5 saved | ROC AUC: 0.9879

--- Fold 3/5 ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.1625,0.142498,0.956807,0.962348,0.950766,0.956522,0.991509,869,34,881,45
2,0.1105,0.195123,0.945325,0.973256,0.915755,0.94363,0.991315,837,23,892,77


Fold 3 ROC AUC: 0.9913
  Fold 3/5 saved | ROC AUC: 0.9913

--- Fold 4/5 ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.136,0.182987,0.95186,0.974713,0.92779,0.950673,0.990634,848,22,892,66
2,0.1151,0.16853,0.960613,0.965708,0.955142,0.960396,0.991162,873,31,883,41


Fold 4 ROC AUC: 0.9912
  Fold 4/5 saved | ROC AUC: 0.9912

--- Fold 5/5 ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Tp,Fp,Tn,Fn
1,0.1694,0.183429,0.952954,0.954945,0.950766,0.952851,0.987456,869,41,873,45
2,0.1004,0.165513,0.962254,0.977401,0.946389,0.961645,0.990262,865,20,894,49


Fold 5 ROC AUC: 0.9903
  Fold 5/5 saved | ROC AUC: 0.9903

Finalizing experiment...
  Predictions saved to /content/drive/MyDrive/fraud-grp-proj/experiments/exp_4_deberta_url_features/exp_4_deberta_url_features_prediction.csv

✓ Experiment 'exp_4_deberta_url_features' finalized!
  Location: /content/drive/MyDrive/fraud-grp-proj/experiments/exp_4_deberta_url_features
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9914)
  Average ROC AUC: 0.9904 ± 0.0013
