In [1]:
import re
import math
import urllib

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score

# Preprocessing 

In [2]:
df1 = pd.read_csv('PhiUSIIL_Phishing_URL_Dataset.csv')
df2 = pd.read_csv('train.csv')

df1 = df1[['URL', 'label']]
df2 = df2.rename(columns={'url': 'URL', 'result': 'label'})

df = pd.concat([df1, df2], axis=0, ignore_index=True)
df.drop_duplicates(inplace=True)

duplicate_mask = df.duplicated(subset=['URL'], keep=False)
conflicting_urls = df[duplicate_mask]['URL'].unique()
df_clean = df[~df['URL'].isin(conflicting_urls)].reset_index(drop=True)
print(f"Deleted rows with conflicts: {len(df) - len(df_clean)}")
df = df_clean

Deleted rows with conflicts: 40


In [3]:
X_urls = df.URL.values
y = df.label.values

X_temp, X_train_urls, y_temp, y_train = train_test_split(
    X_urls, y, test_size=0.7, stratify=y, random_state=42
)
X_val_urls, X_test_urls, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [4]:
def entropy(s):
    if not s:
        return 0.0
    prob = [s.count(c) / len(s) for c in set(s)]
    return -sum(p * math.log2(p) for p in prob)

def rule_based_phish(url):
    return '@' in url

def extract_features(url):
    has_scheme_orig = url.startswith(('http://', 'https://'))
    parse_url = url if has_scheme_orig else 'http://' + url
    
    try:
        parsed = urllib.parse.urlparse(parse_url)
        netloc = parsed.netloc
        
        if netloc.startswith('xn--'):
            netloc = netloc.encode('ascii').decode('idna')
        
        normalized_url = parsed._replace(netloc=netloc).geturl()
        url = normalized_url
    except:
        pass
    
    has_scheme = url.startswith(('http://', 'https://'))
    parse_url = url if has_scheme else 'http://' + url
    
    try:
        parsed = urllib.parse.urlparse(parse_url)
        netloc = parsed.netloc.lower()
        path = parsed.path.lower()
    except:
        netloc = path = ""
    
    num_slashes = url.count('/')
    num_exclam = url.count('!')
    num_at = url.count('@')
    num_dollar = url.count('$')
    num_dots = netloc.count('.')
    len_netloc = len(netloc)
    dot_density = num_dots / (len_netloc + 1e-6)
    len_url = len(url)
    path_depth = max(0, num_slashes - 2)
    
    parts = netloc.split('.') if netloc else []
    subdomain = '.'.join(parts[:-2]) if len(parts) > 2 else ""
    is_random_subdomain = (
        len(subdomain) > 10 and 
        subdomain.isalnum() and 
        not any(brand in subdomain for brand in ['www', 'mail', 'blog'])
    )
    
    random_strings = re.findall(r'[a-zA-Z0-9]{10,}', path)
    has_random_string_in_path = len(random_strings) > 0
    
    clean_netloc = netloc.replace('.', '').replace('-', '')
    domain_entropy = entropy(clean_netloc) if clean_netloc else 0.0
    
    part_lengths = [len(part) for part in parts] if parts else [0]
    domain_length_std = np.std(part_lengths) if len(part_lengths) > 1 else 0.0
    
    digit_count = sum(c.isdigit() for c in netloc)
    digit_ratio = digit_count / (len(netloc) + 1e-6)
    
    return np.array([
        num_slashes,
        num_exclam,
        num_at,
        num_dollar,
        dot_density,
        len_netloc,
        len_url,
        path_depth,
        int(is_random_subdomain),
        int(has_random_string_in_path),
        domain_entropy,
        domain_length_std,
        digit_ratio
    ], dtype=np.float32)

In [5]:
MAX_LEN = 256
CHARS = "".join(sorted(set(
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
    ":/?#[]@!$&'()*+,;="
    "`{}|\\^%\"<> "
)))
char_to_id = {ch: i + 1 for i, ch in enumerate(CHARS)}

def url_to_seq(url, max_len=MAX_LEN):
    if not isinstance(url, str):
        url = ""
    url = url.lower()
    seq = [char_to_id.get(c, 0) for c in url[:max_len]]
    return seq + [0] * (max_len - len(seq))

# Dataset

In [6]:
class HybridDataset(Dataset):
    def __init__(self, urls, labels=None, scaler=None, fit_scaler=False):
        self.urls = urls
        self.labels = labels
        
        self.seqs = np.array([url_to_seq(url) for url in urls], dtype=np.int64)
        
        hc_features = np.array([extract_features(url) for url in urls], dtype=np.float32)
        if fit_scaler:
            self.scaler = StandardScaler()
            self.hc_features = self.scaler.fit_transform(hc_features)
        elif scaler is not None:
            self.hc_features = scaler.transform(hc_features)
        else:
            self.hc_features = hc_features

    def __len__(self):
        return len(self.urls)

    def __getitem__(self, idx):
        seq = self.seqs[idx]
        hc = self.hc_features[idx]
        if self.labels is not None:
            y = float(self.labels[idx])
            return (torch.tensor(seq), torch.tensor(hc)), torch.tensor(y, dtype=torch.float32)
        return (torch.tensor(seq), torch.tensor(hc))

In [7]:
train_dataset = HybridDataset(X_train_urls, y_train, fit_scaler=True)
scaler = train_dataset.scaler
val_dataset = HybridDataset(X_val_urls, y_val, scaler=scaler)
test_dataset = HybridDataset(X_test_urls, y_test, scaler=scaler)

BATCH_SIZE = 256
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import joblib
joblib.dump(scaler, 'scaler.pkl')

# Model

In [8]:
class HybridPhishNet(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, kernel_sizes=[3, 4, 5, 6, 7, 8], num_filters=128, 
                 handcrafted_dim=13, hidden_cnn=128, hidden_hc=8, hidden_combined=16):
        super().__init__()
        
        # CNN
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(embed_dim, num_filters, k),
                nn.ReLU(),
                nn.AdaptiveMaxPool1d(1)
            ) for k in kernel_sizes
        ])
        self.cnn_classifier = nn.Linear(num_filters * len(kernel_sizes), hidden_cnn)
        
        # Handcrafted
        self.hc_net = nn.Sequential(
            nn.Linear(handcrafted_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(16, hidden_hc)
        )
        
        # Joining
        self.combined_net = nn.Sequential(
            nn.Linear(hidden_cnn + hidden_hc, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_combined, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x_seq, x_hc):
        # CNN
        emb = self.embedding(x_seq).permute(0, 2, 1)
        cnn_features = torch.cat([conv(emb).squeeze(-1) for conv in self.convs], dim=1)
        cnn_out = self.cnn_classifier(cnn_features)
        
        # Handcrafted
        hc_out = self.hc_net(x_hc)
        
        # Joining
        combined = torch.cat([cnn_out, hc_out], dim=1)
        return self.combined_net(combined).squeeze(-1)

# Learning

In [9]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for (x_seq, x_hc), y in dataloader:
        x_seq, x_hc, y = x_seq.to(device), x_hc.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x_seq, x_hc)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for (x_seq, x_hc), y in dataloader:
            x_seq, x_hc, y = x_seq.to(device), x_hc.to(device), y.to(device)
            outputs = model(x_seq, x_hc)
            all_preds.append(outputs.cpu().numpy())
            all_labels.append(y.cpu().numpy())
    return np.concatenate(all_preds), np.concatenate(all_labels)

def find_best_threshold_f2(y_true, y_proba):
    best_thresh = 0.5
    best_f2 = 0.0
    for thresh in np.arange(0.01, 0.5, 0.01):
        y_pred = (y_proba > thresh).astype(int)
        f2 = fbeta_score(y_true, y_pred, beta=2.0, pos_label=1)
        if f2 > best_f2:
            best_f2 = f2
            best_thresh = thresh
    print(f"Best F2 threshold: {best_thresh:.2f} - F2: {best_f2:.4f}")
    return best_thresh

In [10]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
PATIENCE = 7
EPOCHS = 30

In [11]:
model = HybridPhishNet(vocab_size=len(CHARS) + 1).to(DEVICE)

criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5)

In [12]:
total_params = 0
for param in model.parameters():
    total_params += param.numel()

print(f"Total params: {total_params}")

Total params: 661337


In [13]:
best_f2 = 0
patience_counter = 0

for epoch in range(EPOCHS):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, DEVICE)
    val_preds, val_labels = evaluate(model, val_loader, DEVICE)
    
    current_f2 = fbeta_score(val_labels, (val_preds > 0.3).astype(int), beta=2.0, pos_label=1)
    scheduler.step(current_f2)
    
    print(f"Epoch {epoch+1:2d} | Loss: {train_loss:.4f} | Val F2: {current_f2:.4f}")
    
    if current_f2 > best_f2:
        best_f2 = current_f2
        patience_counter = 0
        torch.save(model.state_dict(), 'best_hybrid_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping")
            break

Epoch  1 | Loss: 0.1676 | Val F2: 0.9534
Epoch  2 | Loss: 0.1072 | Val F2: 0.9561
Epoch  3 | Loss: 0.0874 | Val F2: 0.9600
Epoch  4 | Loss: 0.0713 | Val F2: 0.9696
Epoch  5 | Loss: 0.0622 | Val F2: 0.9608
Epoch  6 | Loss: 0.0538 | Val F2: 0.9619
Epoch  7 | Loss: 0.0472 | Val F2: 0.9605
Epoch  8 | Loss: 0.0257 | Val F2: 0.9654
Epoch  9 | Loss: 0.0209 | Val F2: 0.9701
Epoch 10 | Loss: 0.0163 | Val F2: 0.9716
Epoch 11 | Loss: 0.0141 | Val F2: 0.9684
Epoch 12 | Loss: 0.0115 | Val F2: 0.9686
Epoch 13 | Loss: 0.0110 | Val F2: 0.9643
Epoch 14 | Loss: 0.0052 | Val F2: 0.9718
Epoch 15 | Loss: 0.0043 | Val F2: 0.9721
Epoch 16 | Loss: 0.0036 | Val F2: 0.9683
Epoch 17 | Loss: 0.0041 | Val F2: 0.9683
Epoch 18 | Loss: 0.0029 | Val F2: 0.9656
Epoch 19 | Loss: 0.0019 | Val F2: 0.9654
Epoch 20 | Loss: 0.0018 | Val F2: 0.9675
Epoch 21 | Loss: 0.0019 | Val F2: 0.9697
Epoch 22 | Loss: 0.0015 | Val F2: 0.9662
Early stopping


# Evaluation

In [None]:
model.load_state_dict(torch.load('best_model.pth'))

val_preds, val_labels = evaluate(model, val_loader, DEVICE)
best_thresh = find_best_threshold_f2(val_labels, val_preds)
test_preds, test_labels = evaluate(model, test_loader, DEVICE)

test_pred_final = []
for i, url in enumerate(X_test_urls):
    if rule_based_phish(url):
        test_pred_final.append(1) 
    else:
        test_pred_final.append(int(test_preds[i] > best_thresh))

test_pred_final = np.array(test_pred_final)

Best F2 threshold: 0.06 - F2: 0.9729


In [17]:
test_recall = recall_score(test_labels, test_pred_final, pos_label=1)
test_precision = precision_score(test_labels, test_pred_final, pos_label=1)
test_f1 = f1_score(test_labels, test_pred_final, pos_label=1)
test_f2 = fbeta_score(test_labels, test_pred_final, beta=2.0, pos_label=1)

print(f"Recall:        {test_recall:.4f}")
print(f"Precision:     {test_precision:.4f}")
print(f"F1-score:      {test_f1:.4f}")
print(f"F2-score:      {test_f2:.4f}")

Recall:        0.9812
Precision:     0.9355
F1-score:      0.9578
F2-score:      0.9717
