In [None]:
import torch
import torch.nn as nn
from torch.utils.data import random_split
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import pandas as pd
import json
from urllib.parse import urlparse, parse_qs
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# -------------------- Tokenization and Vocabulary --------------------
def tokenize_url(url):
    """Split a URL into a flat list of string tokens.

    Tokens are emitted in this order: the scheme, the dot-separated hostname
    labels, the slash-separated path segments, and the names of the query
    parameters (parameter values are ignored).

    Empty tokens are discarded, so a bare ``/`` path or a doubled slash such
    as ``/a//b`` does not contribute ``''`` entries.

    Args:
        url: URL string to tokenize.

    Returns:
        list[str]: The non-empty tokens, in order of appearance.
    """
    parsed = urlparse(url)
    tokens = []
    if parsed.scheme:
        tokens.append(parsed.scheme)
    if parsed.hostname:
        tokens += parsed.hostname.split('.')
    if parsed.path:
        tokens += parsed.path.strip("/").split("/")
    if parsed.query:
        tokens += list(parse_qs(parsed.query).keys())
    # str.split returns [''] for an empty string and '' between adjacent
    # separators; such tokens carry no information, so filter them out.
    return [tok for tok in tokens if tok]

def build_vocab(df, max_vocab_size=5000):
    """Build a token -> integer-id vocabulary from the URLs in ``df['text']``.

    Ids 0 and 1 are reserved for ``'<PAD>'`` and ``'<UNK>'``. The
    ``max_vocab_size`` most frequent tokens then receive ids 2, 3, ... in
    descending order of frequency.

    Args:
        df: Object whose ``'text'`` entry iterates over URL strings.
        max_vocab_size: Maximum number of corpus tokens to keep (the two
            reserved entries are not counted against this limit).

    Returns:
        dict: Mapping from token string to integer id.
    """
    frequencies = Counter()
    for url in df['text']:
        frequencies.update(tokenize_url(url))

    vocab = {'<PAD>': 0, '<UNK>': 1}
    ranked_tokens = (tok for tok, _count in frequencies.most_common(max_vocab_size))
    vocab.update((tok, idx) for idx, tok in enumerate(ranked_tokens, start=2))
    return vocab

In [None]:
# -------------------- URL to Graph Conversion --------------------
def url_to_graph(url, label, vocab, max_nodes=30):
    """Convert one URL into a chain-shaped PyG ``Data`` graph.

    The URL is tokenized, truncated to ``max_nodes`` tokens, and each token
    becomes one node whose feature is its vocabulary id (``'<UNK>'`` id for
    unknown tokens). Consecutive tokens are linked by a directed edge
    ``i -> i + 1``.

    Args:
        url: URL string.
        label: Class label; stored as a float tensor of shape ``[1]``.
        vocab: Mapping from token to integer id; must contain ``'<UNK>'``.
        max_nodes: Maximum number of tokens (nodes) kept per URL.

    Returns:
        A ``Data`` object with ``x`` of shape ``[num_nodes, 1]`` (long),
        ``edge_index`` of shape ``[2, num_nodes - 1]`` (long) and ``y``; or
        ``None`` when the URL yields fewer than two tokens (no edge possible).
    """
    tokens = tokenize_url(url)[:max_nodes]
    node_ids = [vocab.get(tok, vocab['<UNK>']) for tok in tokens]
    num_nodes = len(node_ids)
    if num_nodes < 2:
        return None
    # The pair list is [num_edges, 2]; PyG wants [2, num_edges]. Transposing
    # only produces a strided view, so make it contiguous as the PyG docs
    # recommend before handing it to Data.
    edge_index = torch.tensor(
        [[i, i + 1] for i in range(num_nodes - 1)], dtype=torch.long
    ).t().contiguous()
    x = torch.tensor(node_ids, dtype=torch.long).unsqueeze(1)
    return Data(x=x, edge_index=edge_index, y=torch.tensor([label], dtype=torch.float))
# -------------------- PyG Dataset --------------------
class URLGraphDataset(InMemoryDataset):
    """In-memory PyG dataset holding one graph per usable URL in a DataFrame.

    Each ``(text, label)`` row is converted with ``url_to_graph``; rows for
    which that function returns ``None`` (URLs with fewer than two tokens)
    are skipped, so the dataset may be shorter than ``df``.

    Args:
        df: DataFrame with a ``'text'`` column (URL strings) and a ``'label'``
            column.
        vocab: Token -> id mapping forwarded to ``url_to_graph``.
    """

    def __init__(self, df, vocab):
        self.df = df.reset_index(drop=True)
        self.vocab = vocab
        super().__init__('.')
        # Build every graph exactly once, then filter on the explicit None
        # sentinel rather than on the truthiness of the Data object.
        candidates = (
            url_to_graph(url, label, vocab)
            for url, label in zip(df['text'], df['label'])
        )
        data_list = [graph for graph in candidates if graph is not None]
        self.data, self.slices = self.collate(data_list)

In [None]:
# -------------------- GCN Model --------------------
class URLGNN(nn.Module):
    """Two-layer GCN binary classifier over URL token graphs.

    Pipeline: token-id embedding -> GCNConv(embed_dim, 128) + ReLU ->
    GCNConv(128, 64) + ReLU -> per-graph mean pooling -> dropout(0.3) ->
    linear(64, 1) -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Dimensionality of the token embeddings.
    """

    def __init__(self, vocab_size, embed_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = GCNConv(embed_dim, 128)
        self.conv2 = GCNConv(128, 64)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(64, 1)

    def forward(self, data):
        """Return one probability per graph in the batch.

        Args:
            data: Batch exposing ``x`` (``[num_nodes, 1]`` token ids),
                ``edge_index`` and ``batch`` (node -> graph assignment).

        Returns:
            Tensor of shape ``[num_graphs]`` with values in (0, 1).
        """
        x = self.embedding(data.x.squeeze(1))
        x = torch.relu(self.conv1(x, data.edge_index))
        x = torch.relu(self.conv2(x, data.edge_index))
        x = global_mean_pool(x, data.batch)
        x = self.dropout(x)
        # Squeeze only the trailing feature dimension. A bare .squeeze() would
        # also drop the batch dimension for a single-graph batch, producing a
        # 0-d tensor that no longer matches the shape-[1] target.
        return torch.sigmoid(self.fc(x)).squeeze(-1)


In [None]:
# -------------------- Train Function with TQDM + EarlyStopping --------------------
def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler=None, epochs=30, patience=5,
                checkpoint_path="best_gnn_model.pt"):
    """Train ``model`` with per-epoch validation, LR scheduling and early stopping.

    After every epoch the validation F1 score is computed. Whenever it
    improves, a copy of the weights is kept (and written to
    ``checkpoint_path``). Training stops early once ``patience`` consecutive
    epochs pass without improvement. The best weights are loaded back into
    ``model`` before returning.

    Args:
        model: Module called as ``model(batch)`` and returning one probability
            per graph.
        train_loader: Iterable of training batches exposing ``to``, ``y`` and
            ``num_graphs``.
        val_loader: Iterable of validation batches; must also expose
            ``dataset`` (its length normalizes the validation loss).
        optimizer: Optimizer providing ``zero_grad`` and ``step``.
        criterion: Loss called as ``criterion(probabilities, targets)``.
        scheduler: Optional scheduler; ``scheduler.step(val_f1)`` is called
            once per epoch, so it must accept a metric argument.
        epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
        checkpoint_path: Where the best weights are saved on each
            improvement; pass ``None`` to skip writing a file.

    Returns:
        tuple: ``(model, history)``. ``history`` holds per-epoch lists
        ``train_loss``, ``val_loss``, ``train_acc``, ``val_acc`` and the
        validation ``val_preds`` / ``val_probs`` / ``val_labels`` of the best
        epoch (of the last epoch if F1 never rose above 0), so they describe
        the weights that are actually returned.
    """
    # Run on whatever device the model already lives on instead of depending
    # on a notebook-level ``device`` global being defined beforehand.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    def _snapshot_weights():
        # state_dict() returns references to the live tensors, which the
        # optimizer keeps updating in place; clone them to freeze a copy.
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    best_model_wts = _snapshot_weights()
    best_val_f1 = 0
    best_val_outputs = None
    patience_counter = 0
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': [], 'val_preds': [], 'val_probs': [], 'val_labels': []}

    for epoch in range(epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            batch = batch.to(run_device)
            optimizer.zero_grad()
            # Flatten both sides so a single-graph batch still yields
            # matching 1-D shapes for the loss and the accuracy count.
            out = model(batch).reshape(-1)
            target = batch.y.reshape(-1)
            loss = criterion(out, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * batch.num_graphs
            preds = (out > 0.5).int()
            correct += (preds == target.int()).sum().item()
            total += batch.num_graphs
        train_loss = total_loss / total
        train_acc = correct / total
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        # Validation
        model.eval()
        val_loss, val_preds, val_probs, val_labels = 0, [], [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                batch = batch.to(run_device)
                out = model(batch).reshape(-1)
                target = batch.y.reshape(-1)
                loss = criterion(out, target)
                val_loss += loss.item() * batch.num_graphs
                val_probs += out.cpu().tolist()
                val_preds += (out > 0.5).int().cpu().tolist()
                val_labels += target.int().cpu().tolist()
        val_loss /= len(val_loader.dataset)
        val_acc = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds)

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['val_preds'] = val_preds
        history['val_probs'] = val_probs
        history['val_labels'] = val_labels

        print(f"\nEpoch {epoch+1}: Train Loss={train_loss:.4f}, Acc={train_acc:.4f} | Val Loss={val_loss:.4f}, Acc={val_acc:.4f}, F1={val_f1:.4f}")

        if scheduler:
            scheduler.step(val_f1)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_wts = _snapshot_weights()
            best_val_outputs = (val_preds, val_probs, val_labels)
            patience_counter = 0
            if checkpoint_path:
                torch.save(best_model_wts, checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("\n⛔ Early stopping triggered.")
                break

    model.load_state_dict(best_model_wts)
    if best_val_outputs is not None:
        # Report the validation outputs that belong to the restored weights.
        history['val_preds'], history['val_probs'], history['val_labels'] = best_val_outputs
    return model, history


In [None]:
# -------------------- Plotting Functions --------------------
def plot_metrics(history):
    """Draw loss and accuracy curves side by side for train and validation.

    Args:
        history: Mapping with per-epoch sequences under the keys
            ``'train_loss'``, ``'val_loss'``, ``'train_acc'`` and ``'val_acc'``.
    """
    # (panel title, y-axis label, ((history key, legend label), ...))
    panels = (
        ('Loss Over Epochs', 'Loss', (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss'))),
        ('Accuracy Over Epochs', 'Accuracy', (('train_acc', 'Train Acc'), ('val_acc', 'Val Acc'))),
    )

    plt.figure(figsize=(12, 5))
    for position, (title, y_label, curves) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        for key, legend_label in curves:
            plt.plot(history[key], label=legend_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(y_true, y_pred):
    """Render the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Rows are the actual classes and columns the predicted ones; both axes are
    labelled ``Legit`` / ``Phish``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    class_names = ['Legit', 'Phish']
    counts = confusion_matrix(y_true, y_pred)
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

def plot_roc_curve(y_true, y_probs):
    """Plot the ROC curve for the given scores, with its AUC in the legend.

    A dashed gray diagonal is drawn as the chance-level reference.

    Args:
        y_true: Ground-truth binary labels.
        y_probs: Predicted scores/probabilities for the positive class.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_probs)
    area = auc(false_pos_rate, true_pos_rate)

    plt.plot(false_pos_rate, true_pos_rate, label=f"AUC = {area:.2f}")
    # Chance-level reference line.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# -------------------- Main Pipeline --------------------
# Tunable settings, gathered in one place so a reader can adjust them.
DATA_PATH = r"E:\Phising_detection\dataset\urls\urls.json"
SEED = 42
VAL_FRACTION = 0.2
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
MAX_EPOCHS = 30
EARLY_STOP_PATIENCE = 5

# Seed PyTorch's RNG so that runs are repeatable (the train/validation split
# below is already seeded through random_state).
torch.manual_seed(SEED)

# Expected columns: text (the URL string) and label
# (presumably 0 = legitimate, 1 = phishing, matching the confusion-matrix
# tick labels -- confirm against the dataset).
df = pd.read_json(DATA_PATH)
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

train_df, val_df = train_test_split(df, test_size=VAL_FRACTION, stratify=df['label'], random_state=SEED)
print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")
# The vocabulary is built from the training split only, so validation tokens
# that were never seen in training map to <UNK>.
vocab = build_vocab(train_df)
print(f"Vocabulary size: {len(vocab)}")
train_data = URLGraphDataset(train_df, vocab)
val_data = URLGraphDataset(val_df, vocab)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = URLGNN(vocab_size=len(vocab)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# mode='max' because the scheduler is stepped with the validation F1 score.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5)
criterion = nn.BCELoss()

model, history = train_model(model, train_loader, val_loader, optimizer, criterion, scheduler=scheduler, epochs=MAX_EPOCHS, patience=EARLY_STOP_PATIENCE)

plot_metrics(history)
plot_confusion_matrix(history['val_labels'], history['val_preds'])
plot_roc_curve(history['val_labels'], history['val_probs'])

# Persist the returned weights together with the vocabulary they were trained with.
torch.save(model.state_dict(), "best_gnn_model.pt")
with open("vocab.json", "w") as f:
    json.dump(vocab, f)