In [None]:
import string
import torch
import numpy as np

# Character vocabulary: ASCII letters, digits and punctuation (whitespace is not included)
CHAR_VOCAB = [*string.ascii_letters, *string.digits, *string.punctuation]
# Characters are numbered from 1 so that index 0 stays free for padding
char2idx = dict(zip(CHAR_VOCAB, range(1, len(CHAR_VOCAB) + 1)))
vocab_size = 1 + len(char2idx)  # +1 accounts for the reserved padding index
max_len = 200  # URLs are padded or truncated to this many characters
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def encode_url(url, max_len=max_len):
    """Turn a URL string into a fixed-length list of character indices.

    The URL is cut to at most ``max_len`` characters and right-padded with
    spaces up to ``max_len``. Each character is then looked up in
    ``char2idx``; any character without an entry is encoded as 0.

    Args:
        url: The URL text to encode.
        max_len: Target sequence length (defaults to the module-level
            ``max_len`` captured when this function was defined).

    Returns:
        A list of integer indices, one per character of the padded URL.
    """
    clipped = url[:max_len]
    padded = clipped.ljust(max_len)  # right-pad with spaces
    lookup = char2idx.get
    return [lookup(ch, 0) for ch in padded]  # characters missing from the vocab become 0


In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class URLDataset(Dataset):
    """Dataset pairing each URL string with its label.

    Reads the ``'text'`` and ``'label'`` columns of the given DataFrame.
    """

    def __init__(self, df):
        """Store the URL texts (cast to ``str``) and the labels as arrays."""
        text_column = df['text']
        label_column = df['label']
        self.urls = text_column.astype(str).values
        self.labels = label_column.values

    def __len__(self):
        """Return the number of stored URLs."""
        return len(self.urls)

    def __getitem__(self, idx):
        """Return ``(encoded_url, label)`` for the sample at position ``idx``.

        The URL is encoded with ``encode_url`` and returned as a ``long``
        tensor; the label is returned as a ``float`` tensor.
        """
        token_ids = encode_url(self.urls[idx])
        label = self.labels[idx]
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(label, dtype=torch.float)
        return features, target


In [None]:
import torch.nn as nn

class CharCNN(nn.Module):
    """Character-level 1-D CNN that outputs one sigmoid probability per sequence.

    Pipeline: embedding -> Conv1d(kernel 5) + ReLU -> global max pool over the
    sequence -> dropout -> linear layer to a single unit -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table (index 0 is the
            padding index).
        embed_dim: Size of each character embedding.
    """

    def __init__(self, vocab_size, embed_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # padding=2 with kernel_size=5 keeps the sequence length unchanged
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=5, padding=2)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)  # max over the whole sequence

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Compute probabilities for a batch of index sequences.

        Args:
            x: Integer tensor of character indices, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch,) with values in [0, 1].
        """
        x = self.embedding(x).permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        x = self.relu(self.conv1(x))
        x = self.pool(x).squeeze(-1)  # (batch, 128)
        x = self.dropout(x)
        # Squeeze only the trailing unit dimension: a bare squeeze() would also
        # drop the batch dimension for a single-sample batch and yield a 0-d
        # tensor that no longer matches a (1,) target.
        return self.sigmoid(self.fc(x)).squeeze(-1)


In [None]:
from tqdm import tqdm
from sklearn.metrics import f1_score
import torch.optim as optim

def train_model(model, train_loader, val_loader, epochs=20, patience=3, lr=1e-3,
                checkpoint_path="best_charcnn.pt"):
    """Train a binary classifier with early stopping on validation F1.

    Uses ``BCELoss`` and Adam, halves the learning rate when validation F1
    plateaus, and saves the model's ``state_dict`` to ``checkpoint_path``
    every time validation F1 improves. Batches are moved to the module-level
    ``device``; the model is expected to already live there.

    Args:
        model: Module returning one probability per sample.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without a validation-F1
            improvement after which training stops.
        lr: Initial Adam learning rate.
        checkpoint_path: File the best ``state_dict`` is written to.

    Returns:
        dict with per-epoch lists ``train_loss``, ``val_loss``, ``train_acc``,
        ``val_acc``, ``f1`` (training F1) and ``val_f1``, plus ``val_y_true``,
        ``val_y_pred`` and ``val_y_prob`` holding the validation targets,
        thresholded predictions and probabilities of the last epoch that ran.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # mode='max' because the monitored quantity (validation F1) should increase
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1)

    best_f1 = 0
    patience_counter = 0

    history = {
        'train_loss': [], 'val_loss': [],
        'train_acc': [], 'val_acc': [],
        'f1': [], 'val_f1': [],
        'val_y_true': [], 'val_y_pred': [], 'val_y_prob': []
    }

    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0
        train_preds, train_targets = [], []

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]", leave=False)
        for x, y in loop:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            # Flatten to (batch,): a model that squeezes every unit dimension
            # returns a 0-d tensor for a single-sample batch, which would not
            # match the (1,) target and would break the list bookkeeping below.
            out = model(x).reshape(-1)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

            # loss is a batch mean; weight by batch size for an exact epoch mean
            train_loss += loss.item() * y.size(0)
            preds = (out > 0.5).float()
            train_correct += (preds == y).sum().item()
            train_total += y.size(0)

            train_preds += preds.cpu().numpy().tolist()
            train_targets += y.cpu().numpy().tolist()

            loop.set_postfix(loss=loss.item())

        train_loss /= train_total
        train_acc = train_correct / train_total
        train_f1 = f1_score(train_targets, train_preds)

        # ---- Validation pass ----
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        y_true, y_pred, y_prob = [], [], []

        loop = tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]", leave=False)
        with torch.no_grad():
            for x, y in loop:
                x, y = x.to(device), y.to(device)
                out = model(x).reshape(-1)  # same single-sample guard as in training
                loss = criterion(out, y)

                val_loss += loss.item() * y.size(0)
                probs = out.cpu()
                preds = (probs > 0.5).float()

                val_correct += (preds == y.cpu()).sum().item()
                val_total += y.size(0)

                y_prob += probs.numpy().tolist()
                y_pred += preds.numpy().tolist()
                y_true += y.cpu().numpy().tolist()

        val_loss /= val_total
        val_acc = val_correct / val_total
        val_f1 = f1_score(y_true, y_pred)

        # Save history
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)
        history['f1'].append(train_f1)
        history['val_f1'].append(val_f1)
        # These three are overwritten each epoch, so they end up describing
        # the last epoch that ran (not necessarily the best one).
        history['val_y_true'] = y_true
        history['val_y_pred'] = y_pred
        history['val_y_prob'] = y_prob

        print(f"\nEpoch {epoch+1}: "
              f"Train Loss={train_loss:.4f}, Acc={train_acc:.4f}, F1={train_f1:.4f} | "
              f"Val Loss={val_loss:.4f}, Acc={val_acc:.4f}, F1={val_f1:.4f}")

        scheduler.step(val_f1)

        if val_f1 > best_f1:
            # New best validation F1: checkpoint the weights and reset patience
            best_f1 = val_f1
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    return history


In [None]:
from matplotlib import pyplot as plt
def plot_loss(history):
    """Plot the per-epoch training and validation loss stored in ``history``."""
    curves = (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss'))
    for key, legend_name in curves:
        plt.plot(history[key], label=legend_name)
    plt.title("Training vs Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_accuracy(history):
    """Plot the per-epoch training and validation accuracy stored in ``history``."""
    curves = (('train_acc', 'Train Accuracy'), ('val_acc', 'Val Accuracy'))
    for key, legend_name in curves:
        plt.plot(history[key], label=legend_name)
    plt.title("Training vs Validation Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
def plot_f1_curve(history):
    """Plot the per-epoch training (``'f1'``) and validation (``'val_f1'``) F1 scores."""
    curves = (('f1', 'Train F1'), ('val_f1', 'Val F1'))
    for key, legend_name in curves:
        plt.plot(history[key], label=legend_name)
    plt.title("Training vs Validation F1 Score")
    plt.xlabel("Epoch")
    plt.ylabel("F1 Score")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
def plot_confusion_matrix(y_true, y_pred):
    """Show the confusion matrix of ``y_true`` vs ``y_pred`` as an annotated heatmap.

    Both axes are ticked 'Legit' then 'Phishing'.
    NOTE(review): this presumes the lower label value means legitimate and the
    higher one phishing - confirm against the dataset's label encoding.
    """
    class_names = ['Legit', 'Phishing']
    counts = confusion_matrix(y_true, y_pred)
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',  # integer cell annotations
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
def plot_roc_curve(y_true, y_prob):
    """Plot the ROC curve for scores ``y_prob`` against ``y_true``, with its AUC in the legend."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_prob)
    area = auc(false_pos_rate, true_pos_rate)
    legend_text = f"AUC = {area:.2f}"
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)
    # Dashed diagonal as the reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Load the labelled URLs and hold out 20% for validation, stratified by label
df = pd.read_json(r"E:\Phising_detection\dataset\urls\urls.json")
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Seed PyTorch so weight initialisation and batch shuffling start from a fixed
# state on every run (the split above is already fixed via random_state)
torch.manual_seed(42)

# Create datasets and loaders
train_set = URLDataset(train_df)
val_set = URLDataset(val_df)

train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = DataLoader(val_set, batch_size=64)

# Build the model on the GPU when available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CharCNN(vocab_size).to(device)

# Train exactly once and keep the returned history. A second call would
# continue from the already-trained weights, so its history would not describe
# training from scratch and the run would take twice as long.
history = train_model(model, train_loader, val_loader)

# Plot all metrics
plot_loss(history)
plot_accuracy(history)
plot_f1_curve(history)
plot_confusion_matrix(history['val_y_true'], history['val_y_pred'])
plot_roc_curve(history['val_y_true'], history['val_y_prob'])
