In [9]:
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from tqdm import tqdm
import numpy as np

# --------------------- Preprocessing ---------------------
# Load the small English spaCy pipeline with the NER and parser components
# disabled; preprocess() below only reads lemmas, is_alpha and is_stop.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
# Raise spaCy's per-document character limit.
# NOTE(review): presumably needed because some email bodies are very long — confirm.
nlp.max_length = 3_000_000  # Increase limit to 3 million characters
def clean_text(text):
    """Return ``text`` as a single-line string with normalised whitespace.

    The input is coerced with ``str()``, round-tripped through UTF-8 with
    un-encodable characters dropped, and every run of whitespace (including
    line breaks and tabs) is reduced to one space. Leading and trailing
    whitespace is stripped from the result.
    """
    # Round-trip through UTF-8, silently discarding anything that cannot be
    # encoded (for example lone surrogate code points).
    raw_bytes = str(text).encode("utf-8", errors="ignore")
    decoded = raw_bytes.decode("utf-8", errors="ignore")

    # Line breaks and tabs become spaces first, then whitespace runs collapse.
    single_line = re.sub(r"[\r\n\t]", " ", decoded)
    collapsed = re.sub(r"\s+", " ", single_line)
    return collapsed.strip()

def preprocess(text):
    """Turn one email field into a list of content lemmas.

    The text is cleaned with ``clean_text``, lower-cased and run through the
    module-level spaCy pipeline ``nlp``. A token contributes its lemma only
    when it is purely alphabetic, is not a stop word and is longer than one
    character.
    """
    doc = nlp(clean_text(text).lower())

    lemmas = []
    for token in doc:
        # Skip punctuation/numbers, stop words and single-character tokens.
        if not token.is_alpha or token.is_stop or len(token) <= 1:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# --------------------- Dataset ---------------------
class EmailDataset(Dataset):
    """Dataset of ``(subject_ids, body_ids, label)`` triples for email classification.

    Every subject and body is tokenised with the module-level ``preprocess``
    function, mapped to integer ids through a vocabulary and truncated or
    right-padded to exactly ``max_len`` ids.

    Args:
        subjects: Iterable of subject texts.
        bodies: Iterable of body texts, aligned with ``subjects``.
        labels: Array-like of numeric labels (pandas Series, list, ndarray, ...),
            aligned with ``subjects``. Stored as a float tensor.
        vocab: Optional token -> id mapping to reuse (e.g. the training
            vocabulary for a validation split). It must contain ``"<PAD>"``
            and ``"<UNK>"``. When ``None`` a vocabulary is built from this
            dataset's own tokens.
        max_len: Fixed length of every encoded sequence.

    Raises:
        ValueError: If ``subjects``, ``bodies`` and ``labels`` differ in length.
    """

    def __init__(self, subjects, bodies, labels, vocab=None, max_len=200):
        self.max_len = max_len
        self.subj_tokens = [preprocess(t) for t in subjects]
        self.body_tokens = [preprocess(t) for t in bodies]
        # np.asarray accepts any array-like, not only objects with `.values`.
        self.labels = torch.tensor(np.asarray(labels, dtype=np.float32), dtype=torch.float)

        # Misaligned inputs would otherwise pair texts with the wrong labels
        # or fail much later inside __getitem__.
        n_subj, n_body, n_lab = len(self.subj_tokens), len(self.body_tokens), len(self.labels)
        if not (n_subj == n_body == n_lab):
            raise ValueError(
                f"subjects, bodies and labels must have the same length, "
                f"got {n_subj}, {n_body} and {n_lab}"
            )

        if vocab is None:
            self.build_vocab()
        else:
            self.vocab = vocab
        self.subj_encoded = [self.encode(seq) for seq in self.subj_tokens]
        self.body_encoded = [self.encode(seq) for seq in self.body_tokens]

    def build_vocab(self):
        """Build ``self.vocab`` from the subject tokens, then the body tokens.

        Ids 0 and 1 are reserved for ``"<PAD>"`` and ``"<UNK>"``; every other
        token receives the next free id in order of first appearance.
        """
        vocab = {"<PAD>": 0, "<UNK>": 1}
        # Walk both collections in turn instead of concatenating them, which
        # would copy the whole list of token lists.
        for token_lists in (self.subj_tokens, self.body_tokens):
            for tokens in token_lists:
                for t in tokens:
                    if t not in vocab:
                        vocab[t] = len(vocab)
        self.vocab = vocab

    def encode(self, tokens):
        """Map ``tokens`` to a 1-D long tensor of exactly ``max_len`` ids.

        Tokens missing from the vocabulary map to ``"<UNK>"``; sequences longer
        than ``max_len`` are truncated and shorter ones are right-padded with
        ``"<PAD>"``.
        """
        pad_id = self.vocab["<PAD>"]
        unk_id = self.vocab["<UNK>"]
        ids = [self.vocab.get(t, unk_id) for t in tokens[:self.max_len]]
        ids.extend([pad_id] * (self.max_len - len(ids)))
        # Explicit dtype: an empty id list would otherwise be inferred as float.
        return torch.tensor(ids, dtype=torch.long)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(subject_ids, body_ids, label)`` for example ``idx``."""
        return self.subj_encoded[idx], self.body_encoded[idx], self.labels[idx]

# --------------------- BiLSTM Model ---------------------
class BiLSTMClassifier(nn.Module):
    """Two-branch bidirectional LSTM binary classifier for emails.

    Subject and body id sequences share one embedding table but are encoded by
    separate single-layer bidirectional LSTMs. The final forward and backward
    hidden states of both branches are concatenated (``4 * hidden_dim``
    features), passed through dropout and a linear layer, and squashed with a
    sigmoid.

    Note: sequences are not packed, so both LSTMs also step over padded
    positions (whose embeddings are fixed by ``padding_idx=0``).

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimensionality.
        hidden_dim: Hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm_subj = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.lstm_body = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        # Two branches x two directions -> 4 * hidden_dim input features.
        self.fc = nn.Linear(hidden_dim * 4, 1)

    def forward(self, subj, body):
        """Return one probability per example.

        Args:
            subj: Long tensor of subject token ids, shape ``(batch, subj_len)``.
            body: Long tensor of body token ids, shape ``(batch, body_len)``.

        Returns:
            Float tensor of shape ``(batch,)`` with values in ``[0, 1]``. The
            shape is 1-D even when ``batch == 1``.
        """
        subj_embed = self.embedding(subj)
        body_embed = self.embedding(body)

        _, (subj_h_n, _) = self.lstm_subj(subj_embed)
        _, (body_h_n, _) = self.lstm_body(body_embed)

        # h_n holds one final state per direction; the last two entries are the
        # forward and backward states of the (only) layer.
        subj_h = torch.cat((subj_h_n[-2], subj_h_n[-1]), dim=1)
        body_h = torch.cat((body_h_n[-2], body_h_n[-1]), dim=1)
        h = torch.cat((subj_h, body_h), dim=1)
        h = self.dropout(h)
        # Squeeze only the feature axis: a bare squeeze() would also drop the
        # batch axis for a single-sample batch and yield a 0-d tensor that no
        # longer matches the target shape.
        return torch.sigmoid(self.fc(h)).squeeze(-1)

# --------------------- Train Function ---------------------
def _binary_f1(y_true, y_pred):
    """Return the F1 score of the positive class (label 1), or 0.0 if undefined."""
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    tp = int(np.sum((y_true == 1) & (y_pred == 1)))
    fp = int(np.sum((y_true != 1) & (y_pred == 1)))
    fn = int(np.sum((y_true == 1) & (y_pred != 1)))
    denom = 2 * tp + fp + fn
    return 2 * tp / denom if denom else 0.0


def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10):
    """Train ``model`` and checkpoint the weights with the best validation F1.

    Each epoch runs one pass over ``train_loader``, then evaluates on
    ``val_loader`` (predictions are thresholded at 0.5), records the metrics
    and steps ``scheduler`` with the validation F1.

    Args:
        model: Module called as ``model(subj, body)`` that returns one
            probability per example.
        train_loader: Sized iterable of ``(subj, body, y)`` training batches.
        val_loader: Sized iterable of ``(subj, body, y)`` validation batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(out, y)``.
        scheduler: Scheduler whose ``step`` takes a metric value.
        device: Device the batches are moved to.
        epochs: Number of epochs to run.

    Returns:
        ``(model, history)``. ``model`` holds the weights of the *last* epoch;
        ``history`` maps ``"train_loss"``, ``"val_loss"``, ``"val_acc"`` and
        ``"val_f1"`` to per-epoch lists (losses are means over batches).

    Side effects:
        Writes the best-F1 ``state_dict`` to ``best_bilstm_email_model.pt``.
    """
    import copy

    best_model = None
    # Start below any reachable F1 so the first epoch is always checkpointed,
    # even when its F1 is exactly 0.
    best_f1 = float("-inf")
    history = {"train_loss": [], "val_loss": [], "val_acc": [], "val_f1": []}

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for subj, body, y in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            subj, body, y = subj.to(device), body.to(device), y.to(device)
            optimizer.zero_grad()
            # Flatten so a single-sample batch stays 1-D and matches `y`.
            out = model(subj, body).reshape(-1)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        history["train_loss"].append(total_loss / max(len(train_loader), 1))

        model.eval()
        val_loss = 0.0
        preds, labels = [], []
        with torch.no_grad():
            for subj, body, y in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                subj, body, y = subj.to(device), body.to(device), y.to(device)
                out = model(subj, body).reshape(-1)
                loss = criterion(out, y)
                val_loss += loss.item()
                preds.extend((out > 0.5).int().cpu().numpy())
                labels.extend(y.cpu().numpy())

        history["val_loss"].append(val_loss / max(len(val_loader), 1))
        acc = float(np.mean(np.array(preds) == np.array(labels))) if preds else 0.0
        # Computed directly instead of via classification_report(...)['1']:
        # the report's keys are derived from the label values, so float targets
        # can produce '1.0' (or no positive-class entry at all) and the lookup
        # would raise KeyError.
        f1 = _binary_f1(labels, preds)
        history["val_acc"].append(acc)
        history["val_f1"].append(f1)

        # Report the mean validation loss so it is comparable to the train loss.
        print(f"Epoch {epoch+1}: Train Loss={history['train_loss'][-1]:.4f} | Val Loss={history['val_loss'][-1]:.4f}, Acc={acc:.4f}, F1={f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            # state_dict() returns references to the live parameters; without a
            # copy, later epochs would overwrite the "best" weights in place.
            best_model = copy.deepcopy(model.state_dict())

        scheduler.step(f1)

    if best_model is None:
        # No epoch ran (epochs == 0): save the current weights rather than None.
        best_model = copy.deepcopy(model.state_dict())
    torch.save(best_model, "best_bilstm_email_model.pt")
    return model, history

In [10]:
import csv
# Raise the csv module's maximum field size to 2**31 - 1 (a large but safe value).
# NOTE(review): presumably needed because the dataset is read with
# engine='python' and some email bodies exceed the default limit — confirm.
csv.field_size_limit(2**31 - 1)

2147483647

In [11]:
# --------------------- Main ---------------------
if __name__ == '__main__':
    # Seed torch so weight initialisation, dropout and DataLoader shuffling are
    # repeatable; 42 matches the random_state of the split below.
    torch.manual_seed(42)

    # Load the data and drop incomplete rows into a new frame rather than
    # mutating in place, so re-running this cell always starts from the file.
    df = pd.read_csv(r"E:\Phising_detection\dataset\emails\TREC_05.csv",engine='python').dropna(
        subset=['subject', 'body', 'label']
    )
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

    # The validation split reuses the training vocabulary so unseen tokens map
    # to <UNK> instead of leaking validation words into the model's vocabulary.
    train_ds = EmailDataset(train_df['subject'], train_df['body'], train_df['label'])
    val_ds = EmailDataset(val_df['subject'], val_df['body'], val_df['label'], vocab=train_ds.vocab)
    print(f"Vocabulary size: {len(train_ds.vocab)}")

    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=64)
    print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiLSTMClassifier(vocab_size=len(train_ds.vocab)).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    # mode='max' because the scheduler is stepped with the validation F1.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2)
    criterion = nn.BCELoss()

    model, history = train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device)

    # Plot Loss/Accuracy
    fig, ax = plt.subplots()
    ax.plot(history['train_loss'], label='Train Loss')
    ax.plot(history['val_loss'], label='Val Loss')
    ax.set(title="Loss over Epochs", xlabel="Epoch", ylabel="Loss")
    ax.legend()
    plt.show()

    fig, ax = plt.subplots()
    ax.plot(history['val_acc'], label='Val Accuracy')
    ax.plot(history['val_f1'], label='Val F1 Score')
    ax.set(title="Validation Metrics", xlabel="Epoch", ylabel="Score")
    ax.legend()
    plt.show()


Training samples: 43164, Validation samples: 10791
Vocabulary size: 128601
Train batches: 675, Val batches: 169


Epoch 1 [Train]:  28%|██▊       | 186/675 [18:02<47:26,  5.82s/it] 


KeyboardInterrupt: 