In [1]:

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


## 🧰 Environment & Dependencies

To ensure this notebook runs as intended, install the following packages with the specified versions (GPU recommended for training):

```bash
transformers==4.39.3
torch==2.2.1
scikit-learn==1.4.1
pandas==2.2.1
numpy==1.26.4
tqdm==4.66.1
```

If using a virtual environment:
```bash
pip install transformers==4.39.3 torch==2.2.1 scikit-learn==1.4.1 pandas==2.2.1 numpy==1.26.4 tqdm==4.66.1
```

Ensure your runtime supports CUDA (e.g., RTX 2060 or higher) for optimal training performance.


# This notebook is meant to train and save the BERT model on the phishing email dataset.
# It should be run once in a GPU environment. The saved model will be used for testing later. I have provided the file of the fine tuned model for testing, as this can be computationally intesive and time consuming


# ## 1. Load Dataset

In [2]:
import torch
torch.cuda.empty_cache()

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

df = pd.read_csv("../data/processed/CEAS_08_feature_engineered.csv")
texts = df["body"].fillna("").tolist()
labels = df["label"].tolist()


# ## 2. Split into Train and Validation Sets

We will split the data into training and testing sets.
- 80% of the emails will be used for training the BERT model.
- 20% of the emails will be used for final evaluation.
Stratified sampling will be used to ensure both classes (phishing and legitimate) are represented proportionally in both sets.


In [4]:
# Perform train/test split
from sklearn.model_selection import train_test_split
# Use a smaller subset for faster experimentation
texts_subset, _, labels_subset, _ = train_test_split(texts, labels, train_size=0.10, random_state=42)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts_subset, labels_subset, test_size=0.2, random_state=42
)


# ## 3. Load Tokenizer and Tokenize Data


In [5]:
from transformers import BertTokenizer

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)




# ## 4. Create PyTorch Dataset


In [6]:
from transformers import BertForSequenceClassification

# Load pre-trained BERT model for sequence classification (binary classification)
import torch

class EmailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()} | {"labels": torch.tensor(self.labels[idx])}

    def __len__(self):
        return len(self.labels)

train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

# ## 5. Load BERT Model for Classification


In [7]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# ## 6. Define Training Arguments

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="../models/Bert_finetuned",
    num_train_epochs=1,  # Only 1 epoch
    per_device_train_batch_size=8,  # Reduce batch size
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=50,
)

# ## 7. Train the Model

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


trainer.train()

  0%|          | 0/392 [00:00<?, ?it/s]

{'loss': 0.0211, 'grad_norm': 0.0012361473636701703, 'learning_rate': 4.362244897959184e-05, 'epoch': 0.13}
{'loss': 0.0706, 'grad_norm': 0.0011837815400213003, 'learning_rate': 3.724489795918368e-05, 'epoch': 0.26}
{'loss': 0.0009, 'grad_norm': 0.005225313827395439, 'learning_rate': 3.086734693877551e-05, 'epoch': 0.38}
{'loss': 0.0216, 'grad_norm': 0.0013830693205818534, 'learning_rate': 2.448979591836735e-05, 'epoch': 0.51}
{'loss': 0.0167, 'grad_norm': 0.025902891531586647, 'learning_rate': 1.8112244897959187e-05, 'epoch': 0.64}
{'loss': 0.0797, 'grad_norm': 0.001805033884011209, 'learning_rate': 1.1734693877551021e-05, 'epoch': 0.77}
{'loss': 0.0392, 'grad_norm': 0.003276694566011429, 'learning_rate': 5.357142857142857e-06, 'epoch': 0.89}


  0%|          | 0/49 [00:00<?, ?it/s]

{'eval_loss': 0.03711656108498573, 'eval_accuracy': 0.9936143039591315, 'eval_f1': 0.9940688018979834, 'eval_runtime': 31.2241, 'eval_samples_per_second': 25.077, 'eval_steps_per_second': 1.569, 'epoch': 1.0}
{'train_runtime': 2810.9209, 'train_samples_per_second': 1.114, 'train_steps_per_second': 0.139, 'train_loss': 0.03447864892683467, 'epoch': 1.0}


TrainOutput(global_step=392, training_loss=0.03447864892683467, metrics={'train_runtime': 2810.9209, 'train_samples_per_second': 1.114, 'train_steps_per_second': 0.139, 'train_loss': 0.03447864892683467, 'epoch': 1.0})

# ## 8. Save Model and Tokenizer

In [11]:
model.save_pretrained("../models/Bert_finetuned")
tokenizer.save_pretrained("../models/Bert_finetuned")

('../models/Bert_finetuned\\tokenizer_config.json',
 '../models/Bert_finetuned\\special_tokens_map.json',
 '../models/Bert_finetuned\\vocab.txt',
 '../models/Bert_finetuned\\added_tokens.json')

# From here, we will be testing the model we created

## Testing

In [1]:
#1. Load your saved model and tokenizer
from transformers import BertTokenizer, BertForSequenceClassification

model_path = "../models/Bert_finetuned"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

In [2]:
#2. Load and preprocess your test data
#Make sure it matches how the training data was preprocessed (tokenized body field):
import pandas as pd

df = pd.read_csv("../data/processed/CEAS_08_feature_engineered.csv")
test_texts = df["body"].fillna("").tolist()
test_labels = df["label"].tolist()


In [3]:
#3. Tokenize for BERT
import torch

inputs = tokenizer(test_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")


In [8]:
# 4. Run inference
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Make sure model is on the right device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Wrap inputs into a Dataset
class BERTDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(next(iter(self.encodings.values())))

test_dataset = BERTDataset(inputs)

# Create DataLoader for batching
test_loader = DataLoader(test_dataset, batch_size=32)

all_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())  # ✅ Use extend to get flat list


  0%|          | 0/1224 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [9]:
print("Predictions:", len(all_preds))
print("Labels:     ", len(test_labels))


Predictions: 39139
Labels:      39139


In [10]:
#5. Evaluate results
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:\n", classification_report(test_labels, all_preds))
print("Confusion Matrix:\n", confusion_matrix(test_labels, all_preds))



Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     17312
           1       0.99      0.99      0.99     21827

    accuracy                           0.99     39139
   macro avg       0.99      0.99      0.99     39139
weighted avg       0.99      0.99      0.99     39139

Confusion Matrix:
 [[17164   148]
 [  154 21673]]


## ✅ Model Training Summary & Results

The BERT model was successfully fine-tuned for phishing email detection using the CEAS 2008 dataset. Key details:

- **Base model**: `bert-base-uncased`
- **Training time**: ~45 minutes on RTX 2060
- **Training epochs**: 1
- **Training batch size**: 8
- **Validation strategy**: Evaluated after each epoch using F1-score

### 🔍 Best Checkpoint:  
- Saved in `../models/Bert_finetuned`  
- Automatically selected based on highest F1-score during validation

This notebook produces a fine-tuned model that is ready for evaluation or deployment. The testing also goes on within this notebook
