# 🛡️ Adversarial Retraining with Augmented Data

This notebook fine-tunes the robust DistilBERT phishing classifier using a combined dataset of original and adversarial examples. The goal is to improve resistance to evasion attacks while preserving accuracy.

In [None]:
# ✅ Install necessary libraries
# `datasets` and `transformers` supply the Dataset, tokenizer, model and Trainer
# classes imported in the cells below.
!pip install datasets transformers -q
# The adversarial CSV loaded below is described as TextAttack output;
# presumably this install is only needed to regenerate it — TODO confirm.
!pip install textattack -q
# Needed because the TrainingArguments below set report_to='wandb'.
!pip install wandb -q

In [None]:
# 📦 Imports and setup
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast

# Load original + adversarial examples.
# Both CSVs must provide an 'Email Text' column and a 'label' column: those are
# the only two columns kept once the frames are concatenated.
original_df = pd.read_csv('original_dataset.csv')
adv_df = pd.read_csv('adversarial_attack_outputs.csv')  # generated examples from TextAttack

# Combine datasets
combined_df = pd.concat([original_df, adv_df], ignore_index=True)

# Drop incomplete rows *before* casting: astype(str) would turn a missing email
# into the literal text 'nan' (which would then be trained on), and astype(int)
# raises on a missing label. The index is reset so it is contiguous again.
combined_df = combined_df.dropna(subset=['Email Text', 'label']).reset_index(drop=True)
combined_df['Email Text'] = combined_df['Email Text'].astype(str)
combined_df['label'] = combined_df['label'].astype(int)

# preserve_index=False: never carry the pandas index into the dataset as a column.
hf_dataset = Dataset.from_pandas(combined_df[['Email Text', 'label']], preserve_index=False)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# 🧪 Tokenize and split
def tokenize_batch(batch):
    """Tokenize a batch of emails with the module-level ``tokenizer``.

    Args:
        batch: Mapping with an 'Email Text' entry holding the raw email
            strings, as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Padding is intentionally not applied here. With padding=True every example
    # in a map batch is padded to that batch's longest email and the padding is
    # stored in the dataset; the DataCollatorWithPadding given to the Trainer
    # already pads each training/eval batch to its own longest sequence.
    return tokenizer(batch['Email Text'], truncation=True)

# Tokenize the corpus in batches, then drop the raw text column so only the
# tokenizer outputs and 'label' remain, returned as PyTorch tensors.
tokenized_dataset = hf_dataset.map(tokenize_batch, batched=True).remove_columns(['Email Text'])
tokenized_dataset.set_format('torch')

# First cut: 70% train / 30% held out. Second cut: the held-out part is halved
# into validation and test. Both cuts use seed 42 so the partition is repeatable.
split = tokenized_dataset.train_test_split(test_size=0.3, seed=42)
val_test_split = split['test'].train_test_split(test_size=0.5, seed=42)

train_dataset, val_dataset, test_dataset = (
    split['train'],
    val_test_split['train'],
    val_test_split['test'],
)

In [None]:
# ⚖️ Weighted loss Trainer
from transformers import DistilBertForSequenceClassification, TrainingArguments, DataCollatorWithPadding, Trainer
import torch
from torch.nn import CrossEntropyLoss

# Device for the loss weights: the GPU when CUDA is available, else the CPU.
_weights_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Per-class loss weights, indexed by label id (label 0 -> 1.5, label 1 -> 1.0).
class_weights = torch.tensor([1.5, 1.0]).to(_weights_device)

# Pads each batch to its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the notebook text says the *robust* classifier is fine-tuned,
# but this loads the base 'distilbert-base-uncased' checkpoint with a fresh
# 2-class head — confirm whether a previously trained checkpoint was intended.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights are read from the module-level ``class_weights``
    tensor each time the loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping; must contain a 'labels' entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            **kwargs: Extra keyword arguments from the caller; accepted and
                ignored.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.get('labels')
        # Forward without the labels so the model does not also compute its own
        # unweighted loss, which would only be thrown away. A filtered copy is
        # used so the caller's ``inputs`` is left untouched.
        features = {key: value for key, value in inputs.items() if key != 'labels'}
        outputs = model(**features)
        logits = outputs.get('logits')
        # ``class_weights`` was placed on a device chosen from CUDA availability
        # alone; align it with the logits so the loss never mixes devices.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# 🧠 TrainingArguments + Training
import inspect

import torch

# The per-epoch evaluation argument is named `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The install
# cell does not pin a version, so use whichever name the installed
# TrainingArguments actually accepts.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results_adv_retrain',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Checkpoint on the same schedule as evaluation, as required for
    # load_best_model_at_end.
    save_strategy='epoch',
    logging_steps=200,
    load_best_model_at_end=True,
    # Mixed precision needs a CUDA device; enable it only when one is present so
    # the CPU fallback used for the class weights also works here.
    fp16=torch.cuda.is_available(),
    disable_tqdm=True,
    report_to='wandb',
    **{_eval_strategy_key: 'epoch'},
)

# Train on the train split and evaluate each epoch on the validation split;
# the test split is kept aside for the final report.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# 📊 Final Evaluation
from sklearn.metrics import classification_report
# Run the trained model over the held-out test split.
preds_output = trainer.predict(test_dataset)
# The predicted class is the index of the largest logit in each row.
y_pred = preds_output.predictions.argmax(-1)
# Take the gold labels from the prediction output itself: they are collected in
# the same pass as the predictions, so they are row-aligned with y_pred, and
# this avoids re-reading the torch-formatted 'label' column element by element.
y_true = preds_output.label_ids
print(classification_report(y_true, y_pred))