# 🤗 Fine-Tune BERT on Zefang Liu's Phishing Email Dataset
Dataset: [`zefang-liu/phishing-email-dataset`](https://huggingface.co/datasets/zefang-liu/phishing-email-dataset) (hosted on the Hugging Face Hub)

**Steps:**
1. Load dataset
2. Preprocess (`Email Text`, `Email Type`)
3. Tokenize using the BERT tokenizer
4. Split into train/test sets
5. Fine-tune & evaluate

In [None]:
# Install Hugging Face libraries
!pip install transformers datasets accelerate -q

In [None]:
# Step 1: Load the dataset
# `load_dataset` is given a repository-style ID rather than a local path
# (presumably resolved against the Hugging Face Hub — confirm if run offline).
from datasets import load_dataset
dataset = load_dataset("zefang-liu/phishing-email-dataset")
# Bare expression: the notebook shows the loaded object as the cell output.
dataset

In [None]:
# Step 2: Preprocess - map text & label
def preprocess(example):
    """Map one raw dataset row to the ``text``/``label`` pair used for training.

    Args:
        example: A row with an ``"Email Text"`` value (a string, or ``None``
            for rows without a body) and an ``"Email Type"`` string.

    Returns:
        A dict with ``"text"`` (the email body, ``""`` when it is missing)
        and ``"label"`` (1 for phishing, 0 otherwise).
    """
    # Replace a missing body with "" so the tokenizer never receives None.
    text = example["Email Text"] or ""
    # The dataset card lists the classes as "Phishing Email" / "Safe Email",
    # so an exact comparison with "phishing" would label every row 0.
    # Matching on the prefix accepts both "phishing" and "Phishing Email".
    email_type = example["Email Type"].strip().lower()
    return {
        "text": text,
        "label": 1 if email_type.startswith("phishing") else 0,
    }

# Build the text/label columns from the train split, then drop the original
# columns that are no longer needed.
original_columns = ["Unnamed: 0", "Email Text", "Email Type"]
processed_dataset = dataset["train"].map(preprocess).remove_columns(original_columns)
# Bare expression: shown as the notebook cell output.
processed_dataset

In [None]:
# Step 3: Tokenize
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the ``"text"`` field, truncating and padding to max length."""
    encoded = tokenizer(example["text"], truncation=True, padding="max_length")
    return encoded


# Tokenize in batches, drop the raw text column, and expose torch tensors.
tokenized_dataset = (
    processed_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)
tokenized_dataset.set_format("torch")

In [None]:
# Step 4: Train/Test Split
# `train_test_split` is a method of the dataset object, not a name exported by
# the `datasets` package, so it needs no import (importing it raises
# ImportError). A fixed seed keeps the 80/20 split reproducible across runs.
splits = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_ds = splits["train"]
eval_ds = splits["test"]

In [None]:
# Step 5: Define model and training arguments
import inspect

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return classification accuracy for the evaluation predictions.

    Args:
        eval_pred: The (logits, labels) pair the Trainer passes to its
            ``compute_metrics`` callback.

    Returns:
        A dict with a single ``"accuracy"`` float.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# The notebook installs `transformers` unpinned, and newer releases renamed
# `evaluation_strategy` to `eval_strategy`. Pick whichever keyword the
# installed version accepts so the cell runs on both old and new releases.
_training_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_params else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},
)

# Same idea for the Trainer: newer releases take the tokenizer through
# `processing_class` and deprecate the `tokenizer` keyword.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # Without this callback, evaluation reports only the loss.
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Step 6: Train
# Runs fine-tuning with the `trainer` object built in the previous cell.
trainer.train()

In [None]:
# Step 7: Evaluate
# Evaluates using the `eval_dataset` passed to the Trainer; as the last
# expression in the cell, the returned metrics are shown as its output.
trainer.evaluate()