In [10]:
import pandas as pd
import torch
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load dataset
df_1 = pd.read_csv(r"C:\Users\may\Desktop\dataset\CEAS_08.csv")
#df_2 = pd.read_csv(r"C:\Users\may\Desktop\dataset/data.csv")
#df_3 = pd.read_csv(r"C:\Users\may\Desktop\dataset/balanced.csv")

# Extract relevant features and rename columns for consistency
df_1 = df_1[['sender', 'subject', 'body', 'label']]
#df_2 = df_2.rename(columns={'Email Text': 'body', 'Email Type': 'label'})
#df_3 = df_3.rename(columns={'class': 'label'})

# Convert 'label' column in df_2 from text to numeric (0 = Safe, 1 = Phishing)
#df_2['label'] = df_2['label'].map({'Safe Email': 0, 'Phishing Email': 1})

# Fill missing columns for consistency
#df_2['sender'] = ''
#df_2['subject'] = ''
#df_3['sender'] = ''
#df_3['subject'] = ''

# Combine datasets
#df = pd.concat([df_1, df_2, df_3], ignore_index=True)

# Drop rows with missing essential values
#df = df.dropna(subset=['body', 'label'])

# Ensure label is binary (0 or 1)
df = df_1[df_1['label'].isin([0, 1])]

[nltk_data] Downloading package stopwords to C:\Users\may/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:

# Function to clean text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = ' '.join(word for word in text.split() if word.lower() not in stop_words)  # Remove stopwords
    return text.strip()

# Apply text cleaning
df['cleaned_subject'] = df['subject'].apply(clean_text)
df['cleaned_body'] = df['body'].apply(clean_text)

# Combine features into a single text input for BERT
df['text'] = df['cleaned_subject'] + " " + df['cleaned_body']

# Split data
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create Dataset class
class EmailDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        encoding = self.tokenizer(
            self.texts[item],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[item], dtype=torch.long)
        }

# Convert dataset into PyTorch format
train_dataset = EmailDataset(X_train, y_train, tokenizer)
test_dataset = EmailDataset(X_test, y_test, tokenizer)

# Load pre-trained BERT model for classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train model
trainer.train()

# Evaluate model
preds_output = trainer.predict(test_dataset)
y_pred = np.argmax(preds_output.predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

KeyError: 'subject'