In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack
import joblib

# Load dataset
df_1 = pd.read_csv(r"C:\Users\may\Desktop\dataset\CEAS_08.csv")
df_2 = pd.read_csv(r"C:\Users\may\Desktop\dataset\data.csv")
df_3 = pd.read_csv(r"C:\Users\may\Desktop\dataset\balanced.csv")

# Extract relevant features and target (excluding sender)
df_1 = df_1[['subject', 'body', 'label']]
df_2 = df_2.rename(columns={'Email Text': 'body', 'Email Type': 'label'})[['body', 'label']]
df_3 = df_3.rename(columns={'class': 'label'})[['body', 'label']]

# Convert Email Type to boolean
df_2['label'] = df_2['label'].map({'Safe Email': 0, 'Phishing Email': 1})

# Remove empty values
df_1 = df_1.dropna(subset=['body', 'subject', 'label'])
df_2 = df_2.dropna(subset=['body'])
df_3 = df_3.dropna(subset=['body', 'label'])

# Add missing subject column to df_2 and df_3 (empty strings)
df_2['subject'] = ''
df_3['subject'] = ''

# Combine datasets
df = pd.concat([df_1, df_2, df_3], ignore_index=True)
df = df[df['label'].isin([0, 1])]

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Text Cleaning Function
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'\W', ' ', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply cleaning
df['cleaned_subject'] = df['subject'].apply(clean_text)
df['cleaned_body'] = df['body'].apply(clean_text)

# TF-IDF Vectorization
tfidf_subject = TfidfVectorizer()
tfidf_body = TfidfVectorizer()
X_subject = tfidf_subject.fit_transform(df['cleaned_subject'])
X_body = tfidf_body.fit_transform(df['cleaned_body'])

# Combine features
X_combined = hstack((X_subject, X_body))
y = df['label']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Define base models
nb_model = MultinomialNB(alpha=0.02)
rf_model = RandomForestClassifier(n_estimators=670, random_state=42, n_jobs=-1)
xgb_model = XGBClassifier(n_estimators=980, random_state=42, use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
logreg_model = LogisticRegression(C=9, random_state=42, n_jobs=-1, max_iter=1000)
knn_model = KNeighborsClassifier(n_neighbors=6, n_jobs=-1)

# Create Voting Classifier
ensemble_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('xgb', xgb_model), ('knn', knn_model), ('logreg', logreg_model)],
    voting='soft'
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Bundle model and vectorizers into a single dictionary
pipeline = {
    'ensemble_model': ensemble_model,
    'tfidf_subject': tfidf_subject,
    'tfidf_body': tfidf_body
}

# Save the bundled pipeline
joblib.dump(pipeline, r"C:\Users\may\Desktop\phishing_pipeline.joblib")



[nltk_data] Downloading package stopwords to C:\Users\may/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


['C:\\Users\\may\\Desktop\\phishing_pipeline.joblib']