In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle
from imblearn.over_sampling import SMOTE

In [2]:
# Read the labelled email corpus (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='Phishing_Email.csv')

In [3]:
# Class balance of the raw file, counted before any rows are dropped
email_type_counts = data['Email Type'].value_counts()
print(email_type_counts)

Safe Email        11322
Phishing Email     7328
Name: Email Type, dtype: int64


In [4]:
# Check for and handle NaN values
# A row missing either its text or its label cannot be used for training.
# Rebind to the frame returned by dropna() instead of mutating with
# inplace=True: the result is the same, but the cell stays chainable and
# follows current pandas guidance.
data = data.dropna(subset=['Email Text', 'Email Type'])

In [5]:
# Split the frame into the model input (raw email text) and the target column.
# At this point the target still holds the string labels
# 'Phishing Email' / 'Safe Email'.
X, y = data['Email Text'], data['Email Type']

In [6]:
# Encode the string labels as integers: 1 = phishing, 0 = safe.
# Map from the source column rather than from `y` itself so that re-running
# this cell is harmless (mapping an already-encoded `y` would silently turn
# every label into NaN).
label_to_id = {'Phishing Email': 1, 'Safe Email': 0}
y = data['Email Type'].map(label_to_id)

# Series.map leaves NaN for any value that is not a key of the dict; stop here
# with a clear message instead of failing later inside resampling/fitting.
unmapped_labels = data.loc[y.isna(), 'Email Type'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected 'Email Type' labels: {list(unmapped_labels)}")

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Convert raw email text into TF-IDF features.
# The vectorizer is fitted on the training split only and then applied,
# unchanged, to the test split.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [9]:
# Balance the classes by oversampling with SMOTE.
# Only the training features are resampled; the test split is left as-is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_vec,
    y_train,
)

In [10]:
# Fit a default-configured logistic regression on the SMOTE-balanced training set.
# The fit call stays as the cell's last expression so the estimator repr is shown.
classifier = LogisticRegression()
classifier.fit(
    X_train_resampled,
    y_train_resampled,
)

LogisticRegression()

In [11]:
# Score the held-out split and print per-class precision / recall / F1
y_pred = classifier.predict(X_test_vec)
print(
    classification_report(y_test, y_pred)
)

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2209
           1       0.93      0.99      0.96      1518

    accuracy                           0.96      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.96      0.96      3727



In [12]:
from sklearn.linear_model import LogisticRegression

# Alternative to oversampling: train on the original (un-resampled) TF-IDF
# features and let class_weight='balanced' handle the class imbalance.
# Note: this rebinds `classifier`, replacing the SMOTE-trained model.
classifier = LogisticRegression(
    class_weight='balanced',
    random_state=42,
).fit(X_train_vec, y_train)

# Score the same held-out split as before
y_pred = classifier.predict(X_test_vec)
print(
    classification_report(y_test, y_pred)
)


              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2209
           1       0.93      0.99      0.96      1518

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.97      0.97      3727



In [37]:
import numpy as np

# Function to predict if an email is spam or not with adjusted threshold
def is_spam(email_text, threshold=0.5, *, model=None, text_vectorizer=None):
    """Decide whether one email is classified as positive (label 1, phishing).

    Parameters
    ----------
    email_text : str
        Raw text of a single email.
    threshold : float, default 0.5
        Minimum class-1 probability at which the email is flagged.
    model : object, optional
        Fitted estimator exposing ``predict_proba``. Defaults to the
        notebook-level ``classifier``.
    text_vectorizer : object, optional
        Fitted transformer exposing ``transform``. Defaults to the
        notebook-level ``vectorizer``.

    Returns
    -------
    bool
        ``True`` when the class-1 probability is >= ``threshold``.
    """
    # Fall back to the notebook globals only when nothing was passed in, so the
    # helper can also be used with artifacts reloaded from disk.
    active_vectorizer = vectorizer if text_vectorizer is None else text_vectorizer
    active_model = classifier if model is None else model

    # transform() expects an iterable of documents, hence the one-element list.
    email_vec = active_vectorizer.transform([email_text])
    # Row 0 is the single email; column 1 is the probability of class 1.
    proba = active_model.predict_proba(email_vec)[0, 1]
    # The comparison yields numpy.bool_; cast to a plain bool so identity
    # checks (`is True`) and JSON serialisation behave as callers expect.
    return bool(proba >= threshold)

# Re-score the held-out split with an explicit probability cut-off
threshold = 0.5  # decision cut-off on the class-1 probability; tune as needed

# Column 1 of predict_proba is the probability of class 1 for every test row
y_pred_proba = classifier.predict_proba(X_test_vec)[:, 1]
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

print(
    classification_report(y_test, y_pred_adjusted)
)


              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2209
           1       0.93      0.99      0.96      1518

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.97      0.97      3727



In [38]:
# Persist the fitted model and its vectorizer with pickle.
# Both files are needed at inference time: the vectorizer turns new text into
# the feature space the classifier was trained on.
for artifact, filename in (
    (classifier, 'email_classifier.pkl'),
    (vectorizer, 'vectorizer.pkl'),
):
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)