In [2]:
import pandas as pd 
import numpy as np 

### Loading data

In [4]:
df = pd.read_csv("Phishing_Email.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [5]:
df.columns

Index(['Unnamed: 0', 'Email Text', 'Email Type'], dtype='object')

In [7]:
df = df.drop(columns=['Unnamed: 0'])

In [8]:
df.head()

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email


### Preprocessing the text

In [None]:
#checking the 'Email Text' type to understand how we should process this field
df['Email Text'].dtype

In [14]:
df.shape

(18650, 2)

In [19]:
# Drop rows with missing email text (if any)
df.dropna(subset=['Email Text'], inplace=True)

In [21]:
df.shape

(18634, 2)

In [22]:
# Convert the email text column to a list of strings
email_text_list = df['Email Text'].tolist()

In [23]:
import re

# Preprocessing
def preprocess_text(text):
    # Remove newlines and excessive spaces
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Add more preprocessing steps as needed (e.g., removing special characters)

    return text

# Preprocess the email text list
email_text_list = [preprocess_text(text) for text in email_text_list]

In [24]:
# Filter out any empty strings from the list
email_text_list = list(filter(None, email_text_list))

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Feature Extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(email_text_list)

In [26]:
from sklearn.preprocessing import LabelEncoder
# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df.loc[df['Email Text'].notnull(), 'Email Type'])


### Machine Learning Models

In [27]:
from sklearn.model_selection import train_test_split
# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.9694123960289778
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96      1518
           1       0.99      0.96      0.97      2209

    accuracy                           0.97      3727
   macro avg       0.97      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

#### Random Forest Classifier

In [30]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
report_rf = classification_report(y_test, y_pred_rf)
print("Random Forest Classification Report:\n", report_rf)

Random Forest Accuracy: 0.9608264019318487
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      1518
           1       0.97      0.97      0.97      2209

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



#### Support Vector Classifier

In [31]:
# Support Vector Machine (SVM)
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)
report_svm = classification_report(y_test, y_pred_svm)
print("SVM Classification Report:\n", report_svm)

SVM Accuracy: 0.598068151328146
SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.01      0.03      1518
           1       0.60      1.00      0.75      2209

    accuracy                           0.60      3727
   macro avg       0.80      0.51      0.39      3727
weighted avg       0.76      0.60      0.45      3727



#### Gradient Boosting Classifier

In [32]:
# Gradient Boosting
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", accuracy_gb)
report_gb = classification_report(y_test, y_pred_gb)
print("Gradient Boosting Classification Report:\n", report_gb)

Gradient Boosting Accuracy: 0.9275556748054735
Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.91      1518
           1       0.95      0.93      0.94      2209

    accuracy                           0.93      3727
   macro avg       0.92      0.93      0.93      3727
weighted avg       0.93      0.93      0.93      3727

