In [119]:
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [120]:
f = './Phishing_Email.csv'
df = pd.read_csv(f, index_col=0)

In [121]:
df

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...
18646,date a lonely housewife always wanted to date ...,Phishing Email
18647,request submitted : access request for anita ....,Safe Email
18648,"re : important - prc mtg hi dorn & john , as y...",Safe Email
18649,press clippings - letter on californian utilit...,Safe Email


## Data Cleaning

In [147]:
# Get the number of null values in the dataset
df.isnull().sum()

Email Text    16
Email Type     0
dtype: int64

In [148]:
# Drop the na's
clean_df = df.copy()
clean_df.dropna(inplace=True)

In [149]:
# Look for records that contain the text 'empty'
# These records will also be considered missing and dropped as well

len(clean_df.loc[clean_df['Email Text'] == 'empty'])

533

In [150]:
# Select records that do not have the email text 'empty'
clean_df = clean_df[clean_df['Email Text'] != 'empty']

In [151]:
# View the shape after removing nulls
clean_df.shape

(18101, 2)

In [152]:
# Download stopwords from nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Bag of Words

In [153]:
# Binary Encode the Email Type
clean_df.loc[:, 'Email Type'] =  clean_df['Email Type'].map({'Phishing Email': 1, 'Safe Email': 0})

In [166]:
# Convert to int64
clean_df['Email Type'] = pd.to_numeric(clean_df['Email Type'])
clean_df['Email Type'].dtype

dtype('int64')

In [167]:
# Setup the function to clean the text

def bow_preprocess_text(text):
    text = re.sub(r'\W', ' ', text) # Remove special characters
    text = re.sub(r'\d+', ' ', text) # Remove numbers
    text = re.sub(r'\s_', ' ', text).strip() # Remove extra spaces
    text = text.lower()
    words = word_tokenize(text) # Tokenization
    words = [word for word in words if word not in stopwords.words('english')] # Remove stop words
    return ' '.join(words)


In [168]:
# Preprocess text
clean_df['Cleaned Text'] = clean_df['Email Text'].apply(bow_preprocess_text)

In [169]:
clean_df

Unnamed: 0,Email Text,Email Type,Cleaned Text
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0,disc uniformitarianism sex lang dick hudson ob...
1,the other side of * galicismos * * galicismo *...,0,side galicismos galicismo spanish term names i...
2,re : equistar deal tickets are you still avail...,0,equistar deal tickets still available assist r...
3,\nHello I am your hot lil horny toy.\n I am...,1,hello hot lil horny toy one dream open minded ...
4,software at incredibly low prices ( 86 % lower...,1,software incredibly low prices lower drapery s...
...,...,...,...
18645,\nRick Moen a Ã©crit:> > I'm confused. I thou...,0,rick moen ã crit confused thought gpl ed money...
18646,date a lonely housewife always wanted to date ...,1,date lonely housewife always wanted date lonel...
18647,request submitted : access request for anita ....,0,request submitted access request anita dupont ...
18648,"re : important - prc mtg hi dorn & john , as y...",0,important prc mtg hi dorn john discovered rece...


In [170]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(clean_df['Cleaned Text'], clean_df['Email Type'], test_size=0.2, random_state=42, stratify=clean_df['Email Type'])

In [171]:
# Convert to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [177]:
# Logistic Regression Model
# Create the LR model
lr_model = LogisticRegression()
# Train the LR Model
lr_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test_tfidf)

# Evalute model performance
accuracy = accuracy_score(y_test, y_pred_lr)
print(f'Accuracy: {accuracy:.4f}')
print(f'Classification Report: \n{classification_report(y_test, y_pred_lr)}')

Accuracy: 0.9762
Classification Report: 
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2225
           1       0.97      0.96      0.97      1396

    accuracy                           0.98      3621
   macro avg       0.98      0.97      0.97      3621
weighted avg       0.98      0.98      0.98      3621



In [178]:
feature_names = vectorizer.get_feature_names_out()
top_coefs = lr_model.coef_[0].argsort()[::-1][:20]
print([feature_names[i] for i in top_coefs])

['remove', 'sightings', 'click', 'money', 'free', 'email', 'removed', 'save', 'site', 'reply', 'software', 'offer', 'hello', 'life', 'rolex', 'best', 'viagra', 'quality', 'mobile', 'meds']
