In [216]:
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [182]:
f = './Phishing_Email.csv'
df = pd.read_csv(f, index_col=0)

In [183]:
df

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...
18646,date a lonely housewife always wanted to date ...,Phishing Email
18647,request submitted : access request for anita ....,Safe Email
18648,"re : important - prc mtg hi dorn & john , as y...",Safe Email
18649,press clippings - letter on californian utilit...,Safe Email


## Data Cleaning

In [184]:
# Get the number of null values in the dataset
df.isnull().sum()

Email Text    16
Email Type     0
dtype: int64

In [185]:
# Drop the na's
clean_df = df.copy()
clean_df.dropna(inplace=True)

In [186]:
# Look for records that contain the text 'empty'
# These records will also be considered missing and dropped as well

len(clean_df.loc[clean_df['Email Text'] == 'empty'])

533

In [187]:
# Select records that do not have the email text 'empty'
clean_df = clean_df[clean_df['Email Text'] != 'empty']

In [188]:
# View the shape after removing nulls
clean_df.shape

(18101, 2)

In [189]:
# Download stopwords from nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Bag of Words

In [190]:
# Binary Encode the Email Type
clean_df.loc[:, 'Email Type'] =  clean_df['Email Type'].map({'Phishing Email': 1, 'Safe Email': 0})

In [191]:
# Convert to int64
clean_df['Email Type'] = pd.to_numeric(clean_df['Email Type'])
clean_df['Email Type'].dtype

dtype('int64')

In [192]:
# Setup the function to clean the text

def bow_preprocess_text(text):
    text = re.sub(r'\W', ' ', text) # Remove special characters
    text = re.sub(r'\d+', ' ', text) # Remove numbers
    text = re.sub(r'\s_', ' ', text).strip() # Remove extra spaces
    text = text.lower()
    words = word_tokenize(text) # Tokenization
    words = [word for word in words if word not in stopwords.words('english')] # Remove stop words
    return ' '.join(words)


In [193]:
# Preprocess text
clean_df['Cleaned Text'] = clean_df['Email Text'].apply(bow_preprocess_text)

In [194]:
clean_df

Unnamed: 0,Email Text,Email Type,Cleaned Text
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0,disc uniformitarianism sex lang dick hudson ob...
1,the other side of * galicismos * * galicismo *...,0,side galicismos galicismo spanish term names i...
2,re : equistar deal tickets are you still avail...,0,equistar deal tickets still available assist r...
3,\nHello I am your hot lil horny toy.\n I am...,1,hello hot lil horny toy one dream open minded ...
4,software at incredibly low prices ( 86 % lower...,1,software incredibly low prices lower drapery s...
...,...,...,...
18645,\nRick Moen a Ã©crit:> > I'm confused. I thou...,0,rick moen ã crit confused thought gpl ed money...
18646,date a lonely housewife always wanted to date ...,1,date lonely housewife always wanted date lonel...
18647,request submitted : access request for anita ....,0,request submitted access request anita dupont ...
18648,"re : important - prc mtg hi dorn & john , as y...",0,important prc mtg hi dorn john discovered rece...


In [195]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(clean_df['Cleaned Text'], clean_df['Email Type'], test_size=0.2, random_state=42, stratify=clean_df['Email Type'])

In [196]:
# Convert to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


## Logistic Regression Model

In [197]:
# Logistic Regression Model
# Create the LR model
lr_model = LogisticRegression()
# Train the LR Model
lr_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test_tfidf)

# Evalute model performance
accuracy = accuracy_score(y_test, y_pred_lr)
print(f'Accuracy: {accuracy:.4f}')
print(f'Classification Report: \n{classification_report(y_test, y_pred_lr)}')

Accuracy: 0.9762
Classification Report: 
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2225
           1       0.97      0.96      0.97      1396

    accuracy                           0.98      3621
   macro avg       0.98      0.97      0.97      3621
weighted avg       0.98      0.98      0.98      3621



In [198]:
feature_names = vectorizer.get_feature_names_out()
top_coefs = lr_model.coef_[0].argsort()[::-1][:20]
print([feature_names[i] for i in top_coefs])

['remove', 'sightings', 'click', 'money', 'free', 'email', 'removed', 'save', 'site', 'reply', 'software', 'offer', 'hello', 'life', 'rolex', 'best', 'viagra', 'quality', 'mobile', 'meds']


## BERT Model

In [199]:
# Load the BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [208]:
def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')


In [212]:
train_encodings = tokenize_function(X_train)
test_encodings = tokenize_function(X_test)

train_labels = torch.tensor(y_train.tolist())
test_labels = torch.tensor(y_test.tolist())

In [213]:
# Create the Bert Model
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [215]:
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [219]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
bert_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [221]:
optimizer = AdamW(bert_model.parameters(), lr=5e-5)

epochs = 3
for epoch in range(epochs):
    bert_model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {total_loss:.4f}')
    

Epoch 1, Loss: 218.3467
Epoch 2, Loss: 110.8521
Epoch 3, Loss: 37.7403


In [223]:
bert_model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = bert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

# Print metrics
print(f'Accuracy: {accuracy_score(true_labels, predictions):.4f}')
print(classification_report(true_labels, predictions))

Accuracy: 0.9859
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2225
           1       0.98      0.99      0.98      1396

    accuracy                           0.99      3621
   macro avg       0.98      0.99      0.99      3621
weighted avg       0.99      0.99      0.99      3621

