In [1]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import torch

In [2]:
df = pd.read_csv("data/phishing_email.csv")

In [3]:
df.head()

Unnamed: 0,text_combined,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82486 entries, 0 to 82485
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_combined  82486 non-null  object
 1   label          82486 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [5]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\slava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\slava\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
from string import punctuation
def preprocess(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [re.sub(r'[^a-zA-Z]', '', word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words and word not in punctuation and word]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [8]:
import copy
processed_df = copy.deepcopy(df)

In [9]:
processed_df["text_combined"] = processed_df["text_combined"].fillna('').apply(preprocess)

In [10]:
df['label'].value_counts()

label
1    42891
0    39595
Name: count, dtype: int64

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
processed_df.head()

Unnamed: 0,text_combined,label
0,hpl nom may see attached file hplno xl hplno xl,0
1,nom actual vols th forwarded sabrae zajac hou ...,0
2,enron actuals march april estimated actuals ma...,0
3,hpl nom may see attached file hplno xl hplno xl,0
4,hpl nom june see attached file hplno xl hplno xl,0


In [13]:
X = processed_df['text_combined']
y = processed_df['label']

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
column_transformer = ColumnTransformer(
    transformers=[
        ('text_combined', TfidfVectorizer(stop_words='english', max_features=5000), 'text_combined'),  # TF-IDF for text
    ],
    remainder='passthrough'
)


In [15]:
model = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
torch.cuda.is_available()


True

In [18]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")
torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Running on the GPU
Using device = cuda:0


In [43]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', verbose=10)
print("Cross-Validation scores:", cv_scores)
print("Average Cross-Validation:", np.mean(cv_scores))

param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'liblinear']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=4, verbose=10)
grid_search.fit(X_train, y_train)
print("Best Parameters from GridSearchCV:", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

accuracy = np.mean(y_pred == y_test)
print("Test Accuracy:", accuracy)

[CV] START .....................................................................
[CV] END ................................ score: (test=0.980) total time=   9.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    9.7s


[CV] END ................................ score: (test=0.978) total time=   8.7s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.982) total time=   9.5s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.979) total time=   9.3s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:   37.4s


[CV] END ................................ score: (test=0.981) total time=   8.8s
Cross-Validation scores: [0.97999654 0.97843782 0.98190163 0.97921718 0.98077589]
Average Cross-Validation: 0.9800658122618635
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters from GridSearchCV: {'classifier__C': 10, 'classifier__solver': 'liblinear'}
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98     11889
           1       0.98      0.99      0.98     12857

    accuracy                           0.98     24746
   macro avg       0.98      0.98      0.98     24746
weighted avg       0.98      0.98      0.98     24746

Confusion Matrix:
 [[11640   249]
 [  167 12690]]
Test Accuracy: 0.9831892022953205


In [45]:
import joblib
joblib.dump(best_model, 'filename.pkl', compress=1)

['filename.pkl']

In [19]:
class RNNNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNNet, self).__init__()

        self.rnn = nn.RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, line_tensor):
        rnn_out, hidden = self.rnn(line_tensor)
        output = self.h2o(hidden[0])
        output = self.softmax(output)

        return output

In [20]:
class TrainingConfig:
    n_hidden = 128
    train_batch_size = 64
    eval_batch_size = 64  # how many images to sample during evaluation
    num_epochs = 10
    learning_rate = 1e-4
    lr_warmup_steps = 500

    seed = 0

In [21]:
import string
import unicodedata

allowed_characters = string.ascii_letters + " .,;'"
n_letters = len(allowed_characters)

In [22]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return allowed_characters.find(letter)

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [25]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)

# Vectorize test texts.
X_test = vectorizer.transform(X_test)

In [26]:
from io import open
import glob
import os
import time

from torch.utils.data import Dataset

class PhishDataset(Dataset):

    def __init__(self, data_dir):
        self.data_dir = data_dir #for provenance of the dataset
        self.load_time = time.localtime #for provenance of the dataset
        labels_set = set() #set of all classes

        self.data = []
        self.data_tensors = []
        self.labels = []
        self.labels_tensors = []

        #read all the ``.txt`` files in the specified directory
        text_files = glob.glob(os.path.join(data_dir, '*.csv'))
        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label)
            lines = open(filename, encoding='utf-8').read().strip().split('\n')
            for line in lines:
                self.data.append(line)
                self.data_tensors.append(lineToTensor(line))
                self.labels.append(label)

        #Cache the tensor representation of the labels
        self.labels_uniq = list(labels_set)
        for idx in range(len(self.labels)):
            temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long)
            self.labels_tensors.append(temp_tensor)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data_item = self.data[idx]
        data_label = self.labels[idx]
        data_tensor = self.data_tensors[idx]
        label_tensor = self.labels_tensors[idx]

        return label_tensor, data_tensor, data_label, data_item

In [29]:
data = PhishDataset("data/")
print(f"loaded {len(data)} items of data")
print(f"example = {data[0]}")

ValueError: Iterable over raw text documents expected, string object received.

In [27]:
processed_df

Unnamed: 0,text_combined,label
0,hpl nom may see attached file hplno xl hplno xl,0
1,nom actual vols th forwarded sabrae zajac hou ...,0
2,enron actuals march april estimated actuals ma...,0
3,hpl nom may see attached file hplno xl hplno xl,0
4,hpl nom june see attached file hplno xl hplno xl,0
...,...,...
82481,info advantageapartmentscom infoadvantageapart...,1
82482,monkeyorg helpdeskmonkeyorg monkeyorg hi josep...,1
82483,help center infohelpcentercozainfohelpcenterco...,1
82484,metamask infosofamekarcom verify metamask wall...,1


In [83]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(57740,) (24746,) (57740,) (24746,)


In [75]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(57740, 401158) (24746, 401158) (57740,) (24746,)


In [79]:
import scipy
X_train = torch.tensor(X_train.todense()).float()
X_test = torch.tensor(X_test.todense()).float()
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

MemoryError: Unable to allocate 173. GiB for an array with shape (57740, 401158) and data type float64

In [66]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

torch.Size([]) torch.Size([]) torch.Size([57740]) torch.Size([24746])


In [51]:
rnn = RNNNet(n_letters, TrainingConfig.n_hidden, len(processed_df.label.unique()))
rnn

RNNNet(
  (rnn): RNN(57, 128)
  (h2o): Linear(in_features=128, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=1)
)

In [41]:
import time

In [52]:
import random
import numpy as np
train_losses = []
test_losses = []
test_accuracies = []
def train(rnn, X_train, y_train, n_batch_size=TrainingConfig.train_batch_size, n_epoch=TrainingConfig.num_epochs, report_every=50, learning_rate=TrainingConfig.learning_rate, criterion=nn.CrossEntropyLoss()):
    current_loss = 0
    losses = []
    rnn.train()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

    start = time.time()
    print(f"training on data set with n = {len(processed_df['label'])}")

    for iter in range(1, n_epoch + 1):
        rnn.zero_grad()

        output = rnn.forward(X_train)
        loss = criterion(output, y_train)
        train_loss = loss.item()
        train_losses.append(train_loss)
        # optimize parameters
        loss.backward()
        nn.utils.clip_grad_norm_(rnn.parameters(), 3)
        optimizer.step()
        optimizer.zero_grad()

    # Turn off gradients for validation, saves memory and computations
        with torch.no_grad():
            rnn.eval()
            log_ps = rnn(X_test)
            test_loss = criterion(log_ps, y_test)
            test_losses.append(test_loss)

            ps = torch.exp(log_ps)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == y_test.view(*top_class.shape)
            test_accuracy = torch.mean(equals.float())
            test_accuracies.append(test_accuracy)

        if iter % report_every == 0:
            print(f"{iter} ({iter / n_epoch:.0%}): \t average batch loss = {train_losses[-1]}")
        current_loss = 0

    return all_losses



In [53]:
start = time.time()
all_losses = train(rnn, X_train, y_train, learning_rate=0.15, report_every=5)
end = time.time()
print(f"training took {end-start}s")

training on data set with n = 82486


ValueError: RNN: Expected input to be 2D or 3D, got 0D tensor instead