In [1]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import torch

In [2]:
df = pd.read_csv("data/phishing_email.csv")

In [3]:
df.head()

Unnamed: 0,text_combined,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82486 entries, 0 to 82485
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_combined  82486 non-null  object
 1   label          82486 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [5]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\slava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\slava\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
from string import punctuation
def preprocess(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [re.sub(r'[^a-zA-Z]', '', word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words and word not in punctuation and word]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [8]:
import copy
processed_df = copy.deepcopy(df)

In [10]:
#processed_df["text_combined"] = processed_df["text_combined"].fillna('').apply(preprocess)
#processed_df.to_csv("processed_data/processed_df.csv", index=False)
processed_df = pd.read_csv("processed_data/processed_phish.csv")

In [11]:
processed_df["text_combined"]

0          hpl nom may see attached file hplno xl hplno xl
1        nom actual vols th forwarded sabrae zajac hou ...
2        enron actuals march april estimated actuals ma...
3          hpl nom may see attached file hplno xl hplno xl
4         hpl nom june see attached file hplno xl hplno xl
                               ...                        
82481    info advantageapartmentscom infoadvantageapart...
82482    monkeyorg helpdeskmonkeyorg monkeyorg hi josep...
82483    help center infohelpcentercozainfohelpcenterco...
82484    metamask infosofamekarcom verify metamask wall...
82485    fastway infofastwaycozainfofastwaycozainfofast...
Name: text_combined, Length: 82486, dtype: object

In [15]:
tokens = []
for line in processed_df["text_combined"]:
    try:
        tokens.append(str(line).strip().split())
    except Exception as e:
        print(line)

flat_tokens = [x for xs in tokens for x in xs]

In [17]:
flat_tokens

['hpl',
 'nom',
 'may',
 'see',
 'attached',
 'file',
 'hplno',
 'xl',
 'hplno',
 'xl',
 'nom',
 'actual',
 'vols',
 'th',
 'forwarded',
 'sabrae',
 'zajac',
 'hou',
 'ect',
 'pm',
 'enron',
 'capital',
 'trade',
 'resource',
 'corp',
 'eileen',
 'ponton',
 'davilal',
 'txu',
 'com',
 'cstonel',
 'txu',
 'com',
 'mjones',
 'txu',
 'com',
 'hpl',
 'scheduling',
 'enron',
 'com',
 'liz',
 'bellamy',
 'enron',
 'com',
 'szajac',
 'enron',
 'com',
 'cc',
 'subject',
 'nom',
 'actual',
 'vols',
 'th',
 'agree',
 'nomination',
 'forwarded',
 'eileen',
 'ponton',
 'houston',
 'pefs',
 'pec',
 'charlie',
 'stone',
 'eileen',
 'ponton',
 'melissa',
 'jones',
 'com',
 'hpl',
 'scheduling',
 'enron',
 'com',
 'liz',
 'bellamy',
 'enron',
 'com',
 'szajac',
 'enron',
 'com',
 'subject',
 'nom',
 'actual',
 'vols',
 'th',
 'pm',
 'agree',
 'nominated',
 'volume',
 'record',
 'reflect',
 'following',
 'nom',
 'schedule',
 'rate',
 'eff',
 'hr',
 'hour',
 'beginning',
 'hr',
 'rate',
 'eff',
 'hr',
 

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
processed_df.head()

Unnamed: 0,text_combined,label
0,hpl nom may see attached file hplno xl hplno xl,0
1,nom actual vols th forwarded sabrae zajac hou ...,0
2,enron actuals march april estimated actuals ma...,0
3,hpl nom may see attached file hplno xl hplno xl,0
4,hpl nom june see attached file hplno xl hplno xl,0


In [20]:
X = processed_df['text_combined']
y = processed_df['label']

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
column_transformer = ColumnTransformer(
    transformers=[
        ('text_combined', TfidfVectorizer(stop_words='english', max_features=5000), 'text_combined'),  # TF-IDF for text
    ],
    remainder='passthrough'
)


In [22]:
model = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
torch.cuda.is_available()


True

In [25]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")
torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Running on the GPU
Using device = cuda:0


In [26]:
# from sklearn.model_selection import cross_val_score, GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix
# cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', verbose=10)
# print("Cross-Validation scores:", cv_scores)
# print("Average Cross-Validation:", np.mean(cv_scores))

# param_grid = {
#     'classifier__C': [0.1, 1, 10],
#     'classifier__solver': ['lbfgs', 'liblinear']
# }

# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=4, verbose=10)
# grid_search.fit(X_train, y_train)
# print("Best Parameters from GridSearchCV:", grid_search.best_params_)

# best_model = grid_search.best_estimator_

# y_pred = best_model.predict(X_test)

# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# accuracy = np.mean(y_pred == y_test)
# print("Test Accuracy:", accuracy)

In [27]:
# import joblib
# joblib.dump(best_model, 'filename.pkl', compress=1)

In [28]:
class RNNNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNNet, self).__init__()

        self.rnn = nn.RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, line_tensor):
        rnn_out, hidden = self.rnn(line_tensor)
        output = self.h2o(hidden[0])
        output = self.softmax(output)

        return output

In [29]:
class TrainingConfig:
    n_hidden = 128
    train_batch_size = 64
    eval_batch_size = 64  # how many images to sample during evaluation
    num_epochs = 3
    learning_rate = 1e-4
    lr_warmup_steps = 500

    seed = 0

In [30]:
import string
import unicodedata

allowed_characters = string.ascii_letters + " .,;'"
n_letters = len(allowed_characters)
n_letters

57

In [31]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return allowed_characters.find(letter)

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [32]:
# processed_df.to_csv("processed_data/processed_phish.csv", index=False)

In [33]:
vectorizer = TfidfVectorizer()
# X_train = vectorizer.fit_transform(X_train)

# #Vectorize test texts.
# X_test = vectorizer.transform(X_test)

In [34]:
X_train.shape

(57740,)

In [35]:
X_train[:2]

64691    qydlqcwsiacfymissuesapacheorg httpsissuesapach...
14942    cheap online pill chromate obsessive consensus...
Name: text_combined, dtype: object

In [36]:
X_train = vectorizer.fit_transform([X_train[0]])
X_train = X_train.todense()
X_train = torch.tensor(X_train)

In [37]:
X_train.size(-1)

8

In [63]:
from io import open
import glob
import os
import time

from torch.utils.data import Dataset
import torch.nn.functional as F

class PhishDataset(Dataset):

    def __init__(self, data_dir):
        self.data_dir = data_dir #for provenance of the dataset
        self.load_time = time.localtime #for provenance of the dataset
        labels_set = set() #set of all classes
        self.count = 0
        self.num_workers = 4
        self.data = []
        self.target_len = 158587
        #self.vocab = vocab
        #self.vocab_size = vocab_size
        #self.data_tensors = []
        self.labels = []
        #self.labels_tensors = []

        #read all the ``.csv`` files in the specified directory
        text_files = glob.glob(os.path.join(data_dir, '*.csv'))
        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label)
            lines = open(filename, encoding='utf-8').read().replace(',', '').split('0' or '1')
            for line in lines:
                self.data.append(line)
                #self.data_tensors.append(lineToTensor(line))
                self.labels.append(label)
                self.count += 1
                print(f"{self.count} lines processed out of {len(lines)}")

        #Cache the tensor representation of the labels
        self.labels_uniq = list(labels_set)
        #for idx in range(len(self.labels)):
        #    temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long)
        #    self.labels_tensors.append(temp_tensor)

    def __len__(self):
        return len(self.data)

    def letterToIndex(letter):
        return self.vocab.getstoi().find(letter)

    def lineToTensor(line):
        tensor = torch.zeros(len(line), 1, self.vocab_size)
        for li, letter in enumerate(line):
            tensor[li][0][letterToIndex(letter)] = 1
        return tensor
    
    def __transform_data__(self, idx, type):
        if type == "data":
            #data = [self.vocab[token] for token in self.data[idx]]
            data = vectorizer.fit_transform([self.data[idx]])
            data = data.todense()
            data = torch.tensor(data).float()
            data = F.pad(data, (self.target_len - data.size(1), 0))
            #data = self.lineToTensor(self.data[idx])
            return data
        if type == "label":
            label = vectorizer.fit_transform([self.labels[idx]])
            label = label.todense()
            label = torch.tensor(label).float()
            #label = self.lineToTensor(self.labels[idx])
            return label
        return None

    def __getitem__(self, idx):
        data_item = self.__transform_data__(idx, "data")
        data_label = self.__transform_data__(idx, "label")
        #data_tensor = self.data_tensors[idx]
        #label_tensor = self.labels_tensors[idx]

        return data_item, data_label

In [64]:
# unk_token = '<unk>'
# vocab.set_default_index(vocab[unk_token])

In [65]:
data = PhishDataset("processed_data/")
print(f"loaded {len(data)} items of data")
print(f"example = {data[39595]}")

1 lines processed out of 39596
2 lines processed out of 39596
3 lines processed out of 39596
4 lines processed out of 39596
5 lines processed out of 39596
6 lines processed out of 39596
7 lines processed out of 39596
8 lines processed out of 39596
9 lines processed out of 39596
10 lines processed out of 39596
11 lines processed out of 39596
12 lines processed out of 39596
13 lines processed out of 39596
14 lines processed out of 39596
15 lines processed out of 39596
16 lines processed out of 39596
17 lines processed out of 39596
18 lines processed out of 39596
19 lines processed out of 39596
20 lines processed out of 39596
21 lines processed out of 39596
22 lines processed out of 39596
23 lines processed out of 39596
24 lines processed out of 39596
25 lines processed out of 39596
26 lines processed out of 39596
27 lines processed out of 39596
28 lines processed out of 39596
29 lines processed out of 39596
30 lines processed out of 39596
31 lines processed out of 39596
32 lines processe

In [41]:
train_set, test_set = torch.utils.data.random_split(data, [.85, .15], generator=torch.Generator(device=device).manual_seed(1))

print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}")

train examples = 33657, validation examples = 5939


In [44]:
max = 0
f = open("processed_data/processed_phish.csv", encoding='utf-8').read().strip().replace(',', '').split('0' or '1')
for line in range(len(f) - 1):
    length = len(f[line])
    if length > max:
        max = length
        idx = line
    print(f"on line {line}")
print(max, line)

on line 0
on line 1
on line 2
on line 3
on line 4
on line 5
on line 6
on line 7
on line 8
on line 9
on line 10
on line 11
on line 12
on line 13
on line 14
on line 15
on line 16
on line 17
on line 18
on line 19
on line 20
on line 21
on line 22
on line 23
on line 24
on line 25
on line 26
on line 27
on line 28
on line 29
on line 30
on line 31
on line 32
on line 33
on line 34
on line 35
on line 36
on line 37
on line 38
on line 39
on line 40
on line 41
on line 42
on line 43
on line 44
on line 45
on line 46
on line 47
on line 48
on line 49
on line 50
on line 51
on line 52
on line 53
on line 54
on line 55
on line 56
on line 57
on line 58
on line 59
on line 60
on line 61
on line 62
on line 63
on line 64
on line 65
on line 66
on line 67
on line 68
on line 69
on line 70
on line 71
on line 72
on line 73
on line 74
on line 75
on line 76
on line 77
on line 78
on line 79
on line 80
on line 81
on line 82
on line 83
on line 84
on line 85
on line 86
on line 87
on line 88
on line 89
on line 90
on line 9

In [70]:
max = 0
fl = open("processed_data/processed_phish.csv", encoding='utf-8').read().replace(',', '').split('0' or '1')
for line in range(len(fl) - 1):
    
    print(f"on line {line}")
print(max, line)

on line 0
on line 1
on line 2
on line 3
on line 4
on line 5
on line 6
on line 7
on line 8
on line 9
on line 10
on line 11
on line 12
on line 13
on line 14
on line 15
on line 16
on line 17
on line 18
on line 19
on line 20
on line 21
on line 22
on line 23
on line 24
on line 25
on line 26
on line 27
on line 28
on line 29
on line 30
on line 31
on line 32
on line 33
on line 34
on line 35
on line 36
on line 37
on line 38
on line 39
on line 40
on line 41
on line 42
on line 43
on line 44
on line 45
on line 46
on line 47
on line 48
on line 49
on line 50
on line 51
on line 52
on line 53
on line 54
on line 55
on line 56
on line 57
on line 58
on line 59
on line 60
on line 61
on line 62
on line 63
on line 64
on line 65
on line 66
on line 67
on line 68
on line 69
on line 70
on line 71
on line 72
on line 73
on line 74
on line 75
on line 76
on line 77
on line 78
on line 79
on line 80
on line 81
on line 82
on line 83
on line 84
on line 85
on line 86
on line 87
on line 88
on line 89
on line 90
on line 9

In [45]:
maxline = vectorizer.fit_transform([f[39595]])
maxline = maxline.todense()
maxline = torch.tensor(maxline).float()
maxline.size()

torch.Size([1, 158621])

In [58]:
print(f[39594])


mailmanownerlistssourceforgenet official mailing sourceforgenet receiving message previously subscribed one optin mailing list managed sourceforgenet project hosted sourceforgenet message monthly subscription reminder automatically generated mailman mailing list management software used sourceforgenet httpwwwlistorg reply email instruction provided unsubscribing list obtaining support support provided email unsubscribe use web browser access list management url list wish unsubscribe list management url list may found bottom email already know list management password click email password button list password differ listtolist different password use sourceforgenet site account list management page enter list password see step know list password unsubscribing box found upper righthand corner list management page entering password subscription clicking unsubscribe button unsubscribed list immediately unsubscribe one list must access management page using appropriate url listed bottom ema

In [47]:
len(open("processed_data/processed_phish.csv", encoding='utf-8').read().strip().replace(',', '').split('0' or '1'))

39596

In [48]:
processed_df["text_combined"][0]

'hpl nom may see attached file hplno xl hplno xl'

In [49]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

torch.Size([1, 8]) (24746,) (57740,) (24746,)


In [50]:
# import scipy
# X_train = torch.tensor(X_train).float()
# X_test = torch.tensor(X_test).float()
# y_train = torch.tensor(y_train.values)
# y_test = torch.tensor(y_test.values)

In [51]:
rnn = RNNNet(158587, TrainingConfig.n_hidden, len(processed_df.label.unique()))
rnn

RNNNet(
  (rnn): RNN(158587, 128)
  (h2o): Linear(in_features=128, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=1)
)

In [52]:
import time

In [53]:
# import random
# import numpy as np
# train_losses = []
# test_losses = []
# test_accuracies = []
# def train(rnn, X_train, y_train, n_batch_size=TrainingConfig.train_batch_size, n_epoch=TrainingConfig.num_epochs, report_every=50, learning_rate=TrainingConfig.learning_rate, criterion=nn.CrossEntropyLoss()):
#     current_loss = 0
#     losses = []
#     rnn.train()
#     optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

#     start = time.time()
#     print(f"training on data set with n = {len(processed_df['label'])}")

#     for iter in range(1, n_epoch + 1):
#         rnn.zero_grad()

#         output = rnn.forward(X_train)
#         loss = criterion(output, y_train)
#         train_loss = loss.item()
#         train_losses.append(train_loss)
#         # optimize parameters
#         loss.backward()
#         nn.utils.clip_grad_norm_(rnn.parameters(), 3)
#         optimizer.step()
#         optimizer.zero_grad()

#     # Turn off gradients for validation, saves memory and computations
#         with torch.no_grad():
#             rnn.eval()
#             log_ps = rnn(X_test)
#             test_loss = criterion(log_ps, y_test)
#             test_losses.append(test_loss)

#             ps = torch.exp(log_ps)
#             top_p, top_class = ps.topk(1, dim=1)
#             equals = top_class == y_test.view(*top_class.shape)
#             test_accuracy = torch.mean(equals.float())
#             test_accuracies.append(test_accuracy)

#         if iter % report_every == 0:
#             print(f"{iter} ({iter / n_epoch:.0%}): \t average batch loss = {train_losses[-1]}")
#         current_loss = 0

#     return all_losses



In [54]:
import random
import numpy as np

def train(rnn, training_data, n_epoch=TrainingConfig.num_epochs, n_batch_size=TrainingConfig.train_batch_size, report_every = 50, learning_rate=TrainingConfig.learning_rate, criterion = nn.CrossEntropyLoss()):
    """
    Learn on a batch of training_data for a specified number of iterations and reporting thresholds
    """
    # Keep track of losses for plotting
    current_loss = 0
    all_losses = []
    rnn.train()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

    start = time.time()
    print(f"training on data set with n = {len(training_data)}")

    for iter in range(1, n_epoch + 1):
        rnn.zero_grad() # clear the gradients

        # create some minibatches
        # we cannot use dataloaders because each of our names is a different length
        batches = list(range(len(training_data)))
        random.shuffle(batches)
        batches = np.array_split(batches, len(batches) //n_batch_size )

        for idx, batch in enumerate(batches):
            batch_loss = 0
            for i in batch: #for each example in this batch
                (label, text) = training_data[i]
                output = rnn.forward(text)
                loss = criterion(output, label)
                batch_loss += loss

            # optimize parameters
            batch_loss.backward()
            nn.utils.clip_grad_norm_(rnn.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()

            current_loss += batch_loss.item() / len(batch)

        all_losses.append(current_loss / len(batches) )
        if iter % report_every == 0:
            print(f"{iter} ({iter / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")
        current_loss = 0

    return all_losses

In [55]:
start = time.time()
all_losses = train(rnn, train_set, report_every=5)
end = time.time()
print(f"training took {end-start}s")

training on data set with n = 33657


RuntimeError: input.size(-1) must be equal to input_size. Expected 158587, got 1