In [None]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import torch

In [None]:
df = pd.read_csv("data/phishing_email.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
from string import punctuation
def preprocess(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``; every
    character outside ``a-z``/``A-Z`` is stripped from each token; tokens that
    end up empty, that are in ``stop_words`` or that occur inside
    ``punctuation`` are dropped; the remaining tokens are lemmatised with
    ``lemmatizer`` and joined with single spaces.
    """
    lemmas = []
    for raw_token in word_tokenize(text.lower()):
        letters_only = re.sub(r'[^a-zA-Z]', '', raw_token)
        # Tokens made up solely of digits/symbols collapse to the empty string.
        if not letters_only:
            continue
        if letters_only in stop_words or letters_only in punctuation:
            continue
        lemmas.append(lemmatizer.lemmatize(letters_only))
    return ' '.join(lemmas)

In [None]:
import copy
# Work on an independent deep copy so the frame loaded from disk stays untouched.
processed_df = df.copy(deep=True)

In [None]:
processed_df["text_combined"] = processed_df["text_combined"].fillna('').apply(preprocess)

In [None]:
# One list of whitespace-separated tokens per preprocessed email.
tokens = [line.strip().split() for line in processed_df["text_combined"]]

# The same tokens, concatenated into a single flat list across all emails.
flat_tokens = []
for email_tokens in tokens:
    flat_tokens.extend(email_tokens)

In [None]:
# Preview only the first tokens; displaying the full list would flood the
# notebook output with every token of the corpus.
flat_tokens[:20]

In [None]:
from torchtext.vocab import build_vocab_from_iterator

# ``build_vocab_from_iterator`` expects an iterable that yields one list of
# tokens per document. Handing it the flat list of strings makes it iterate
# over the characters of every word and build a character-level vocabulary,
# so pass the per-email token lists instead.
vocab = build_vocab_from_iterator(tokens, specials=["<unk>"])

In [None]:
len(vocab)

In [None]:
df['label'].value_counts()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
processed_df.head()

In [None]:
X = processed_df['text_combined']
y = processed_df['label']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
# Vectorise the ``text_combined`` column with TF-IDF (English stop words
# removed, at most 5000 features); every other column is passed through as is.
# NOTE(review): selecting a column by name requires the data handed to this
# transformer to be a DataFrame - confirm the pipeline input is not a bare
# Series before fitting.
column_transformer = ColumnTransformer(
    [
        (
            'text_combined',
            TfidfVectorizer(stop_words='english', max_features=5000),
            'text_combined',
        ),
    ],
    remainder='passthrough',
)


In [None]:
model = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
torch.cuda.is_available()


In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")
torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

In [None]:
# from sklearn.model_selection import cross_val_score, GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix
# cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', verbose=10)
# print("Cross-Validation scores:", cv_scores)
# print("Average Cross-Validation:", np.mean(cv_scores))

# param_grid = {
#     'classifier__C': [0.1, 1, 10],
#     'classifier__solver': ['lbfgs', 'liblinear']
# }

# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=4, verbose=10)
# grid_search.fit(X_train, y_train)
# print("Best Parameters from GridSearchCV:", grid_search.best_params_)

# best_model = grid_search.best_estimator_

# y_pred = best_model.predict(X_test)

# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# accuracy = np.mean(y_pred == y_test)
# print("Test Accuracy:", accuracy)

In [None]:
# import joblib
# joblib.dump(best_model, 'filename.pkl', compress=1)

In [None]:
class RNNNet(nn.Module):
    """Recurrent classifier: ``nn.RNN`` followed by a linear head and log-softmax.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the recurrent hidden state.
        output_size: number of classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.rnn = nn.RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        # Normalise over the class axis, which is always the last one. With
        # ``dim=1`` an unbatched input (whose head output is 1-D) raised
        # IndexError; for batched input ``dim=-1`` is the same axis as before.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, line_tensor):
        """Classify one sequence (or a batch of sequences).

        Args:
            line_tensor: ``(seq_len, batch, input_size)`` tensor, or an
                unbatched ``(seq_len, input_size)`` tensor.

        Returns:
            Log-probabilities over the classes, shaped ``(batch, output_size)``
            for batched input and ``(output_size,)`` for unbatched input.
        """
        # Only the final hidden state is used; index 0 is the first RNN layer,
        # which is the only layer the constructor creates.
        _, hidden = self.rnn(line_tensor)
        output = self.h2o(hidden[0])
        return self.softmax(output)

In [None]:
class TrainingConfig:
    """Hyper-parameters read by the model-building and training cells."""

    # Random seed (not referenced by the code visible in this notebook).
    seed = 0

    # Width of the RNN hidden state.
    n_hidden = 128

    # Optimisation settings.
    num_epochs = 3
    learning_rate = 1e-4
    lr_warmup_steps = 500  # not referenced by the visible training loop

    # Number of examples per mini-batch.
    train_batch_size = 64
    eval_batch_size = 64  # not referenced by the visible code

In [None]:
import string
import unicodedata

allowed_characters = string.ascii_letters + " .,;'"
n_letters = len(allowed_characters)
n_letters

In [None]:
# Position of ``letter`` within ``allowed_characters``, e.g. "a" = 0;
# -1 when the character is not part of the allowed set.
def letterToIndex(letter):
    try:
        return allowed_characters.index(letter)
    except ValueError:
        return -1

# Turn a line into a <line_length x 1 x n_letters> tensor of one-hot letter
# vectors. Characters outside ``allowed_characters`` keep an all-zero row:
# ``letterToIndex`` returns -1 for them, and writing to index -1 would silently
# mark them as the last allowed character instead.
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        index = letterToIndex(letter)
        if index >= 0:
            tensor[position, 0, index] = 1
    return tensor

In [None]:
# processed_df.to_csv("processed_data/processed_phish.csv", index=False)

In [None]:
vectorizer = TfidfVectorizer()
# X_train = vectorizer.fit_transform(X_train)

# #Vectorize test texts.
# X_test = vectorizer.transform(X_test)

In [None]:
X_train.shape

In [None]:
X_train[:2]

In [None]:
# Vectorise only the first training document as a quick shape check.
# ``.iloc[0]`` selects by position: after the shuffled split the index label 0
# is not guaranteed to be in ``X_train``, so ``X_train[0]`` can raise KeyError.
# NOTE: this rebinds ``X_train`` to a tensor, so re-run the train/test split
# cell before executing this cell a second time.
X_train = vectorizer.fit_transform([X_train.iloc[0]])
X_train = X_train.toarray()
X_train = torch.tensor(X_train)

In [None]:
X_train.size(-1)

In [None]:
# Preview the token -> index mapping for the first 20 indices only; displaying
# the complete mapping would dump the entire vocabulary into the output.
{token: index for index, token in enumerate(vocab.get_itos()[:20])}

In [None]:
from io import open
import glob
import os
import time

from torch.utils.data import Dataset

class PhishDataset(Dataset):
    """Line-oriented text dataset built from every ``.csv`` file in a directory.

    Each non-blank line of each file is one example, and the file's base name
    (without extension) is that example's class label. Items are returned as
    ``(label_tensor, data_tensor)``:

    * ``label_tensor`` - ``torch.long`` tensor of shape ``(1,)`` holding the
      position of the label in ``labels_uniq``;
    * ``data_tensor`` - ``torch.float`` tensor of shape ``(1, 1, n_features)``
      (``<seq_len x batch x features>``) with the TF-IDF weights of the line;
      there is one feature per entry of ``vocab``.

    NOTE(review): lines are read as raw text, so a CSV header row and any extra
    columns become part of the example text - confirm that the files in
    ``data_dir`` really hold one example per line and one class per file.

    Args:
        data_dir: directory scanned (non-recursively) for ``*.csv`` files.
        vocab_size: size of ``vocab``; stored on the instance for reference.
        vocab: token -> index mapping, either an object exposing ``get_stoi()``
            or a plain mapping with contiguous indices starting at 0.
    """

    def __init__(self, data_dir, vocab_size, vocab):
        self.data_dir = data_dir  # for provenance of the dataset
        # Call localtime() so the actual load time is stored, not the function.
        self.load_time = time.localtime()  # for provenance of the dataset
        labels_set = set()  # set of all classes
        self.count = 0
        self.num_workers = 4
        self.data = []
        self.vocab = vocab
        self.vocab_size = vocab_size
        self.labels = []
        self._tfidf = None  # fitted lazily on the first data access

        # Read all the ``.csv`` files in the specified directory; sorting makes
        # the example order independent of the file system's listing order.
        for filename in sorted(glob.glob(os.path.join(data_dir, '*.csv'))):
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label)
            # ``with`` closes the file handle even if reading fails.
            with open(filename, encoding='utf-8') as handle:
                raw_lines = handle.read().strip().split('\n')
            lines = [line for line in raw_lines if line.strip()]
            self.data.extend(lines)
            self.labels.extend([label] * len(lines))
            self.count += len(lines)
            # One progress line per file instead of one per example.
            print(f"{len(lines)} lines loaded from {filename} ({self.count} in total)")

        # Sorted so that class indices are stable across runs (set iteration
        # order of strings is not).
        self.labels_uniq = sorted(labels_set)
        self._label_index = {label: i for i, label in enumerate(self.labels_uniq)}

    def __len__(self):
        return len(self.data)

    def _fitted_vectorizer(self):
        """Return a TF-IDF vectorizer fitted once on all lines of the dataset.

        The vectorizer is restricted to the tokens of ``self.vocab`` so every
        example is embedded in the same feature space, one feature per
        vocabulary entry.
        """
        if getattr(self, "_tfidf", None) is None:
            from sklearn.feature_extraction.text import TfidfVectorizer

            if hasattr(self.vocab, "get_stoi"):
                stoi = self.vocab.get_stoi()
            else:
                stoi = dict(self.vocab)
            self._tfidf = TfidfVectorizer(vocabulary=stoi).fit(self.data)
        return self._tfidf

    def __transform_data__(self, idx, type):
        """Build the tensor for example ``idx``.

        Args:
            idx: position of the example.
            type: ``"data"`` for the feature tensor, ``"label"`` for the class
                index tensor.

        Returns:
            The requested tensor, or ``None`` for any other ``type``.
        """
        if type == "data":
            text = self.data[idx]
            # ``transform`` (not ``fit_transform``): re-fitting on a single line
            # would give every example its own, incompatible feature space.
            weights = self._fitted_vectorizer().transform([text])
            features = torch.tensor(weights.toarray()).float()
            # Add the sequence axis: <seq_len=1 x batch=1 x features>.
            return features.unsqueeze(0)
        if type == "label":
            class_index = self._label_index[self.labels[idx]]
            return torch.tensor([class_index], dtype=torch.long)
        return None

    def __getitem__(self, idx):
        data_item = self.__transform_data__(idx, "data")
        data_label = self.__transform_data__(idx, "label")

        # Label first, then features.
        return data_label, data_item

In [None]:
unk_token = '<unk>'
vocab.set_default_index(vocab[unk_token])

In [None]:
data = PhishDataset("processed_data/", len(vocab), vocab)
print(f"loaded {len(data)} items of data")
# Show one raw line only; printing ``data.data`` would dump every example.
print(f"example = {data.data[0] if len(data) else None}")

In [None]:
train_set, test_set = torch.utils.data.random_split(data, [.85, .15], generator=torch.Generator(device=device).manual_seed(1))

print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}")

In [None]:
train_set[0][1]

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# import scipy
# X_train = torch.tensor(X_train).float()
# X_test = torch.tensor(X_test).float()
# y_train = torch.tensor(y_train.values)
# y_test = torch.tensor(y_test.values)

In [None]:
rnn = RNNNet(len(vocab), TrainingConfig.n_hidden, len(processed_df.label.unique()))
rnn

In [None]:
import time

In [None]:
# import random
# import numpy as np
# train_losses = []
# test_losses = []
# test_accuracies = []
# def train(rnn, X_train, y_train, n_batch_size=TrainingConfig.train_batch_size, n_epoch=TrainingConfig.num_epochs, report_every=50, learning_rate=TrainingConfig.learning_rate, criterion=nn.CrossEntropyLoss()):
#     current_loss = 0
#     losses = []
#     rnn.train()
#     optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

#     start = time.time()
#     print(f"training on data set with n = {len(processed_df['label'])}")

#     for iter in range(1, n_epoch + 1):
#         rnn.zero_grad()

#         output = rnn.forward(X_train)
#         loss = criterion(output, y_train)
#         train_loss = loss.item()
#         train_losses.append(train_loss)
#         # optimize parameters
#         loss.backward()
#         nn.utils.clip_grad_norm_(rnn.parameters(), 3)
#         optimizer.step()
#         optimizer.zero_grad()

#     # Turn off gradients for validation, saves memory and computations
#         with torch.no_grad():
#             rnn.eval()
#             log_ps = rnn(X_test)
#             test_loss = criterion(log_ps, y_test)
#             test_losses.append(test_loss)

#             ps = torch.exp(log_ps)
#             top_p, top_class = ps.topk(1, dim=1)
#             equals = top_class == y_test.view(*top_class.shape)
#             test_accuracy = torch.mean(equals.float())
#             test_accuracies.append(test_accuracy)

#         if iter % report_every == 0:
#             print(f"{iter} ({iter / n_epoch:.0%}): \t average batch loss = {train_losses[-1]}")
#         current_loss = 0

#     return all_losses



In [None]:
import random
import numpy as np

def train(rnn, training_data, n_epoch=TrainingConfig.num_epochs, n_batch_size=TrainingConfig.train_batch_size, report_every = 50, learning_rate=TrainingConfig.learning_rate, criterion = nn.CrossEntropyLoss()):
    """
    Train ``rnn`` on ``training_data`` with Adam and manually built mini-batches.

    Args:
        rnn: module to optimise; called with one example's feature tensor.
        training_data: indexable, sized collection whose items are
            ``(label, text)`` pairs.
        n_epoch: number of passes over ``training_data``.
        n_batch_size: target number of examples per optimiser step.
        report_every: print the epoch loss every this many epochs.
        learning_rate: Adam learning rate.
        criterion: loss called as ``criterion(output, label)``.

    Returns:
        A list with one value per epoch: the mean over that epoch's batches of
        the average per-example loss.

    Raises:
        ValueError: if ``training_data`` is empty.
    """
    n_examples = len(training_data)
    if n_examples == 0:
        raise ValueError("training_data is empty; nothing to train on")

    # Keep track of losses for plotting
    all_losses = []
    rnn.train()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

    # Always form at least one batch: with fewer examples than n_batch_size the
    # floor division is 0, which np.array_split rejects.
    n_batches = max(1, n_examples // n_batch_size)

    print(f"training on data set with n = {n_examples}")

    for epoch in range(1, n_epoch + 1):
        rnn.zero_grad() # clear the gradients

        # Build shuffled mini-batches of example indices by hand; examples are
        # fed to the model one at a time rather than through a DataLoader.
        indices = list(range(n_examples))
        random.shuffle(indices)
        batches = np.array_split(indices, n_batches)

        epoch_loss = 0
        for batch in batches:
            batch_loss = 0
            for i in batch: #for each example in this batch
                (label, text) = training_data[int(i)]
                # Call the module itself (not .forward) so registered hooks run.
                output = rnn(text)
                batch_loss += criterion(output, label)

            # optimize parameters
            batch_loss.backward()
            nn.utils.clip_grad_norm_(rnn.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += batch_loss.item() / len(batch)

        all_losses.append(epoch_loss / len(batches))
        if epoch % report_every == 0:
            print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")

    return all_losses

In [None]:
# perf_counter is a monotonic clock meant for measuring elapsed time.
start = time.perf_counter()
# Report after every epoch: with the default of 3 epochs, a reporting interval
# of 5 would never print any progress.
all_losses = train(rnn, train_set, report_every=1)
end = time.perf_counter()
print(f"training took {end-start}s")