In [6]:
# ! pip install -q mlflow nlp

## Importing libraries, loading and transforming data¶


In [7]:
#imports
import pandas as pd
import gc
import re
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, \
pipeline, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from nlp import Dataset
from imblearn.over_sampling import RandomOverSampler
import datasets
from transformers import pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()

In [8]:
# set parameters
train_fraction = 0.8 # fraction of a dataset used for training (the rest used for validation)
num_train_epochs = 3 # epochs to train
batch_size = 16 # batch size for training and validation
warmup_steps = 50
weight_decay = 0.02
BERT_MODEL = "distilbert-base-cased"
output_dir = "./phishing-email-detection"

In [9]:
df= pd.read_csv("../dataset/Phishing_Email.csv").drop(['Unnamed: 0'], axis=1).drop_duplicates()
print(df.shape)
df.sample(5).T

(17539, 2)


Unnamed: 0,1927,7268,9441,2512,4785
Email Text,how is everyone . get all your pres . cription...,"""Oh my Janitor, boom, boom, boom.""The best pla...",request submitted : access request for darin ....,fast delivery subliminally entice any woman to...,"Yes, I know, dreadful subject. However, becaus..."
Email Type,Phishing Email,Safe Email,Safe Email,Phishing Email,Safe Email


In [10]:
# create target
df['label'] = (df['Email Type']=="Phishing Email").astype(int)
df['title'] = df['Email Text']
df = df[['title', 'label']]

print(df.shape)
df.sample(20)

(17539, 2)


Unnamed: 0,title,label
4608,intelligent industrial automation ( iia ' 99 )...,0
12251,"All,Is it just me or has there been a massive ...",0
8761,"re : mutually agreed upon changes okay larry ,...",0
6797,fw : jeff delainey is recruiting everybody . i...,0
14379,"paliourg photoshop , windows , office . cheapy...",1
12567,organization meeting please rearrange your sch...,0
3827,candy super $ money maker you are receiving th...,1
11002,"On Fri, 2002-08-16 at 04:28, Angles Puglisi wr...",0
18464,re : saturday . here new and good day : i trie...,1
15613,hola hola soloescribo de estados unidos para q...,1


In [11]:
df.dropna(inplace=True)


In [12]:
df['label'].mean()


0.37393089291823467

In [13]:
# random oversampling of minority class - not needed because of adding weights to minority class
y = df[['label']]
df = df.drop(['label'], axis=1)
ros = RandomOverSampler(random_state=83)
df, y_resampled = ros.fit_resample(df, y)
del y
df['label'] = y_resampled
print(df.shape)
del y_resampled
gc.collect()

(21960, 2)


0

In [14]:
medium_dataset = Dataset.from_pandas(df)


In [15]:
del df
gc.collect()

0

In [16]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, use_fast=True, low_cpu_mem_usage=False)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [17]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["title"], truncation=True)

medium_dataset = medium_dataset.map(preprocess_function, batched=True)

100%|██████████| 22/22 [01:14<00:00,  3.40s/it]


In [18]:
medium_dataset = medium_dataset.train_test_split(test_size=1-train_fraction)
medium_dataset


100%|██████████| 18/18 [00:17<00:00,  1.04it/s]
100%|██████████| 5/5 [00:03<00:00,  1.25it/s]


{'train': Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 17568),
 'test': Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 4392)}

In [19]:
medium_dataset.remove_column_("title")  # remove the text column because we don't need to keep it in memory anymore
# this is not required but speeds things up a bit

In [20]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
tokenizer.decode(medium_dataset['train'][0]['input_ids'])


'[CLS] I want to thank those involved in making these RPMS available. Thanks guys, thanks Matthias. LanceOn Thu, 2002 - 08 - 29 at 14 : 30, Lance wrote : > Thanks Matthias. Actually I got all four speakers with subwoofer > working in digital out mode with gamixer. > ( http : / / www1. tcnet. ne. jp / fmurata / linux / down / ) > > However switching between analog and digital, I\'m still baffled. As I > have a tuner and cassette deck hooked up to " Line In " on a SBLive! 5. 1, > which is in analog mode. But digital out works great now! > > On Wed, 2002 - 08 - 28 at 23 : 26, Matthias Saou wrote : > > Once upon a time, Lance wrote : > > > > > Ok, I got ALSA installed and there is no static inbetween mp3s like > > > before which is great! My setup is digital 4. 1 but sound is only coming > > > from front 2 speakers and subwoofer, rear speakers there is no sound. > > > Also alsamixer or aumix are unresponsive as well. > > > > Maybe you could find more info or tips on the ALSA page for your 

## Loading and training model

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False # Whether the model returns all hidden-states.
)

model.config.id2label = {0: 'SAVE EMAIL', 1: 'PHISHING EMAIL'}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
print(model.num_parameters(only_trainable=True)/1e6)


65.783042


In [24]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 4.21kB [00:00, 408kB/s]                    


In [25]:
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir='./logs',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    logging_first_step=True,
    load_best_model_at_end=True,
    logging_steps=1,
    evaluation_strategy='epoch',
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    eval_steps=1,
    save_strategy='epoch',
    report_to="mlflow",  # log to mlflow
)

# Define the trainer: 
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=medium_dataset['train'],
    eval_dataset=medium_dataset['test'],
    data_collator=data_collator
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Get initial metrics
trainer.evaluate()

In [None]:
trainer.train()


In [None]:
trainer.evaluate()


# Saving the model and checking its performance with a sample title

In [None]:
trainer.save_model()


In [None]:
tokenizer.save_vocabulary(save_directory=output_dir)


In [None]:
# make a classification pipeline
pipe = pipeline("text-classification", output_dir, tokenizer=BERT_MODEL)
sample_title = '''Why do employees leave companies — analysis of IBM employee data'''
pipe(sample_title, top_k=None)

In [None]:
# use the upper cased version
sample_title2 = sample_title.upper()
pipe(sample_title2, top_k=None)

In [None]:
# more refined version
sample_title3 = '''Why do employees leave companies?'''
pipe(sample_title3, top_k=None)

In [None]:
# finally, save the model to Huggingface
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import create_repo
create_repo("dima806/phishing-email-detection", repo_type="model")

In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_folder(
    folder_path=output_dir,
    path_in_repo = ".",
    repo_id="dima806/phishing-email-detection",
    repo_type="model"
)