In [1]:
# ! pip install -q mlflow nlp

## Importing libraries, loading and transforming data¶


In [2]:
#imports
import pandas as pd
import gc
import re
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, \
pipeline, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from nlp import Dataset
from imblearn.over_sampling import RandomOverSampler
import datasets
from transformers import pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# set parameters
train_fraction = 0.8 # fraction of a dataset used for training (the rest used for validation)
num_train_epochs = 3 # epochs to train
batch_size = 16 # batch size for training and validation
warmup_steps = 50
weight_decay = 0.02
BERT_MODEL = "distilbert-base-cased"
output_dir = "./phishing-email-detection"

In [4]:
df= pd.read_csv("../dataset/Phishing_Email.csv").drop(['Unnamed: 0'], axis=1).drop_duplicates()
print(df.shape)
df.sample(5).T

(17539, 2)


Unnamed: 0,2060,3620,16134,12566,2770
Email Text,re : cancelflights for next week - i ' m stayi...,URL: http://jeremy.zawodny.com/blog/archives/0...,"check this out , it 's worth a look . wayward ...",lockdown your files message loading image not ...,re : counties / meter types for gemc and midco...
Email Type,Safe Email,Safe Email,Phishing Email,Phishing Email,Safe Email


In [5]:
# create target
df['label'] = (df['Email Type']=="Phishing Email").astype(int)
df['title'] = df['Email Text']
df = df[['title', 'label']]

print(df.shape)
df.sample(20)

(17539, 2)


Unnamed: 0,title,label
2033,cpe certificates for derivatives courses howdy...,0
17822,get me thru july newsletter the get me thru ne...,1
3727,entex contact list 1 / 12 / 00 per request of ...,0
7862,encounter article - shalesh ganjoo as a follow...,0
13877,feminist conference : call for papers we are p...,0
6807,"URL: http://www.newsisfree.com/click/-6,857277...",0
17047,fancy a flutter ? here 's a tip ! ! = 20 we ti...,1
7051,"\r\ncreditfixThank You,Your email address was ...",1
14182,paliourg udtih 7 wcwknoanopkt good morning pal...,1
13664,finest online pills here casbah befitting cofa...,1


In [6]:
df.dropna(inplace=True)


In [7]:
df['label'].mean()


0.37393089291823467

In [8]:
# random oversampling of minority class - not needed because of adding weights to minority class
y = df[['label']]
df = df.drop(['label'], axis=1)
ros = RandomOverSampler(random_state=83)
df, y_resampled = ros.fit_resample(df, y)
del y
df['label'] = y_resampled
print(df.shape)
del y_resampled
gc.collect()

(21960, 2)


0

In [9]:
medium_dataset = Dataset.from_pandas(df)


In [10]:
del df
gc.collect()

0

In [11]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, use_fast=True, low_cpu_mem_usage=False)



In [12]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["title"], truncation=True)

medium_dataset = medium_dataset.map(preprocess_function, batched=True)

100%|██████████| 22/22 [03:41<00:00, 10.06s/it]


In [13]:
medium_dataset = medium_dataset.train_test_split(test_size=1-train_fraction)
medium_dataset


100%|██████████| 18/18 [01:16<00:00,  4.25s/it]
100%|██████████| 5/5 [00:19<00:00,  3.88s/it]


{'train': Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 17568),
 'test': Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 4392)}

In [14]:
medium_dataset.remove_column_("title")  # remove the text column because we don't need to keep it in memory anymore
# this is not required but speeds things up a bit

In [15]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
tokenizer.decode(medium_dataset['train'][0]['input_ids'])


"[CLS] shipping confirmation, tracking number : soyyo 82136036993 ufpj order your medication online to save time, save money, and avoid embarassement. no prior perscription is required! packages are shipped descreetly for your privacy. order perscription medications with no doctor waiting room delays the illiterate of the 21 st century will not be those who cannot read and write, but those who cannot learn, unlearn, and relearn. alvin toffler i don't practice reading once a week. lots of times you have to pretend to join a parade in which you're not really interested in order to get where you're going. - christopher darlington morley ( 1890 - 1957 ) the parent arrived back on the scene. she gave me a tape by dr. laura meyers from ucla. i listened to that tape eight times. i listened over and over and heard the same thing again and again. ms. meyers said,'these kids may need to hear a word many times ( perhaps 72 times ) before they ever say a word. a computer can be patient and say it 

## Loading and training model

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False # Whether the model returns all hidden-states.
)

model.config.id2label = {0: 'SAVE EMAIL', 1: 'PHISHING EMAIL'}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
print(model.num_parameters(only_trainable=True)/1e6)


65.783042


In [19]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [20]:
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir='./logs',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    logging_first_step=True,
    load_best_model_at_end=True,
    logging_steps=1,
    evaluation_strategy='epoch',
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    eval_steps=1,
    save_strategy='epoch',
    report_to="mlflow",  # log to mlflow
)

# Define the trainer: 
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=medium_dataset['train'],
    eval_dataset=medium_dataset['test'],
    data_collator=data_collator
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Get initial metrics
trainer.evaluate()

In [None]:
trainer.train()


In [None]:
trainer.evaluate()


# Saving the model and checking its performance with a sample title

In [None]:
trainer.save_model()


In [None]:
tokenizer.save_vocabulary(save_directory=output_dir)


In [None]:
# make a classification pipeline
pipe = pipeline("text-classification", output_dir, tokenizer=BERT_MODEL)
sample_title = '''Why do employees leave companies — analysis of IBM employee data'''
pipe(sample_title, top_k=None)

In [None]:
# use the upper cased version
sample_title2 = sample_title.upper()
pipe(sample_title2, top_k=None)

In [None]:
# more refined version
sample_title3 = '''Why do employees leave companies?'''
pipe(sample_title3, top_k=None)

In [None]:
# finally, save the model to Huggingface
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import create_repo
create_repo("dima806/phishing-email-detection", repo_type="model")

In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_folder(
    folder_path=output_dir,
    path_in_repo = ".",
    repo_id="dima806/phishing-email-detection",
    repo_type="model"
)