# Is it a Phish?

## Motivation and intro

## Importing necessary libraries

In [1]:
# Basic libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pickle
import random
sns.set()


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Classification metrics and dataset division
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [3]:

#Scikit-learn ML models
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


In [4]:

#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


In [5]:

# NLP 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re

In [6]:

#Feature engineering from scikit-learn for text based columns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

## Function to train the model

In [7]:

def _url_host_token_count(url):
    """Count the '.'-separated tokens in the slice of ``url`` treated as its host part.

    The slice starts 8 characters after the position returned by
    ``url.find('://')`` and ends at the last '/'. Inside that slice '/' and '-'
    are treated as additional '.' separators before splitting.

    NOTE(review): the start offset (``find + 2*len('://') + 2``) is kept exactly
    as originally written. test_model computes the same feature with the same
    slice, so changing it here alone would make train and test features differ.
    """
    start = '://'
    end = '/'
    host_part = url[url.find(start) + 2 * len(start) + 2: url.rfind(end)]
    host_part = host_part.replace('/', '.').replace('-', '.')
    return len(host_part.split('.'))


def _add_url_features(frame, urls):
    """Add the four URL-derived numeric columns to ``frame`` in place."""
    # 1. Length of the URL
    frame['URL_length'] = [len(ur) for ur in urls]
    # 2. Number of slashes
    frame['URL_slashes'] = [ur.count('/') for ur in urls]
    # 3. Number of dots
    frame['URL_dots'] = [ur.count('.') for ur in urls]
    # 4. Number of tokens in the host part
    frame['URL_host'] = [_url_host_token_count(ur) for ur in urls]


def train_model(filename, random_seed=None):
    """Train the three sub-models, save them to disk and pick a fusion threshold.

    Model 1: the best of five scikit-learn classifiers on scaled numeric features.
    Model 2: the best of the same five classifiers on TF-IDF features of the URL.
    Model 3: an Embedding + LSTM network on tokenised URL sequences.

    For models 1 and 2 "best" means the highest F1-score on the 20% held-out
    split. The fusion score is the plain sum of the three models' class-1
    probabilities; the threshold is the value in [0.5, 3) (step 0.05) that
    maximises F1 on the training split.

    Parameters
    ----------
    filename : str
        CSV file with the columns 'URL', 'Label', 'create_age(months)',
        'expiry_age(months)' and 'update_age(days)'.
    random_seed : int, optional
        Seed used for all three train/test splits. When omitted a random
        integer in [0, 50] is drawn, as before.

    Returns
    -------
    tuple
        ``(best_thr, model_name_1, model_name_2, model_name_3, sscaler, tokenizer)``:
        the chosen fusion threshold, the three model file names, the
        StandardScaler fitted on the raw numeric training features, and the
        Keras Tokenizer fitted on the training URLs.
    """
    # Reading csv file
    phish_data = pd.read_csv(filename)

    # Extracting url data
    clean_url = phish_data["URL"]

    # Additional URL based features
    _add_url_features(phish_data, clean_url)
    print('Feature engineering completed')

    # Numerical features for ML models
    num_data = phish_data[['create_age(months)', 'expiry_age(months)', 'update_age(days)', 'URL_length', 'URL_slashes', 'URL_dots', 'URL_host']].values
    num_lab = phish_data["Label"].values

    # Scaling input features. This scaler is fitted exactly once, on the RAW
    # features, and is the one returned to the caller. Re-fitting a scaler on
    # already-scaled data yields a near-identity transform, which would leave
    # test data unscaled.
    sscaler = StandardScaler()
    num_data_scaled = sscaler.fit_transform(num_data)

    # Random seed: shared by all three splits so their rows line up for fusion
    if random_seed is None:
        random_seed_val = random.randint(0, 50)
    else:
        random_seed_val = random_seed

    print('random seed value: ', random_seed_val)

    # Defining ML models
    names = ["Log-Reg", "Nearest Neighbors",
             "Decision Tree", "Random Forest", "AdaBoost"]

    classifiers = [
        LogisticRegression(),
        KNeighborsClassifier(5),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, criterion = 'gini', max_features = 'log2', n_estimators = 50),
        AdaBoostClassifier()]

    # Model 1 - numeric features

    # Train Test split
    X_train, X_test, y_train, y_test = train_test_split(num_data_scaled, num_lab, test_size = 0.2, random_state = random_seed_val)

    # Predicted class-1 probabilities, one column per classifier
    y_pred_mat_num = np.zeros((len(X_test), len(names)))
    y_pred_mat_num_train = np.zeros((len(X_train), len(names)))

    # F1-scores for training and test datasets
    f1_vals = []
    f1_vals_train = []

    models_1 = []

    for i, clf in enumerate(classifiers):

        clf.fit(X_train, y_train)

        y_pred_mat_num[:, i] = clf.predict_proba(X_test)[:, 1]
        y_pred_mat_num_train[:, i] = clf.predict_proba(X_train)[:, 1]

        f1_vals.append(f1_score(y_test, clf.predict(X_test)))
        f1_vals_train.append(f1_score(y_train, clf.predict(X_train)))

        models_1.append(clf)

    # Saving the best model 1 based on held-out F1-scores. The SAME index is
    # used for the train-side scores so that the fusion threshold is tuned on
    # the model that is actually saved and used at test time.
    best_idx_1 = int(np.argmax(f1_vals))
    y_num = y_pred_mat_num[:, best_idx_1]
    y_num_train = y_pred_mat_num_train[:, best_idx_1]
    best_model_1 = models_1[best_idx_1]

    print('model 1 training performance')
    print(np.argmax(f1_vals_train))
    print(np.max(f1_vals))

    # Must be pickled before the model-2 loop: that loop refits these same
    # estimator objects inside the text pipelines.
    model_name_1 = 'best_model_1.sav'
    with open(model_name_1, 'wb') as model_file:
        pickle.dump(best_model_1, model_file)
    print('Model 1 saved')

    # Model 2 - TF-IDF with original url and ML models

    X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(clean_url, num_lab, test_size = 0.2, random_state = random_seed_val)

    # Predicted class-1 probabilities for model 2
    y_pred_mat_text = np.zeros((len(X_test_text), len(names)))
    y_pred_mat_text_train = np.zeros((len(X_train_text), len(names)))

    # F1-scores for model 2
    f1_vals_text = []
    f1_vals_text_train = []

    models_2 = []

    for i, clf in enumerate(classifiers):

        # NLP pipeline with vectorization and TFIDF
        classifier = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', clf)])

        classifier.fit(X_train_text, y_train_text)

        y_pred_mat_text[:, i] = classifier.predict_proba(X_test_text)[:, 1]
        y_pred_mat_text_train[:, i] = classifier.predict_proba(X_train_text)[:, 1]

        f1_vals_text.append(f1_score(y_test_text, classifier.predict(X_test_text)))
        f1_vals_text_train.append(f1_score(y_train_text, classifier.predict(X_train_text)))
        models_2.append(classifier)

    # Saving best pipeline for model 2 (same index for test and train scores)
    best_idx_2 = int(np.argmax(f1_vals_text))
    y_text = y_pred_mat_text[:, best_idx_2]
    y_text_train = y_pred_mat_text_train[:, best_idx_2]
    best_model_2 = models_2[best_idx_2]

    model_name_2 = 'best_model_2.sav'
    with open(model_name_2, 'wb') as model_file:
        pickle.dump(best_model_2, model_file)
    print('Model 2 saved')

    # Model 3 - LSTM

    # Token-id sequences from URL for LSTM (deep learning model) with Keras
    total_word_count = 5000
    tokenizer = Tokenizer(num_words=total_word_count)
    tokenizer.fit_on_texts(clean_url)
    seq_length = 5  # Number of items in each sequence; test_model pads to the same length
    sequences = tokenizer.texts_to_sequences(clean_url)
    data = pad_sequences(sequences, maxlen=seq_length)

    # Training and testing sets for LSTM model
    x_train, x_test, y_train, y_test = train_test_split(data, num_lab, test_size=0.2, random_state=random_seed_val)

    # Defining Keras model
    model = Sequential()
    model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
    model.add(LSTM(seq_length, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model_name_3 = 'lstm_url.h5'
    # Fitting the LSTM model; the checkpoint callback writes the epoch with the
    # lowest validation loss to model_name_3.
    callbacks = [EarlyStopping(monitor='val_loss', patience=3),
                 ModelCheckpoint(filepath= model_name_3, monitor='val_loss', save_best_only=True)]

    model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 0)

    print('Model 3 saved')

    best_model = load_model(model_name_3)

    # Sigmoid outputs of the LSTM model, flattened to 1-D so they can be
    # summed with the probability vectors of the other two models
    y_pred_lstm_score = best_model.predict(x_test).reshape(-1)
    y_pred_lstm_score_train = best_model.predict(x_train).reshape(-1)

    # Combining prediction probabilities by simple addition. `fusion` (held-out
    # split) is not used for the threshold search below.
    fusion = y_num + y_text + y_pred_lstm_score
    fusion_train = y_num_train + y_text_train + y_pred_lstm_score_train

    # Choosing the threshold that maximises F1 on the training split
    thr_range = np.arange(0.5, 3, 0.05)
    f1_fusion_train = [
        f1_score(y_train_text, (fusion_train > thr).astype(int))
        for thr in thr_range
    ]

    best_thr = thr_range[np.argmax(f1_fusion_train)]

    print('Combined model training complete')

    # Returns best threshold, names of model files to load during testing,
    # and the fitted scaler / tokenizer needed to prepare test features
    return best_thr, model_name_1, model_name_2, model_name_3, sscaler, tokenizer

In [8]:
def test_model(train_file_name, test_file_name):
    """Train on one CSV, evaluate on another and return the fused F1-score.

    Calls ``train_model(train_file_name)`` to obtain the fusion threshold, the
    three saved model files, the fitted scaler and the fitted tokenizer. It then
    builds the same features for the test CSV, scores it with each model, sums
    the three class-1 scores and thresholds the sum.

    Parameters
    ----------
    train_file_name : str
        CSV file passed to ``train_model``.
    test_file_name : str
        CSV file to evaluate on; it must contain the columns 'URL', 'Label',
        'create_age(months)', 'expiry_age(months)' and 'update_age(days)'.

    Returns
    -------
    float
        F1-score of the combined (fusion) prediction on the test file.
    """
    # Read the test csv first so a bad path fails before the training run
    phish_data_1 = pd.read_csv(test_file_name)

    best_thr, model_name_1, model_name_2, model_name_3, sscaler, tokenizer = train_model(train_file_name)

    print('trained models and necessary parameters for testing obtained')

    # Getting url data
    clean_url = phish_data_1["URL"]

    # Feature preparation for test data (must mirror train_model exactly)
    phish_data_1['URL_length'] = [len(ur) for ur in clean_url]
    phish_data_1['URL_slashes'] = [ur.count('/') for ur in clean_url]
    phish_data_1['URL_dots'] = [ur.count('.') for ur in clean_url]

    start = '://'
    end = '/'
    len_host = []
    for ur in clean_url:
        # Same slice as in train_model; kept as-is so both sides agree
        temp = ur[ur.find(start) + 2 * len(start) + 2: ur.rfind(end)]
        temp = temp.replace('/', '.').replace('-', '.')
        len_host.append(len(temp.split('.')))
    phish_data_1['URL_host'] = len_host

    # Extracting numerical features for model
    num_data = phish_data_1[['create_age(months)', 'expiry_age(months)', 'update_age(days)', 'URL_length', 'URL_slashes', 'URL_dots', 'URL_host']].values
    num_lab = phish_data_1["Label"].values

    # Scaling the numerical data with the scaler fitted on training data
    num_data = sscaler.transform(num_data)  # features for model 1

    # Tokenizing URL data with the tokenizer fitted on training data. The
    # tokenizer is deliberately NOT refitted here: fitting on test URLs would
    # update its word counts and could change the token ids the LSTM was
    # trained with.
    seq_length = 5  # Number of items in each sequence; must match train_model
    sequences = tokenizer.texts_to_sequences(clean_url)
    data = pad_sequences(sequences, maxlen=seq_length)  # features for model 3

    # Loading models. The pickle files are the ones train_model just wrote;
    # do not point these names at files from an untrusted source.
    print('Loading model 1')
    with open(model_name_1, 'rb') as model_file:
        loaded_best_model_1 = pickle.load(model_file)
    best_model_1_scores = loaded_best_model_1.predict_proba(num_data)[:, 1]
    best_model_1_pred = loaded_best_model_1.predict(num_data)

    print('Loading model 2')
    with open(model_name_2, 'rb') as model_file:
        loaded_best_model_2 = pickle.load(model_file)
    best_model_2_scores = loaded_best_model_2.predict_proba(clean_url)[:, 1]
    best_model_2_pred = loaded_best_model_2.predict(clean_url)

    print('Loading model 3')
    loaded_best_model_3 = load_model(model_name_3)
    best_model_3_scores = loaded_best_model_3.predict(data).reshape(-1)
    # Class labels from the single sigmoid output: score above 0.5 -> 1
    best_model_3_pred = (best_model_3_scores > 0.5).astype('int32')

    # Fusion: sum of the three scores, thresholded at the value chosen in training
    best_fusion = best_model_1_scores + best_model_2_scores + best_model_3_scores
    fusion_pred = (best_fusion > best_thr).astype(int)

    print('Printing performance metrics - F1-score and confusion matrices')

    print('F1-score and confusion matrix from model 1: ')
    print(f1_score(num_lab, best_model_1_pred))
    print(confusion_matrix(num_lab, best_model_1_pred))
    print('')

    print('F1-score and confusion matrix from model 2: ')
    print(f1_score(num_lab, best_model_2_pred))
    print(confusion_matrix(num_lab, best_model_2_pred))
    print('')

    print('F1-score and confusion matrix from model 3: ')
    print(f1_score(num_lab, best_model_3_pred))
    print(confusion_matrix(num_lab, best_model_3_pred))
    print('')

    final_f1_score = f1_score(num_lab, fusion_pred)

    print('F1-score and confusion matrix from combined model: ')
    print(final_f1_score)
    print(confusion_matrix(num_lab, fusion_pred))
    print('')

    print('Combined model F1-score returned')

    return final_f1_score

In [9]:
# CSV split used to fit the models; passed to test_model as train_file_name.
train_file = 'FIU_Phishing_Mitre_Dataset_split_1.csv'

In [10]:
# CSV split used for evaluation; passed to test_model as test_file_name.
test_file = 'FIU_Phishing_Mitre_Dataset_split_2.csv'

In [11]:
# Trains on train_file, evaluates on test_file, and keeps the F1-score that test_model returns for the combined model.
final_score = test_model(train_file_name=train_file, test_file_name=test_file)

Feature engineering completed
random seed value:  38
model 1 training performance
1
0.9215686274509803
Model 1 saved
Model 2 saved
Train on 3096 samples, validate on 344 samples
Epoch 1/10
 - 1s - loss: 0.6828 - acc: 0.6437 - val_loss: 0.6541 - val_acc: 0.8808
Epoch 2/10
 - 0s - loss: 0.5638 - acc: 0.8346 - val_loss: 0.4474 - val_acc: 0.8401
Epoch 3/10
 - 0s - loss: 0.4020 - acc: 0.8521 - val_loss: 0.3440 - val_acc: 0.8779
Epoch 4/10
 - 0s - loss: 0.3310 - acc: 0.8753 - val_loss: 0.2978 - val_acc: 0.8837
Epoch 5/10
 - 0s - loss: 0.2621 - acc: 0.9076 - val_loss: 0.2600 - val_acc: 0.8953
Epoch 6/10
 - 0s - loss: 0.2166 - acc: 0.9276 - val_loss: 0.2349 - val_acc: 0.9128
Epoch 7/10
 - 0s - loss: 0.1772 - acc: 0.9448 - val_loss: 0.2162 - val_acc: 0.9215
Epoch 8/10
 - 0s - loss: 0.1435 - acc: 0.9622 - val_loss: 0.2025 - val_acc: 0.9273
Epoch 9/10
 - 0s - loss: 0.1173 - acc: 0.9713 - val_loss: 0.1946 - val_acc: 0.9302
Epoch 10/10
 - 0s - loss: 0.0963 - acc: 0.9777 - val_loss: 0.1929 - val_acc