In [1]:
# Import relevant libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pickle

sns.set()

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Classification metrics and dataset division
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
#Scikit-learn ML models
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [5]:
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


In [6]:
# NLP 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re

In [7]:
#Feature engineering from scikit-learn for text based columns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

In [8]:
#nltk.download('wordnet')
filename_1 = "FIU_Phishing_Mitre_Dataset_split_1.csv" #Training csv file here
filename_2 = "FIU_Phishing_Mitre_Dataset_split_2.csv" #Testing csv file here

In [9]:
def phishing(filename_1, filename_2):

    phish_data = pd.read_csv(filename_1)

    
    #Creating additional features
    clean_url = phish_data["URL"]

    
    #Length of URL
    len_url = []

    for ur in clean_url:
        len_url.append(len(ur))

    phish_data['URL_length'] = pd.Series(len_url)

    
    #Number of slashes in the URL
    len_slashes = []

    for ur in clean_url:
        len_slashes.append(ur.count('/') )

    phish_data['URL_slashes'] = pd.Series(len_slashes)

    
    #Number of dots in the URL
    len_dots = []

    for ur in clean_url:
        len_dots.append(ur.count('.') )

    phish_data['URL_dots'] = pd.Series(len_dots)

    
    #Number of words in the host name of URL
    len_host = []

    start = '://'
    end = '/'

    for ur in clean_url:
        temp = ur[ur.find(start)+2*len(start) + 2: ur.rfind(end)]
        temp = temp.replace('/','.')
        temp = temp.replace('-','.')
        len_host.append(len(temp.split('.')))

    phish_data['URL_host'] = pd.Series(len_host)

    total_word_count = 5000
    tokenizer = Tokenizer(num_words=total_word_count)
    tokenizer.fit_on_texts(clean_url)

    seq_length = 5 #Number of items in each sequence
    sequences = tokenizer.texts_to_sequences(clean_url)
    data = pad_sequences(sequences, maxlen=seq_length)

    num_data = phish_data[['create_age(months)', 'expiry_age(months)', 'update_age(days)', 'URL_length', 'URL_slashes', 'URL_dots', 'URL_host']].values
    num_lab = phish_data["Label"].values

    sscaler = StandardScaler()
    num_data_scaled = sscaler.fit_transform(num_data)
    num_data = num_data_scaled

    random_seed_val = 10

    X_train, X_test, Y_train, Y_test = train_test_split(num_data_scaled, num_lab, test_size = 0.2, random_state = random_seed_val)

    names = ["Log-Reg", "Nearest Neighbors",
             "Decision Tree", "Random Forest", "AdaBoost"]

    classifiers = [
        LogisticRegression(),
        KNeighborsClassifier(5),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, criterion = 'gini', max_features = 'log2', n_estimators = 50),
        AdaBoostClassifier()]


    y_pred_mat_num = np.zeros((len(X_test), len(names)))
    y_pred_mat_num_train = np.zeros((len(X_train), len(names)))

    f1_vals = []
    f1_vals_train = []

    i = 0

    models_1 = []

    for name, clf in zip(names, classifiers):

        clf.fit(X_train, Y_train)
        score = clf.score(X_test, Y_test)
        y_hat = clf.predict(X_test)

        y_pred_mat_num[:,i] = clf.predict_proba(X_test)[:,1]
        y_pred_mat_num_train[:,i] = clf.predict_proba(X_train)[:,1]

        f1_vals.append(f1_score(Y_test, y_hat))
        f1_vals_train.append(f1_score(Y_train, clf.predict(X_train)))

        models_1.append(clf)

        i += 1

    y_num = y_pred_mat_num[:,np.argmax(f1_vals)]
    y_num_train = y_pred_mat_num_train[:,np.argmax(f1_vals_train)]

    best_model_1 = models_1[np.argmax(f1_vals)]

    filename = 'best_model_1.sav'
    pickle.dump(best_model_1, open(filename, 'wb'))

    ## Text based features with Machine learning models

    X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(clean_url, num_lab, test_size = 0.2, random_state = random_seed_val)

    y_pred_mat_text = np.zeros((len(X_test_text), len(names)))
    y_pred_mat_text_train = np.zeros((len(X_train_text), len(names)))

    f1_vals_text = []
    f1_vals_text_train = []

    i = 0

    models_2 = []

    for name, clf in zip(names, classifiers):


        classifier = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', clf)])

        classifier.fit(X_train_text, y_train_text)
        y_hat_text = classifier.predict(X_test_text)

        y_pred_mat_text[:,i] = classifier.predict_proba(X_test_text)[:,1]
        y_pred_mat_text_train[:,i] = classifier.predict_proba(X_train_text)[:,1]

        f1_vals_text.append(f1_score(y_test_text, y_hat_text))
        f1_vals_text_train.append(f1_score(y_train_text, classifier.predict(X_train_text)))


        models_2.append(classifier)
        i += 1


    y_text = y_pred_mat_text[:,np.argmax(f1_vals_text)]
    y_text_train = y_pred_mat_text_train[:,np.argmax(f1_vals_text_train)]


    best_model_2 = models_2[np.argmax(f1_vals_text)]

    filename = 'best_model_2.sav'
    pickle.dump(best_model_2, open(filename, 'wb'))

    # Text based features with Deep Learning

    x_train, x_test, y_train, y_test = train_test_split(data, num_lab, test_size=0.2, random_state=random_seed_val)

    model = Sequential()
    model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
    model.add(LSTM(seq_length, dropout=0.3, recurrent_dropout=0.3))
    #model.add(Dense(5, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    ## Fit the model
    callbacks = [EarlyStopping(monitor='val_loss', patience=3),
                 ModelCheckpoint(filepath='lstm_url.h5', monitor='val_loss', save_best_only=True)]

    history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 0)

    best_model = load_model('lstm_url.h5')

    y_pred_lstm = best_model.predict_classes(x_test)

    y_pred_lstm_score = best_model.predict_proba(x_test)
    y_pred_lstm_score_train = best_model.predict_proba(x_train)

    y_pred_lstm_score = y_pred_lstm_score.reshape(len(y_pred_lstm_score),)
    y_pred_lstm_score_train = y_pred_lstm_score_train.reshape(len(y_pred_lstm_score_train),)

    fusion = y_num + y_text 
    #+ y_pred_lstm_score
    fusion_train = y_num_train + y_text_train 
    #+ y_pred_lstm_score_train

    f1_fusion_train = []

    thr_range = np.arange(0.5, 2, 0.05)

    for thr in thr_range:
        fusion_pred_train = []

        for val in fusion_train:
            if val > thr:
                fusion_pred_train.append(1)
            else:
                fusion_pred_train.append(0)


        fusion_pred_train = np.array(fusion_pred_train)

        f1_fusion_train.append(f1_score(y_train_text, fusion_pred_train))

    best_thr = thr_range[np.argmax(f1_fusion_train)]

#    print(best_thr)

    fusion_pred = []

    for val in fusion:
        if val > best_thr:
            fusion_pred.append(1)
        else:
            fusion_pred.append(0)

    fusion_pred = np.array(fusion_pred)



    
    #Testing phase begins here

    phish_data_1 = pd.read_csv(filename_2)

    clean_url = phish_data_1["URL"].values #features for model 2

    tokenizer.fit_on_texts(clean_url)

    seq_length = 5 #Number of items in each sequence

    sequences = tokenizer.texts_to_sequences(clean_url)
    data = pad_sequences(sequences, maxlen=seq_length) #features for model 3

    len_url = []

    for ur in clean_url:
        len_url.append(len(ur))

    phish_data_1['URL_length'] = pd.Series(len_url)

    len_slashes = []

    for ur in clean_url:
        len_slashes.append(ur.count('/') )

    phish_data_1['URL_slashes'] = pd.Series(len_slashes)

    len_dots = []

    for ur in clean_url:
        len_dots.append(ur.count('.') )

    phish_data_1['URL_dots'] = pd.Series(len_dots)

    len_host = []

    start = '://'
    end = '/'
    #print s[s.find(start)+len(start):s.rfind(end)]

    for ur in clean_url:
        temp = ur[ur.find(start)+2*len(start) + 2: ur.rfind(end)]
        temp = temp.replace('/','.')
        temp = temp.replace('-','.')
        len_host.append(len(temp.split('.')))

    phish_data_1['URL_host'] = pd.Series(len_host)

    num_data = phish_data_1[['create_age(months)', 'expiry_age(months)', 'update_age(days)', 'URL_length', 'URL_slashes', 'URL_dots', 'URL_host']].values
    num_lab = phish_data_1["Label"].values

    num_data_scaled = sscaler.transform(num_data)
    num_data = num_data_scaled #features for model 1

    loaded_best_model_1 = pickle.load(open('best_model_1.sav', 'rb'))
    best_model_1_scores = loaded_best_model_1.predict_proba(num_data)[:,1]

    clean_url = phish_data_1["URL"].values

    loaded_best_model_2 = pickle.load(open('best_model_2.sav', 'rb'))
    best_model_2_scores = loaded_best_model_2.predict_proba(clean_url)[:,1]

    loaded_best_model_3 = load_model('lstm_url.h5')
    best_model_3_scores = loaded_best_model_3.predict_proba(data)
    best_model_3_scores = best_model_3_scores.reshape(len(best_model_3_scores),)

    best_fusion = best_model_1_scores + best_model_2_scores 
    #+ best_model_3_scores

    fusion_pred = []

    for val in best_fusion:
        if val > best_thr:
            fusion_pred.append(1)
        else:
            fusion_pred.append(0)

    fusion_pred = np.array(fusion_pred)

    print('F1-score from model 1: ')
    print(f1_score(num_lab, loaded_best_model_1.predict(num_data)))
    print(confusion_matrix(num_lab, loaded_best_model_1.predict(num_data)))
    print('')
    print('F1-score from model 2: ')
    print(f1_score(num_lab, loaded_best_model_2.predict(clean_url)))
    print(confusion_matrix(num_lab, loaded_best_model_2.predict(clean_url)))
    print('')
    print('F1-score from model 3: ')
    print(f1_score(num_lab, loaded_best_model_3.predict_classes(data)))
    print(confusion_matrix(num_lab, loaded_best_model_3.predict_classes(data)))

    print('')
    print('F1-score from combined model (model_1+model_2) : ')
    print(f1_score(num_lab, fusion_pred))
    print(confusion_matrix(num_lab, fusion_pred))

    return f1_score(num_lab, fusion_pred)



In [10]:
phishing(filename_1, filename_2)

[[231  13]
 [ 22 233]]
F1-score from model 1: 
0.9087523277467412
[[206  38]
 [ 11 244]]

F1-score from model 2: 
0.9008264462809917
[[233  11]
 [ 37 218]]

F1-score from model 3: 
0.8486055776892432
[[210  34]
 [ 42 213]]

F1-score from combined model (model_1+model_2: 
0.9301397205588823
[[231  13]
 [ 22 233]]


0.9301397205588823