# Libraries

In [1]:
import math
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score, precision_score, recall_score, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate


from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Read in data

In [3]:
import os

# Location of the preprocessed dataset. The original author's absolute path is
# kept as the default so existing runs behave the same; set the
# PHISHING_DATASET_CSV environment variable to point at the file on any other
# machine instead of editing this cell.
DATASET_CSV = os.environ.get(
    "PHISHING_DATASET_CSV",
    r"c:\Users\magni\Documents\Classes\MIS581 Capstone\data\dataset_full_preprocessed.csv",
)

df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,1,2,0,292,1,0,0,0,0,1
4,2,0,0,0,0,0,0,0,0,0,...,1,2,1,3597,0,1,0,0,0,0


In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url', 'qty_tilde_url',
       'qty_comma_url', 'qty_plus_url', 'qty_asterisk_url', 'qty_hashtag_url',
       'qty_dollar_url', 'qty_percent_url', 'qty_tld_url', 'length_url',
       'email_in_url', 'qty_dot_domain', 'qty_hyphen_domain',
       'qty_underline_domain', 'qty_at_domain', 'qty_vowels_domain',
       'server_client_domain', 'domain_length', 'domain_in_ip',
       'qty_dot_directory', 'qty_hyphen_directory', 'qty_underline_directory',
       'qty_slash_directory', 'qty_equal_directory', 'qty_at_directory',
       'qty_and_directory', 'qty_exclamation_directory', 'qty_space_directory',
       'qty_tilde_directory', 'qty_comma_directory', 'qty_plus_directory',
       'qty_asterisk_directory', 'qty_dollar_directory',
       'qty_percent_directory', 'directory_length', 'qty_dot_file',
  

In [5]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(88647, 92)

# Split train/test

In [6]:
# Features are every column except the 'phishing' label. The label is removed
# by name (not by position) so it cannot end up in X if the column order of
# the CSV ever changes.
X = df.drop(columns=['phishing'])
y = df['phishing']

In [7]:
# Display the feature column labels.
X.columns

Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url', 'qty_tilde_url',
       'qty_comma_url', 'qty_plus_url', 'qty_asterisk_url', 'qty_hashtag_url',
       'qty_dollar_url', 'qty_percent_url', 'qty_tld_url', 'length_url',
       'email_in_url', 'qty_dot_domain', 'qty_hyphen_domain',
       'qty_underline_domain', 'qty_at_domain', 'qty_vowels_domain',
       'server_client_domain', 'domain_length', 'domain_in_ip',
       'qty_dot_directory', 'qty_hyphen_directory', 'qty_underline_directory',
       'qty_slash_directory', 'qty_equal_directory', 'qty_at_directory',
       'qty_and_directory', 'qty_exclamation_directory', 'qty_space_directory',
       'qty_tilde_directory', 'qty_comma_directory', 'qty_plus_directory',
       'qty_asterisk_directory', 'qty_dollar_directory',
       'qty_percent_directory', 'directory_length', 'qty_dot_file',
  

In [8]:
# Hold out 20% of the rows as the test partition; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Read in pretrained base classifiers and stacking ensemble

In [9]:
# Load the serialized classifier stored at models/rf.joblib and display it.
# NOTE(review): joblib.load unpickles the file, so only load model files that
# come from a trusted source.
rf_clf_op = joblib.load("models/rf.joblib")
rf_clf_op

RandomForestClassifier(n_estimators=10, random_state=0)

In [11]:
# Load the serialized estimator stored at models/et.joblib
# (presumably the ExtraTrees model, judging by the name - confirm).
et_clf_op = joblib.load("models/et.joblib")

In [13]:
# Load the serialized estimator stored at models/knn.joblib
# (presumably the k-nearest-neighbours model, judging by the name - confirm).
knn_clf_op = joblib.load("models/knn.joblib")

In [15]:
# Load the serialized estimator stored at models/svc_10k.joblib
# (presumably an SVC; the meaning of the "10k" suffix is not visible here - confirm).
svc_clf_op = joblib.load("models/svc_10k.joblib")

In [17]:
# Load the serialized estimator stored at models/lr.joblib
# (presumably the logistic regression model, judging by the name - confirm).
lr_clf_op = joblib.load("models/lr.joblib")

In [19]:
# Load the serialized estimator stored at models/abc.joblib
# (presumably the AdaBoost model, judging by the variable name - confirm).
adab_clf_op = joblib.load("models/abc.joblib")

In [24]:
# Load the serialized stacking ensemble stored at models/sclf_default.joblib
# and display its configuration.
sclf_op = joblib.load("models/sclf_default.joblib")
sclf_op

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(n_estimators=10,
                                                       random_state=0)),
                               ('knn', KNeighborsClassifier(n_neighbors=2)),
                               ('xgb',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1, gamma=0,
                                              gpu_id=-1, importance_type='gain',
                                              interaction_constraints='',
                                              learning_rate=0.300000012,
                                              max_delt...
                                              num_parallel_tree=1,
                                              random_state=0

# Cross Validation Scores

In [48]:
# List the scorer names accepted by the `scoring=` argument used below.
# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.1 and removed in
# 1.3; `get_scorer_names()` is its replacement. Fall back to SCORERS only on
# older releases that do not provide the new function.
if hasattr(sklearn.metrics, "get_scorer_names"):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sorted(sklearn.metrics.SCORERS)
scorer_names

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [25]:
# 10-fold cross-validation of the random forest on the held-out partition,
# collecting one score per fold for each requested metric.
# NOTE(review): the folds are drawn from X_test / y_test rather than the
# training split, and the fit_time entries in the result show the estimator is
# refit per fold, so the weights loaded from disk are not what is scored here.
# NOTE(review): 'r2' is normally a regression metric - confirm it is wanted.
cv_results_rf = cross_validate(
    estimator=rf_clf_op,
    X=X_test,
    y=y_test,
    scoring=('accuracy','r2','roc_auc','precision','recall','f1'),
    cv=10,
)
cv_results_rf

{'fit_time': array([0.25945234, 0.15179586, 0.15406489, 0.16420484, 0.16378164,
        0.15579176, 0.15978742, 0.15978837, 0.15582228, 0.15975785]),
 'score_time': array([0.05590844, 0.01600957, 0.01601005, 0.01526809, 0.01594925,
        0.01997209, 0.01600695, 0.01594782, 0.01597738, 0.01600647]),
 'test_accuracy': array([0.95375071, 0.96164693, 0.95769882, 0.96728708, 0.95375071,
        0.95205866, 0.96108291, 0.95939086, 0.95262267, 0.96108291]),
 'test_r2': array([0.79522579, 0.83018724, 0.8128515 , 0.85527182, 0.7953843 ,
        0.78789836, 0.82782338, 0.82033744, 0.79039368, 0.82782338]),
 'test_roc_auc': array([0.9870384 , 0.98784476, 0.98768528, 0.98919331, 0.98855857,
        0.98663396, 0.98705407, 0.98566356, 0.98366154, 0.98803502]),
 'test_precision': array([0.94156928, 0.94876033, 0.9408867 , 0.95261438, 0.94314381,
        0.93989983, 0.94876033, 0.93973941, 0.92996743, 0.95630252]),
 'test_recall': array([0.92307692, 0.93944354, 0.93627451, 0.95261438, 0.92156863,
 

In [26]:
# 10-fold cross-validation of the stacking classifier on the held-out
# partition, collecting one score per fold for each requested metric.
# NOTE(review): the folds are drawn from X_test / y_test rather than the
# training split, and the fit_time entries in the result show the estimator is
# refit per fold, so the weights loaded from disk are not what is scored here.
# NOTE(review): 'r2' is normally a regression metric - confirm it is wanted.
cv_results_sclf = cross_validate(
    estimator=sclf_op,
    X=X_test,
    y=y_test,
    scoring=('accuracy','r2','roc_auc','precision','recall','f1'),
    cv=10,
)
cv_results_sclf

{'fit_time': array([20.69032288, 20.67618275, 20.66428185, 20.48184371, 20.07361388,
        19.87319183, 20.09349036, 20.68197417, 20.70291924, 21.0216713 ]),
 'score_time': array([0.93734956, 0.93964505, 0.91222787, 0.89932632, 0.89204168,
        0.88762522, 0.89145231, 0.92352891, 0.94845462, 0.89856577]),
 'test_accuracy': array([0.95882685, 0.96446701, 0.96559504, 0.97292724, 0.95995488,
        0.95600677, 0.96108291, 0.95995488, 0.95882685, 0.96390299]),
 'test_r2': array([0.81770101, 0.84267348, 0.84778588, 0.88022496, 0.82283275,
        0.80536556, 0.82782338, 0.82283275, 0.81784212, 0.84029994]),
 'test_roc_auc': array([0.99159274, 0.99391393, 0.99266043, 0.99468426, 0.99382012,
        0.98990334, 0.9930925 , 0.99122489, 0.98786684, 0.99426627]),
 'test_precision': array([0.93527508, 0.9433657 , 0.9450727 , 0.95483871, 0.94417077,
        0.94352159, 0.94290375, 0.94271686, 0.9312    , 0.9551495 ]),
 'test_recall': array([0.94599018, 0.95417349, 0.95588235, 0.96732026, 0.9

# T-Test for score comparison

In [28]:
from scipy.stats import ttest_rel

In [30]:
# H0: mean per-fold accuracy of the random forest == that of the stacking classifier
# HA: mean per-fold accuracy of the random forest <  that of the stacking classifier
# Method: paired one-tailed t-test on the per-fold accuracy scores.
# NOTE(review): pairing assumes both cross_validate runs scored the same folds
# in the same order - confirm.
ALPHA = 0.05  # significance level for the test

ttest, pval = ttest_rel(
    cv_results_rf['test_accuracy'],
    cv_results_sclf['test_accuracy'],
    alternative='less',  # one-sided: first sample's mean is lower than the second's
)
print("t-test", '{0:.10f}'.format(ttest))
print("p-value", '{0:.10f}'.format(pval))

if pval < ALPHA:
    print("we reject null hypothesis")
else:
    # A non-significant result is not evidence that H0 is true, so the correct
    # conclusion is "fail to reject", never "accept".
    print("we fail to reject null hypothesis")

t-test -5.0765764209
p-value 0.0003329567
we reject null hypothesis
