# Libraries

In [18]:
import math
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score, precision_score, recall_score, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate

from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Read in data

In [3]:
df = pd.read_csv(r"c:\Users\magni\Documents\Classes\MIS581 Capstone\data\dataset_full_preprocessed.csv")
df.head()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,1,2,0,292,1,0,0,0,0,1
4,2,0,0,0,0,0,0,0,0,0,...,1,2,1,3597,0,1,0,0,0,0


In [4]:
df.columns

Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url', 'qty_tilde_url',
       'qty_comma_url', 'qty_plus_url', 'qty_asterisk_url', 'qty_hashtag_url',
       'qty_dollar_url', 'qty_percent_url', 'qty_tld_url', 'length_url',
       'email_in_url', 'qty_dot_domain', 'qty_hyphen_domain',
       'qty_underline_domain', 'qty_at_domain', 'qty_vowels_domain',
       'server_client_domain', 'domain_length', 'domain_in_ip',
       'qty_dot_directory', 'qty_hyphen_directory', 'qty_underline_directory',
       'qty_slash_directory', 'qty_equal_directory', 'qty_at_directory',
       'qty_and_directory', 'qty_exclamation_directory', 'qty_space_directory',
       'qty_tilde_directory', 'qty_comma_directory', 'qty_plus_directory',
       'qty_asterisk_directory', 'qty_dollar_directory',
       'qty_percent_directory', 'directory_length', 'qty_dot_file',
  

In [6]:
df.shape

(88647, 92)

# Split train/test

In [7]:
X = df.iloc[:,:-1]
y = df['phishing']

In [8]:
X.columns

Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url', 'qty_tilde_url',
       'qty_comma_url', 'qty_plus_url', 'qty_asterisk_url', 'qty_hashtag_url',
       'qty_dollar_url', 'qty_percent_url', 'qty_tld_url', 'length_url',
       'email_in_url', 'qty_dot_domain', 'qty_hyphen_domain',
       'qty_underline_domain', 'qty_at_domain', 'qty_vowels_domain',
       'server_client_domain', 'domain_length', 'domain_in_ip',
       'qty_dot_directory', 'qty_hyphen_directory', 'qty_underline_directory',
       'qty_slash_directory', 'qty_equal_directory', 'qty_at_directory',
       'qty_and_directory', 'qty_exclamation_directory', 'qty_space_directory',
       'qty_tilde_directory', 'qty_comma_directory', 'qty_plus_directory',
       'qty_asterisk_directory', 'qty_dollar_directory',
       'qty_percent_directory', 'directory_length', 'qty_dot_file',
  

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base classifiers

In [None]:
RANDOM_SEED = 0

## Random Forest

In [38]:
rf_clf = RandomForestClassifier(n_estimators=10, random_state=RANDOM_SEED)
rf_clf.fit(X_train, y_train)
joblib.dump(rf_clf, "models/rf.joblib")

['models/rf.joblib']

## ExtraTreesClassifier

In [39]:
et_clf = ExtraTreesClassifier(n_estimators=5, random_state=RANDOM_SEED)
et_clf.fit(X_train, y_train)
joblib.dump(et_clf, "models/et.joblib")

['models/et.joblib']

## KNeighborsClassifier

In [40]:
knn_clf = KNeighborsClassifier(n_neighbors=2)
knn_clf.fit(X_train, y_train)
joblib.dump(knn_clf, "models/knn.joblib")

['models/knn.joblib']

## XGBClassifier

In [41]:
xgb_clf = XGBClassifier(n_estimators =10, random_state=RANDOM_SEED, verbosity=0)
xgb_clf.fit(X_train, y_train)
joblib.dump(xgb_clf, "models/xgb.joblib")

['models/xgb.joblib']

## LogisticRegression

In [42]:
lr_clf = LogisticRegression(C=20000, penalty='l2', random_state=RANDOM_SEED)
lr_clf.fit(X_train, y_train)
joblib.dump(lr_clf, "models/lr.joblib")

['models/lr.joblib']

## AdaBoostClassifier

In [43]:
adab_clf = AdaBoostClassifier(n_estimators=100, random_state=RANDOM_SEED)
adab_clf.fit(X_train, y_train)
joblib.dump(adab_clf, "models/abc.joblib")

['models/abc.joblib']

# MetaClassifier

In [33]:
# use probabilities for the meta-model

lr = LogisticRegression(random_state=RANDOM_SEED) # meta classifier

sclf_cv = StackingClassifier(estimators=[('rf', rf_clf), ('knn', knn_clf), ('xgb', xgb_clf), ('lr', lr_clf),
                                      ('et', et_clf), ('ad', adab_clf)], stack_method='auto', final_estimator=lr)

classifier_array = [rf_clf, knn_clf, xgb_clf, lr_clf, et_clf, adab_clf, sclf_cv]
labels_classifiers = [clf.__class__.__name__ for clf in classifier_array]

acc_list = []
var_list = []

for clf, label in zip(classifier_array, labels_classifiers):
    #cv_scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    clf = clf.fit(X_train, y_train)
    cv_scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
    acc_list.append(np.round(cv_scores.mean(),4))
    var_list.append(np.round(cv_scores.std(),4))
    #print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (cv_scores.mean(), cv_scores.std(), label))
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (cv_scores.mean(), cv_scores.std(), label))

Start Training
Finished Training
Accuracy: 0.9560 (+/- 0.0029) [RandomForestClassifier]
Accuracy: 0.8260 (+/- 0.0042) [KNeighborsClassifier]
Accuracy: 0.9538 (+/- 0.0023) [XGBClassifier]
Accuracy: 0.8866 (+/- 0.0136) [LogisticRegression]
Accuracy: 0.9506 (+/- 0.0025) [ExtraTreesClassifier]
Accuracy: 0.9445 (+/- 0.0037) [AdaBoostClassifier]
Accuracy: 0.9608 (+/- 0.0035) [StackingClassifier]


In [37]:
joblib.dump(sclf_cv, "models/sclf_default.joblib")

['models/sclf_default.joblib']