In [1]:
from scipy.io.arff import loadarff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
raw_data = loadarff("Training Dataset.arff")
df = pd.DataFrame(raw_data[0])
for c in df.columns:
    df[c] = df[c].apply(lambda x: int(x.decode('ascii'))) # remove weird binary values, leave as string for now
df.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [3]:
X = df.loc[:, df.columns != "Result"]
y = df.loc[:, "Result"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# PCA combined with Sequential Feature Selection, hyperparameter tuning included

In [4]:
def correct_classification_rate(estimator, X, y):
    y_hat = estimator.predict(X)
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y[i]!=y_hat[i]:
           FP += 1
        if y[i]==y_hat[i]==-1:
           TN += 1
        if y_hat[i]==-1 and y[i]!=y_hat[i]:
           FN += 1

    return (TP + TN) / (TP + FP + FN + TN)

def true_positive_rate(estimator, X, y):
    y_hat = estimator.predict(X)
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y[i]!=y_hat[i]:
           FP += 1
        if y[i]==y_hat[i]==-1:
           TN += 1
        if y_hat[i]==-1 and y[i]!=y_hat[i]:
           FN += 1

    return TP/(TP+FN)

def true_negative_rate(estimator, X, y):
    y_hat = estimator.predict(X)
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y[i]!=y_hat[i]:
           FP += 1
        if y[i]==y_hat[i]==-1:
           TN += 1
        if y_hat[i]==-1 and y[i]!=y_hat[i]:
           FN += 1

    return TN/(TN+FP)

def geometric_mean(estimator, X, y):
    y_hat = estimator.predict(X)
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y[i]!=y_hat[i]:
           FP += 1
        if y[i]==y_hat[i]==-1:
           TN += 1
        if y_hat[i]==-1 and y[i]!=y_hat[i]:
           FN += 1

    TPR = TP/(TP + FN)
    TNR = TN/(TN + FP)
    return np.sqrt(TPR * TNR)

In [5]:
# BPNN
from sklearn.neural_network import MLPClassifier
# NB
from sklearn.naive_bayes import GaussianNB
# SVM
from sklearn.svm import SVC
# C4.5 - decision tree
from sklearn.tree import DecisionTreeClassifier
# knn
from sklearn.neighbors import KNeighborsClassifier
# RF
from sklearn.ensemble import RandomForestClassifier

# All the results will be stored in a single dictionary
optimal_features_data = dict()

In [49]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def wrapper_feature_selection_with_grid_search(clf_class, param_grid_dict, X_train, X_test, y_train):
      clf1 = clf_class()
      clf2 = clf_class()
      
      sfs = SFS(estimator=clf1,
          k_features=(25, 30),
          forward=False,
          floating=False,
          scoring='accuracy',
          verbose=2,
          cv=5,
          n_jobs=6)
      pipe = Pipeline([('sfs', sfs),
                       ('clf2', clf2)])
      
      gs = GridSearchCV(estimator=pipe,
                        param_grid=param_grid_dict,
                        scoring='accuracy',
                        n_jobs=6,
                        cv=3,
                        refit=False)
      gs.fit(X_train, y_train)
      """
      sfs.fit(X_train, y_train)
      X_train_sfs = sfs.transform(X_train)
      X_test_sfs = sfs.transform(X_test)

      return gs.best_params_, X_train_sfs, X_test_sfs"""
      return gs

In [36]:
from mlxtend.preprocessing import standardize
from mlxtend.feature_extraction import PrincipalComponentAnalysis

pca = PrincipalComponentAnalysis(n_components=None)
X_train_standard = standardize(X_train)
pca.fit(X_train_standard)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [50]:
gb = GaussianNB
gs = wrapper_feature_selection_with_grid_search(gb, 
                                           {'clf2__var_smoothing': np.logspace(-9, 5, 15),
                                            'sfs__estimator__var_smoothing': np.logspace(-9, 5, 15)},
                                           X_train_pca,
                                           X_test_pca,
                                           y_train)

KeyboardInterrupt: 

In [33]:
print(correct_classification_rate(gb_good, X_test_sfs, y_test.to_numpy()))
print(true_positive_rate(gb_good, X_test_sfs, y_test.to_numpy()))
print(true_negative_rate(gb_good, X_test_sfs, y_test.to_numpy()))
print(geometric_mean(gb_good, X_test_sfs, y_test.to_numpy()))

0.89280868385346
0.9294019933554817
0.8490566037735849
0.8883213945519973
