# **Simple binary classifier** 

This notebook will generate a binary classifier that will identify if an attack is benign or not. The following algorithms will be used:
* Logistic regression 
* Random forest 
* CatBoost

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import imblearn
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import scikitplot as skplt
import eli5
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_recall_curve, average_precision_score, roc_auc_score
from catboost import CatBoostClassifier
from catboost import Pool
from eli5.sklearn import PermutationImportance
from scipy.stats import spearmanr
from scipy.cluster import hierarchy 
from scipy.spatial.distance import squareform
from collections import defaultdict
from scipy.stats import ks_2samp
from scipy.stats import describe
from imblearn.over_sampling import SMOTENC
from collections import Counter
from IPython.display import display

In [3]:
def get_data():
    data_path = os.path.dirname(os.path.abspath('')) + "/data/combined/"
    df = pd.read_pickle(data_path+'combined_cleaned.pickle')
    return df

def del_std_equal_to_zero(d):
    print(d.shape)
    counts = d.nunique()
    to_del = [i for i,v in enumerate(counts) if v == 1]
    print(d.columns[to_del])
    print(to_del)
    d = d.drop(d.columns[to_del], axis = 1)
    print(d.shape)
    return d

def correlations(d):
    # plt.figure(figsize=(15,10))
    # placing the deature dataset into a spearmanr structure and then into a correlation
    corr_matrix = d.corr().abs()
    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    # Find index of feature columns with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    # Drop features 
    df.drop(df[to_drop], axis=1)
    return d

def print_report(ds_type, cls, X_vals, y_true, y_predict, plot_pr=False, plot_roc=False):
    print(f"Classification Report ({ds_type}):")
    print(classification_report(y_true, y_predict))
    print(f"Avg Precision Score: {average_precision_score(y_true, y_predict, average='weighted')}")
    
    if plot_roc:
        print(f"ROC AUC Score: {roc_auc_score(y_true, y_predict)}")
        skplt.metrics.plot_roc(y_true, cls.predict_proba(X_vals))
        plt.show()
        
    if plot_pr:
        skplt.metrics.plot_precision_recall(y_true, cls.predict_proba(X_vals))
        plt.show()
        
    print('\n')

def split_data(X, y, size, label):
    return train_test_split(X, y, test_size=size, stratify=label)

def one_hot_encoding(x, col):
    df = pd.get_dummies(x, columns=col)
    return df

def dummy_classifier(x,y):
    cls_dum = DummyClassifier('most_frequent')
    return cls_dum.fit(x, y)

def scale_data(x):
    scaledd = StandardScaler()
    return scaledd.fit(x)

def log_reg(x, y, sd):
    log_reg = LogisticRegression(solver='saga', n_jobs=-1, verbose=2)
    return log_reg.fit(sd.transform(x), y)

def random_f(x,y):
    rf = RandomForestClassifier(verbose=1, n_jobs=-1, class_weight='balanced')
    return rf.fit(x, y)

def calculate_misclassifiations(y, predictions):
    classifications = y.copy()
    classifications['pred'] = predictions
    counts = classifications.label.value_counts()
    
    misclassifications = classifications[classifications.detect_threat != classifications.pred]
    mc = pd.DataFrame(misclassifications.label.value_counts())
    mc['percentage'] = mc.apply(lambda x: x[0] / counts[x.name], axis=1)
    mc = mc.sort_values('percentage', ascending=False)
    
    print('Misclassifications:')
    display(mc)

df = get_data() 

X = df.drop(columns=['label', 'detect_type', 'detect_threat'])
y = df[['label', 'detect_type', 'detect_threat']]
X = del_std_equal_to_zero(X)
X = X.drop(columns=['timestamp', 'dst_port'])
X = correlations(X)



X_train, X_hold, y_train, y_hold = split_data(X, y, 0.3, y.detect_type)
X_eval, X_test, y_eval, y_test = split_data(X_hold, y_hold, 0.7, y_hold.detect_type)

X_train_oh = one_hot_encoding(X_train, ['protocol'])
X_eval_oh = one_hot_encoding(X_eval, ['protocol'])
X_test_oh = one_hot_encoding(X_test, ['protocol'])

# X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.3, stratify=y.detect_type)
# X_eval, X_test, y_eval, y_test = train_test_split(X_hold, y_hold, test_size=0.6, stratify=y_hold.detect_type)

# X_train_oh = pd.get_dummies(X_train, columns=['protocol'])
# X_eval_oh = pd.get_dummies(X_eval, columns=['protocol'])
# X_test_oh = pd.get_dummies(X_test, columns=['protocol'])

scaled_data = scale_data(X_train_oh)



X_train_oh = scaled_data.transform(X_train_oh)
X_eval_oh = scaled_data.transform(X_eval_oh)
X_test_oh = scaled_data.transform(X_test_oh)


# rf = random_f(X_train_oh, y_train.detect_threat)

# print_report('Train', rf, X_train_oh, y_train.detect_threat, rf.predict(X_train_oh))
# print_report('Eval', rf, X_eval_oh, y_eval.detect_threat, rf.predict(X_eval_oh), plot_pr=True)





(16137183, 79)
Index(['bwd_psh_flags', 'bwd_urg_flags', 'fwd_byts_b_avg', 'fwd_pkts_b_avg',
       'fwd_blk_rate_avg', 'bwd_byts_b_avg', 'bwd_pkts_b_avg',
       'bwd_blk_rate_avg'],
      dtype='object')
[33, 35, 57, 58, 59, 60, 61, 62]
(16137183, 71)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [10]:

from sklearn.decomposition import PCA
# Performing the principal component analysis. With just 19 components the variance ratio remains 99%, which is great.

pca = PCA(0.99)
pca.fit(X_train_oh)

X_train_oh = pca.transform(X_train_oh)
X_eval_oh = pca.transform(X_eval_oh)
X_test_oh = pca.transform(X_test_oh)



np.sum(pca.explained_variance_ratio_)

0.9911467977897559

In [12]:
pca.n_components_

23

In [14]:
# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features 
df.drop(df[to_drop], axis=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Unnamed: 0,dst_port,protocol,timestamp,flow_duration,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,...,init_bwd_win_byts,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_min,label,detect_type,detect_threat
0,443,6,28/02/2018 08:22:13,94658,6,7,708,387,0,118.000000,...,7484,20,0.0,0.0,0,0,0,Benign,0,0
1,443,6,28/02/2018 08:22:13,206,2,0,0,0,0,0.000000,...,-1,20,0.0,0.0,0,0,0,Benign,0,0
2,445,6,28/02/2018 08:22:15,165505,3,1,0,0,0,0.000000,...,8192,20,0.0,0.0,0,0,0,Benign,0,0
3,443,6,28/02/2018 08:22:16,102429,6,7,708,387,0,118.000000,...,7484,20,0.0,0.0,0,0,0,Benign,0,0
4,443,6,28/02/2018 08:22:16,167,2,0,0,0,0,0.000000,...,-1,20,0.0,0.0,0,0,0,Benign,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16232938,80,6,14/02/2018 10:53:23,10156986,5,5,1089,587,0,217.800003,...,31111,20,0.0,0.0,0,0,0,Benign,0,0
16232939,80,6,14/02/2018 10:53:33,117,2,0,0,0,0,0.000000,...,-1,20,0.0,0.0,0,0,0,Benign,0,0
16232940,80,6,14/02/2018 10:53:28,5095331,3,1,0,0,0,0.000000,...,29200,20,0.0,0.0,0,0,0,Benign,0,0
16232941,80,6,14/02/2018 10:53:28,5235511,3,1,0,0,0,0.000000,...,42780,20,0.0,0.0,0,0,0,Benign,0,0


In [None]:
from imbalanced_ensemble.ensemble import SelfPacedEnsembleClassifier
from sklearn.datasets import make_classification


In [None]:
clf = SelfPacedEnsembleClassifier(random_state=0)
clf.fit(X, y.detect_threat)
clf.predict(X)  