## Importing the Libraries

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

In [8]:
#Read dataset
df = pd.read_csv('./data/CICIDS2017_sample.csv') 
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,DoS
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,DoS


In [9]:
df.Label.value_counts()

Label
BENIGN          22731
DoS             19035
PortScan         7946
BruteForce       2767
WebAttack        2180
Bot              1966
Infiltration       36
Name: count, dtype: int64

### Preprocessing (normalization and padding values)


In [10]:
# Z-score normalization
features = df.dtypes[df.dtypes != 'object'].index
df[features] = df[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# Fill empty values by 0
df = df.fillna(0)

In [29]:
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

In [30]:
df.Label.value_counts()

Label
0    22731
3    19035
5     7946
2     2767
6     2180
1     1966
4       36
Name: count, dtype: int64

Here the minority class instances are of the indexes 6,1,4 (the last 3 instances)

### split train set and test set using the sampled set

In [31]:
X = df.drop(['Label'],axis=1).values
y = df.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, test_size = 0.2, random_state = 0,stratify = y)

## Feature Engineering

### Feature selection by Information Gain

In [33]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X_train, y_train)

ValueError: Unknown label type: 'unknown'

#### Interpretation
Higher Scores: Features with higher mutual information scores are more informative about the target variable.
Lower Scores: Features with lower scores provide less information about the targetity.

#### Use Case in Feature Selection
Mutual information scores can be used to select a subset of features that are most relevant to the target variable. This can improve model performance by reducing overfitting and computational complexity.

In [15]:
# calculate the sum of importance scores
f_list = sorted(zip(map(lambda x: round(x, 4), importances), features), reverse=True)
Sum = 0
fs = []
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [16]:
# select the important features from top to bottom until the accumulated importance reaches 90%
f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2>=0.9:
        break        

In [17]:
X_fs = df[fs].values
##  X_fs_1 is to be checked without using the FCBF Filter
X_fs_1=df[fs].values

In [18]:
X_fs.shape

(56661, 45)

### Feature selection by Fast Correlation Based Filter (FCBF)

This is a Moudule imported from GitHub repo: https://github.com/SantiagoEG/FCBF_module, which is a custom Module

Certain features are redundant because they contain very similar information. FCBF can remove redundant features by calculating the correlation between each pair of features.

In [19]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBFK(k = 20)
#fcbf.fit(X_fs, y)

In [20]:
X_fss = fcbf.fit_transform(X_fs,y)

In [21]:
X_fss.shape

(56661, 20)

## Re-split train & test sets after feature selection


This is done including the FCBF Filtering

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_fss,y, train_size = 0.8, test_size = 0.2, random_state = 0,stratify = y)

In [23]:
X_train.shape

(45328, 20)

In [24]:
pd.Series(y_train).value_counts()

BENIGN          18184
DoS             15228
PortScan         6357
BruteForce       2213
WebAttack        1744
Bot              1573
Infiltration       29
Name: count, dtype: int64

### SMOTE to solve class-imbalance

In [25]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(n_jobs=-1,sampling_strategy={2:1000,4:1000})

In [27]:
unique_classes = np.unique(y_train)
print(f"Unique classes in y_train: {unique_classes}")

Unique classes in y_train: ['BENIGN' 'Bot' 'BruteForce' 'DoS' 'Infiltration' 'PortScan' 'WebAttack']


In [28]:
X_train, y_train = smote.fit_resample(X_train, y_train)

ValueError: The {2, 4} target class is/are not present in the data.

In [None]:
pd.Series(y_train).value_counts()

## Machine learning model training

### XGBoost with FCBF

In [None]:
xg = xgb.XGBClassifier(n_estimators = 10)
xg.fit(X_train,y_train)
xg_score=xg.score(X_test,y_test)
y_predict=xg.predict(X_test)
y_true=y_test
print('Accuracy of XGBoost: '+ str(xg_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of XGBoost: '+(str(precision)))
print('Recall of XGBoost: '+(str(recall)))
print('F1-score of XGBoost: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

#### Hyperparameter optimization (HPO) of XGBoost using Bayesian optimization with tree-based Parzen estimator (BO-TPE)
Based on the GitHub repo for HPO: https://github.com/LiYangHart/Hyperparameter-Optimization-of-Machine-Learning-Algorithms

In [None]:
import hyperopt

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
        'learning_rate':  abs(float(params['learning_rate'])),

    }
    clf = xgb.XGBClassifier( **params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)

    return {'loss':-score, 'status': STATUS_OK }

space = {
    'n_estimators': hp.quniform('n_estimators', 10, 100, 5),
    'max_depth': hp.quniform('max_depth', 4, 100, 1),
    'learning_rate': hp.normal('learning_rate', 0.01, 0.9),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("XGBoost: Hyperopt estimated optimum {}".format(best))

In [None]:
xg = xgb.XGBClassifier(learning_rate= 0.8391029526621202, n_estimators = 25, max_depth = 73)
xg.fit(X_train,y_train)
xg_score=xg.score(X_test,y_test)
y_predict=xg.predict(X_test)
y_true=y_test
print('Accuracy of XGBoost: '+ str(xg_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of XGBoost: '+(str(precision)))
print('Recall of XGBoost: '+(str(recall)))
print('F1-score of XGBoost: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
xg_train=xg.predict(X_train)
xg_test=xg.predict(X_test)

### XGBoost without FCBF

In [None]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_fs_1,y, train_size = 0.8, test_size = 0.2, random_state = 0,stratify = y)
pd.Series(y_train_1).value_counts()
smote=SMOTE(n_jobs=-1,sampling_strategy={2:1000,4:1000})
X_train_1, y_train_1 = smote.fit_resample(X_train_1, y_train_1)

In [None]:
xg = xgb.XGBClassifier(n_estimators = 10)
xg.fit(X_train_1,y_train_1)
xg_score=xg.score(X_test_1,y_test_1)
y_predict_1=xg.predict(X_test_1)
y_true_1=y_test_1
print('Accuracy of XGBoost: '+ str(xg_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true_1, y_predict_1, average='weighted') 
print('Precision of XGBoost: '+(str(precision)))
print('Recall of XGBoost: '+(str(recall)))
print('F1-score of XGBoost: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true_1,y_predict_1)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred_1")
plt.ylabel("y_true_1")
plt.show()

In [None]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
        'learning_rate':  abs(float(params['learning_rate'])),

    }
    clf = xgb.XGBClassifier( **params)
    clf.fit(X_train_1, y_train_1)
    y_pred_1 = clf.predict(X_test_1)
    score = accuracy_score(y_test_1, y_pred_1)

    return {'loss':-score, 'status': STATUS_OK }


space = {
    'n_estimators': hp.quniform('n_estimators', 10, 100, 5),
    'max_depth': hp.quniform('max_depth', 4, 100, 1),
    'learning_rate': hp.normal('learning_rate', 0.01, 0.9),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("XGBoost: Hyperopt estimated optimum {}".format(best))

In [None]:
xg = xgb.XGBClassifier(learning_rate= 0.1206862326628798, n_estimators = 55, max_depth = 75)
xg.fit(X_train_1,y_train_1)
xg_score=xg.score(X_test_1,y_test_1)
y_predict=xg.predict(X_test_1)
y_true_1=y_test_1
print('Accuracy of XGBoost: '+ str(xg_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true_1, y_predict_1, average='weighted') 
print('Precision of XGBoost: '+(str(precision)))
print('Recall of XGBoost: '+(str(recall)))
print('F1-score of XGBoost: '+(str(fscore)))
print(classification_report(y_true_1,y_predict_1))
cm=confusion_matrix(y_true_1,y_predict_1)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
xg_train_1=xg.predict(X_train_1)
xg_test_1=xg.predict(X_test_1)

### Random Forest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_fss,y, train_size = 0.8, test_size = 0.2, random_state = 0,stratify = y)
smote=SMOTE(n_jobs=-1,sampling_strategy={2:1000,4:1000})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train,y_train) 
rf_score=rf.score(X_test,y_test)
y_predict=rf.predict(X_test)
y_true=y_test
print('Accuracy of RF: '+ str(rf_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of RF: '+(str(precision)))
print('Recall of RF: '+(str(recall)))
print('F1-score of RF: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
# Hyperparameter optimization of random forest
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Define the objective function
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
        'max_features': int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    clf = RandomForestClassifier( **params)
    clf.fit(X_train,y_train)
    score=clf.score(X_test,y_test)

    return {'loss':-score, 'status': STATUS_OK }
# Define the hyperparameter configuration space
space = {
    'n_estimators': hp.quniform('n_estimators', 10, 200, 1),
    'max_depth': hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 20, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("Random Forest: Hyperopt estimated optimum {}".format(best))

In [None]:
rf_hpo = RandomForestClassifier(n_estimators = 27, min_samples_leaf = 1, max_depth = 45, min_samples_split = 2, max_features = 8, criterion = 'entropy')
rf_hpo.fit(X_train,y_train)
rf_score=rf_hpo.score(X_test,y_test)
y_predict=rf_hpo.predict(X_test)
y_true=y_test
print('Accuracy of RF: '+ str(rf_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of RF: '+(str(precision)))
print('Recall of RF: '+(str(recall)))
print('F1-score of RF: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
rf_train=rf_hpo.predict(X_train)
rf_test=rf_hpo.predict(X_test)

#### Random Forest without FCBF

In [None]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_fs_1,y, train_size = 0.8, test_size = 0.2, random_state = 0,stratify = y)
pd.Series(y_train_1).value_counts()
smote=SMOTE(n_jobs=-1,sampling_strategy={2:1000,4:1000})
X_train_1, y_train_1 = smote.fit_resample(X_train_1, y_train_1)

In [None]:
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train_1,y_train_1) 
rf_score=rf.score(X_test_1,y_test_1)
y_predict=rf.predict(X_test_1)
y_true_1=y_test_1
print('Accuracy of RF: '+ str(rf_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true_1, y_predict_1, average='weighted') 
print('Precision of RF: '+(str(precision)))
print('Recall of RF: '+(str(recall)))
print('F1-score of RF: '+(str(fscore)))
print(classification_report(y_true_1,y_predict_1))
cm=confusion_matrix(y_true_1,y_predict_1)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
# Hyperparameter optimization of random forest
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Define the objective function
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
        'max_features': int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    clf = RandomForestClassifier( **params)
    clf.fit(X_train_1,y_train_1)
    score=clf.score(X_test_1,y_test_1)

    return {'loss':-score, 'status': STATUS_OK }
# Define the hyperparameter configuration space
space = {
    'n_estimators': hp.quniform('n_estimators', 10, 200, 1),
    'max_depth': hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 20, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("Random Forest: Hyperopt estimated optimum {}".format(best))

In [None]:
rf_hpo = RandomForestClassifier(n_estimators = 146, min_samples_leaf = 1, max_depth = 39, min_samples_split = 8, max_features = 15, criterion = 'entropy')
rf_hpo.fit(X_train_1,y_train_1)
rf_score=rf_hpo.score(X_test_1,y_test_1)
y_predict_1=rf_hpo.predict(X_test_1)
y_true_1=y_test_1
print('Accuracy of RF: '+ str(rf_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true_1, y_predict_1, average='weighted') 
print('Precision of RF: '+(str(precision)))
print('Recall of RF: '+(str(recall)))
print('F1-score of RF: '+(str(fscore)))
print(classification_report(y_true_1,y_predict_1))
cm=confusion_matrix(y_true_1,y_predict_1)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
rf_train_1=rf_hpo.predict(X_train_1)
rf_test_1=rf_hpo.predict(X_test_1)

### Decision Tree

In [None]:
dt = DecisionTreeClassifier(random_state = 0)
dt.fit(X_train,y_train) 
dt_score=dt.score(X_test,y_test)
y_predict=dt.predict(X_test)
y_true=y_test
print('Accuracy of DT: '+ str(dt_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of DT: '+(str(precision)))
print('Recall of DT: '+(str(recall)))
print('F1-score of DT: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
# Hyperparameter optimization of decision tree
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Define the objective function
def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'max_features': int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    clf = DecisionTreeClassifier( **params)
    clf.fit(X_train,y_train)
    score=clf.score(X_test,y_test)

    return {'loss':-score, 'status': STATUS_OK }
# Define the hyperparameter configuration space
space = {
    'max_depth': hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 20, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50)
print("Decision tree: Hyperopt estimated optimum {}".format(best))

In [None]:
dt_hpo = DecisionTreeClassifier(min_samples_leaf = 4, max_depth = 17, min_samples_split = 8, max_features = 20, criterion = 'entropy')
dt_hpo.fit(X_train,y_train)
dt_score=dt_hpo.score(X_test,y_test)
y_predict=dt_hpo.predict(X_test)
y_true=y_test
print('Accuracy of DT: '+ str(dt_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of DT: '+(str(precision)))
print('Recall of DT: '+(str(recall)))
print('F1-score of DT: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
dt_train=dt_hpo.predict(X_train)
dt_test=dt_hpo.predict(X_test)

#### DF without FCBF

In [None]:
dt = DecisionTreeClassifier(random_state = 0)
dt.fit(X_train_1,y_train_1) 
dt_score=dt.score(X_test_1,y_test_1)
y_predict_1=dt.predict(X_test_1)
y_true_1=y_test_1
print('Accuracy of DT: '+ str(dt_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true_1, y_predict_1, average='weighted') 
print('Precision of DT: '+(str(precision)))
print('Recall of DT: '+(str(recall)))
print('F1-score of DT: '+(str(fscore)))
print(classification_report(y_true_1,y_predict_1))
cm=confusion_matrix(y_true_1,y_predict_1)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
# Hyperparameter optimization of decision tree
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Define the objective function
def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'max_features': int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    clf = DecisionTreeClassifier( **params)
    clf.fit(X_train_1,y_train_1)
    score=clf.score(X_test_1,y_test_1)

    return {'loss':-score, 'status': STATUS_OK }
# Define the hyperparameter configuration space
space = {
    'max_depth': hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 20, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50)
print("Decision tree: Hyperopt estimated optimum {}".format(best))

In [None]:
dt_hpo = DecisionTreeClassifier(min_samples_leaf = 2, max_depth = 34, min_samples_split = 9, max_features = 12, criterion = 'entropy')
dt_hpo.fit(X_train_1,y_train_1)
dt_score=dt_hpo.score(X_test_1,y_test_1)
y_predict_1=dt_hpo.predict(X_test_1)
y_true_1=y_test_1
print('Accuracy of DT: '+ str(dt_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true_1, y_predict_1, average='weighted') 
print('Precision of DT: '+(str(precision)))
print('Recall of DT: '+(str(recall)))
print('F1-score of DT: '+(str(fscore)))
print(classification_report(y_true_1,y_predict_1))
cm=confusion_matrix(y_true_1,y_predict_1)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
dt_train_1=dt_hpo.predict(X_train_1)
dt_test_1=dt_hpo.predict(X_test_1)

### CNN

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

In [None]:
# Reshape the data for CNN input (assuming your data is not in image format)
X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Create and compile CNN model
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Conv1D(64, 3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN model
model_cnn.fit(X_train_cnn, y_train, epochs=10, validation_data=(X_test_cnn, y_test))

# Evaluate CNN model
cnn_score = model_cnn.evaluate(X_test_cnn, y_test, verbose=0)
print('Accuracy of CNN: ' + str(cnn_score[1]))

# Predict using the model
y_predict_cnn = (model_cnn.predict(X_test_cnn) > 0.5).astype("int32")

# Calculate precision, recall, and F1-score
precision_cnn, recall_cnn, fscore_cnn, _ = precision_recall_fscore_support(y_test, y_predict_cnn, average='weighted')
print('Precision of CNN: ' + str(precision_cnn))
print('Recall of CNN: ' + str(recall_cnn))
print('F1-score of CNN: ' + str(fscore_cnn))

# Print classification report
print(classification_report(y_test, y_predict_cnn))

# Confusion matrix
cm_cnn = confusion_matrix(y_test, y_predict_cnn)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_cnn, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()


In [None]:
# Define the objective function for CNN
def cnn_objective(params):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=int(params['filters']), kernel_size=int(params['kernel_size']), activation='relu', input_shape=(X_train.shape[1], 1)),
        tf.keras.layers.MaxPooling1D(pool_size=int(params['pool_size'])),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(int(params['dense_units']), activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_cnn, y_train, epochs=int(params['epochs']), validation_data=(X_test_cnn, y_test), verbose=0)
    _, accuracy = model.evaluate(X_test_cnn, y_test, verbose=0)
    return {'loss': -accuracy, 'status': STATUS_OK}

# Define the hyperparameter configuration space for CNN
cnn_space = {
    'filters': hp.quniform('filters', 32, 128, 1),
    'kernel_size': hp.quniform('kernel_size', 3, 10, 1),
    'pool_size': hp.quniform('pool_size', 2, 5, 1),
    'dense_units': hp.quniform('dense_units', 32, 128, 1),
    'epochs': hp.quniform('epochs', 5, 20, 1)
}

# Perform hyperparameter optimization for CNN
cnn_trials = Trials()
best_cnn = fmin(fn=cnn_objective, space=cnn_space, algo=tpe.suggest, max_evals=50, trials=cnn_trials)

print("CNN: Hyperopt estimated optimum {}".format(best_cnn))

# Extract the best hyperparameters for CNN
best_filters = int(best_cnn['filters'])
best_kernel_size = int(best_cnn['kernel_size'])
best_pool_size = int(best_cnn['pool_size'])
best_dense_units = int(best_cnn['dense_units'])
best_epochs = int(best_cnn['epochs'])

# Create and train CNN model with best hyperparameters
cnn_hpo = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=best_filters, kernel_size=best_kernel_size, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    tf.keras.layers.MaxPooling1D(pool_size=best_pool_size),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(best_dense_units, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
cnn_hpo.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_hpo.fit(X_train_cnn, y_train, epochs=best_epochs, validation_data=(X_test_cnn, y_test))

# Evaluate CNN model with best hyperparameters
cnn_score = cnn_hpo.evaluate(X_test_cnn, y_test, verbose=0)
print('Accuracy of CNN with best hyperparameters: ' + str(cnn_score[1]))

# Predict using the CNN model with best hyperparameters
y_predict_cnn = (cnn_hpo.predict(X_test_cnn) > 0.5).astype("int32")

# Calculate precision, recall, and F1-score for CNN model
precision_cnn, recall_cnn, fscore_cnn, _ = precision_recall_fscore_support(y_test, y_predict_cnn, average='weighted')
print('Precision of CNN with best hyperparameters: ' + str(precision_cnn))
print('Recall of CNN with best hyperparameters: ' + str(recall_cnn))
print('F1-score of CNN with best hyperparameters: ' + str(fscore_cnn))

# Print classification report for CNN model
print(classification_report(y_test, y_predict_cnn))

# Confusion matrix for CNN model
cm_cnn = confusion_matrix(y_test, y_predict_cnn)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_cnn, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
cnn_train=cnn_hpo.predict(X_train_cnn)
cnn_test=cnn_hpo.predict(X_test_cnn)

#### CNN without FCBF

In [None]:
# Reshape the data for CNN input (assuming your data is not in image format)
X_train_cnn_1 = X_train_1.reshape(X_train_1.shape[0], X_train_1.shape[1], 1)
X_test_cnn_1 = X_test_1.reshape(X_test_1.shape[0], X_test_1.shape[1], 1)

# Create and compile CNN model
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Conv1D(64, 3, activation='relu', input_shape=(X_train_cnn_1.shape[1], 1)),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN model
model_cnn.fit(X_train_cnn_1, y_train_1, epochs=10, validation_data=(X_test_cnn_1, y_test_1))

# Evaluate CNN model
cnn_score = model_cnn.evaluate(X_test_cnn_1, y_test_1, verbose=0)
print('Accuracy of CNN: ' + str(cnn_score[1]))

# Predict using the model
y_predict_cnn_1 = (model_cnn.predict(X_test_cnn_1) > 0.5).astype("int32")

# Calculate precision, recall, and F1-score
precision_cnn, recall_cnn, fscore_cnn, _ = precision_recall_fscore_support(y_test_1, y_predict_cnn_1, average='weighted')
print('Precision of CNN: ' + str(precision_cnn))
print('Recall of CNN: ' + str(recall_cnn))
print('F1-score of CNN: ' + str(fscore_cnn))

# Print classification report
print(classification_report(y_test_1, y_predict_cnn_1))

# Confusion matrix
cm_cnn = confusion_matrix(y_test_1, y_predict_cnn_1)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_cnn, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()


In [None]:
# Define the objective function for CNN
def cnn_objective(params):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=int(params['filters']), kernel_size=int(params['kernel_size']), activation='relu', input_shape=(X_train_1.shape[1], 1)),
        tf.keras.layers.MaxPooling1D(pool_size=int(params['pool_size'])),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(int(params['dense_units']), activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_cnn_1, y_train_1, epochs=int(params['epochs']), validation_data=(X_test_cnn_1, y_test_1), verbose=0)
    _, accuracy = model.evaluate(X_test_cnn_1, y_test_1, verbose=0)
    return {'loss': -accuracy, 'status': STATUS_OK}

# Define the hyperparameter configuration space for CNN
cnn_space = {
    'filters': hp.quniform('filters', 32, 128, 1),
    'kernel_size': hp.quniform('kernel_size', 3, 10, 1),
    'pool_size': hp.quniform('pool_size', 2, 5, 1),
    'dense_units': hp.quniform('dense_units', 32, 128, 1),
    'epochs': hp.quniform('epochs', 5, 20, 1)
}

# Perform hyperparameter optimization for CNN
cnn_trials = Trials()
best_cnn = fmin(fn=cnn_objective, space=cnn_space, algo=tpe.suggest, max_evals=50, trials=cnn_trials)

print("CNN: Hyperopt estimated optimum {}".format(best_cnn))

# Extract the best hyperparameters for CNN
best_filters = int(best_cnn['filters'])
best_kernel_size = int(best_cnn['kernel_size'])
best_pool_size = int(best_cnn['pool_size'])
best_dense_units = int(best_cnn['dense_units'])
best_epochs = int(best_cnn['epochs'])

# Create and train CNN model with best hyperparameters
cnn_hpo = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=best_filters, kernel_size=best_kernel_size, activation='relu', input_shape=(X_train_cnn_1.shape[1], 1)),
    tf.keras.layers.MaxPooling1D(pool_size=best_pool_size),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(best_dense_units, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
cnn_hpo.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_hpo.fit(X_train_cnn_1, y_train_1, epochs=best_epochs, validation_data=(X_test_cnn_1, y_test_1))

# Evaluate CNN model with best hyperparameters
cnn_score = cnn_hpo.evaluate(X_test_cnn_1, y_test_1, verbose=0)
print('Accuracy of CNN with best hyperparameters: ' + str(cnn_score[1]))

# Predict using the CNN model with best hyperparameters
y_predict_cnn_1 = (cnn_hpo.predict(X_test_cnn_1) > 0.5).astype("int32")

# Calculate precision, recall, and F1-score for CNN model
precision_cnn, recall_cnn, fscore_cnn, _ = precision_recall_fscore_support(y_test_1, y_predict_cnn_1, average='weighted')
print('Precision of CNN with best hyperparameters: ' + str(precision_cnn))
print('Recall of CNN with best hyperparameters: ' + str(recall_cnn))
print('F1-score of CNN with best hyperparameters: ' + str(fscore_cnn))

# Print classification report for CNN model
print(classification_report(y_test_1, y_predict_cnn_1))

# Confusion matrix for CNN model
cm_cnn = confusion_matrix(y_test_1, y_predict_cnn_1)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_cnn, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
cnn_train_1=cnn_hpo.predict(X_train_cnn_1)
cnn_test_1=cnn_hpo.predict(X_test_cnn_1)

### LSTM

In [None]:

model_lstm = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(X_train.shape[1], 1)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train LSTM model
model_lstm.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Evaluate LSTM model
lstm_score = model_lstm.evaluate(X_test, y_test, verbose=0)
print('Accuracy of LSTM: ' + str(lstm_score[1]))

# Predict using the model
y_predict_lstm = (model_lstm.predict(X_test) > 0.5).astype("int32")

# Calculate precision, recall, and F1-score
precision_lstm, recall_lstm, fscore_lstm, _ = precision_recall_fscore_support(y_test, y_predict_lstm, average='weighted')
print('Precision of LSTM: ' + str(precision_lstm))
print('Recall of LSTM: ' + str(recall_lstm))
print('F1-score of LSTM: ' + str(fscore_lstm))

# Print classification report
print(classification_report(y_test, y_predict_lstm))

# Confusion matrix
cm_lstm = confusion_matrix(y_test, y_predict_lstm)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_lstm, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()


In [None]:
# Define the objective function for LSTM
def lstm_objective(params):
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(units=int(params['units']), input_shape=(X_train.shape[1], 1)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=int(params['epochs']), validation_data=(X_test, y_test), verbose=0)
    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    return {'loss': -accuracy, 'status': STATUS_OK}

# Define the hyperparameter configuration space for LSTM
lstm_space = {
    'units': hp.quniform('units', 32, 128, 1),
    'epochs': hp.quniform('epochs', 5, 20, 1)
}

# Perform hyperparameter optimization for LSTM
lstm_trials = Trials()
best_lstm = fmin(fn=lstm_objective, space=lstm_space, algo=tpe.suggest, max_evals=50, trials=lstm_trials)

print("LSTM: Hyperopt estimated optimum {}".format(best_lstm))

# Extract the best hyperparameters for LSTM
best_units = int(best_lstm['units'])
best_epochs = int(best_lstm['epochs'])

# Create and train LSTM model with best hyperparameters
lstm_model = tf.keras.Sequential([
    tf.keras.layers.LSTM(units=best_units, input_shape=(X_train.shape[1], 1)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=best_epochs, validation_data=(X_test, y_test))

# Evaluate LSTM model with best hyperparameters
lstm_score = lstm_model.evaluate(X_test, y_test, verbose=0)
print('Accuracy of LSTM with best hyperparameters: ' + str(lstm_score[1]))

# Predict using the LSTM model with best hyperparameters
y_predict_lstm = (lstm_model.predict(X_test) > 0.5).astype("int32")

# Calculate precision, recall, and F1-score for LSTM model
precision_lstm, recall_lstm, fscore_lstm, _ = precision_recall_fscore_support(y_test, y_predict_lstm, average='weighted')
print('Precision of LSTM with best hyperparameters: ' + str(precision_lstm))
print('Recall of LSTM with best hyperparameters: ' + str(recall_lstm))
print('F1-score of LSTM with best hyperparameters: ' + str(fscore_lstm))

# Print classification report for LSTM model
print(classification_report(y_test, y_predict_lstm))

# Confusion matrix for LSTM model
cm_lstm = confusion_matrix(y_test, y_predict_lstm)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_lstm, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()


In [None]:
lstm_train=lstm_model.predict(X_train)
lstm_test=lstm_model.predict(X_test)

#### LSTM without FCBF

In [None]:

model_lstm = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(X_train.shape[1], 1)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train LSTM model
model_lstm.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Evaluate LSTM model
lstm_score = model_lstm.evaluate(X_test, y_test, verbose=0)
print('Accuracy of LSTM: ' + str(lstm_score[1]))

# Predict using the model
y_predict_lstm = (model_lstm.predict(X_test) > 0.5).astype("int32")

# Calculate precision, recall, and F1-score
precision_lstm, recall_lstm, fscore_lstm, _ = precision_recall_fscore_support(y_test, y_predict_lstm, average='weighted')
print('Precision of LSTM: ' + str(precision_lstm))
print('Recall of LSTM: ' + str(recall_lstm))
print('F1-score of LSTM: ' + str(fscore_lstm))

# Print classification report
print(classification_report(y_test, y_predict_lstm))

# Confusion matrix
cm_lstm = confusion_matrix(y_test, y_predict_lstm)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_lstm, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()


### SVM

In [None]:
from sklearn.svm import SVC

svm = SVC(random_state=0)
svm.fit(X_train, y_train)

# Evaluate SVM classifier
svm_score = svm.score(X_test, y_test)
print('Accuracy of SVM: ' + str(svm_score))

# Predict using SVM classifier
y_predict = svm.predict(X_test)
y_true = y_test

# Calculate precision, recall, and F1-score for SVM classifier
precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_predict, average='weighted')
print('Precision of SVM: ' + str(precision))
print('Recall of SVM: ' + str(recall))
print('F1-score of SVM: ' + str(fscore))

# Print classification report for SVM classifier
print(classification_report(y_true, y_predict))

# Confusion matrix for SVM classifier
cm = confusion_matrix(y_true, y_predict)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
# Define the objective function for SVM
def svm_objective(params):
    params = {
        'C': float(params['C']),
        'kernel': str(params['kernel']),
        'gamma': str(params['gamma']) if params['kernel'] == 'rbf' else 'scale',  # gamma only for 'rbf' kernel
        'degree': int(params['degree']) if params['kernel'] == 'poly' else 3,  # degree only for 'poly' kernel
    }
    clf = SVC(**params, random_state=0)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    return {'loss': -score, 'status': STATUS_OK}

# Define the hyperparameter configuration space for SVM
svm_space = {
    'C': hp.loguniform('C', np.log(1e-6), np.log(1e+3)),
    'kernel': hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    'gamma': hp.choice('gamma', ['scale', 'auto']),
    'degree': hp.quniform('degree', 2, 5, 1)
}

# Perform hyperparameter optimization for SVM
svm_trials = Trials()
best_svm = fmin(fn=svm_objective, space=svm_space, algo=tpe.suggest, max_evals=50, trials=svm_trials)

print("SVM: Hyperopt estimated optimum {}".format(best_svm))

# Extract the best hyperparameters for SVM
best_svm_params = {
    'C': best_svm['C'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'][best_svm['kernel']],
    'gamma': ['scale', 'auto'][best_svm['gamma']],
    'degree': int(best_svm['degree']) if 'degree' in best_svm else 3
}

# Create and train SVM model with best hyperparameters
svm_hpo = SVC(**best_svm_params, random_state=0)
svm_hpo.fit(X_train, y_train)
svm_score = svm_hpo.score(X_test, y_test)
print('Accuracy of SVM with best hyperparameters: ' + str(svm_score))

# Predict using the SVM model with best hyperparameters
y_predict_svm = svm_hpo.predict(X_test)
y_true = y_test

# Calculate precision, recall, and F1-score for SVM model
precision_svm, recall_svm, fscore_svm, _ = precision_recall_fscore_support(y_true, y_predict_svm, average='weighted')
print('Precision of SVM with best hyperparameters: ' + str(precision_svm))
print('Recall of SVM with best hyperparameters: ' + str(recall_svm))
print('F1-score of SVM with best hyperparameters: ' + str(fscore_svm))

# Print classification report for SVM model
print(classification_report(y_true, y_predict_svm))

# Confusion matrix for SVM model
cm_svm = confusion_matrix(y_true, y_predict_svm)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_svm, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

In [None]:
svm_train=svm_hpo.predict(X_train)
svm_test=svm_hpo.predict(X_test)

### Apply Stacking
The ensemble model that combines the four ML models (DT, RF, CNN, LSTM, XGBoost)

In [None]:
base_predictions_train = pd.DataFrame( {
    'DecisionTree': dt_train.ravel(),
        'RandomForest': rf_train.ravel(),
     'CNN': cnn_train.ravel(),
    'LSTM': lstm_train.ravel(),
    'SVM':svm_train.ravel(),
     'XgBoost': xg_train.ravel(),
    })
base_predictions_train.head(5)

In [None]:
dt_train=dt_train.reshape(-1, 1)
cnn_train=cnn_train.reshape(-1, 1)
lstm_train=lstm_train.reshape(-1, 1)
rf_train=rf_train.reshape(-1, 1)
xg_train=xg_train.reshape(-1, 1)
svm_train=svm_train.reshape(-1,1)

dt_test=dt_test.reshape(-1, 1)
cnn_test=cnn_test.reshape(-1, 1)
lstm_test=lstm_test.reshape(-1, 1)
rf_test=rf_test.reshape(-1, 1)
xg_test=xg_test.reshape(-1, 1)
svm_test=svm_test.reshape(-1,1)


In [None]:
dt_train.shape

In [None]:
x_train = np.concatenate(( dt_train, cnn_train, lstm_train, rf_train, xg_train, svm_train), axis=1)
x_test = np.concatenate(( dt_test, cnn_test, lstm_test, rf_test, xg_test, svm_test), axis=1)

In [None]:
stk = xgb.XGBClassifier().fit(x_train, y_train)
y_predict=stk.predict(x_test)
y_true=y_test
stk_score=accuracy_score(y_true,y_predict)
print('Accuracy of Stacking: '+ str(stk_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of Stacking: '+(str(precision)))
print('Recall of Stacking: '+(str(recall)))
print('F1-score of Stacking: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

#### Hyperparameter optimization (HPO) of the stacking ensemble model (XGBoost) using Bayesian optimization with tree-based Parzen estimator (BO-TPE)
Based on the GitHub repo for HPO: https://github.com/LiYangHart/Hyperparameter-Optimization-of-Machine-Learning-Algorithms

In [None]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
        'learning_rate':  abs(float(params['learning_rate'])),

    }
    clf = xgb.XGBClassifier( **params)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    score = accuracy_score(y_test, y_pred)

    return {'loss':-score, 'status': STATUS_OK }

space = {
    'n_estimators': hp.quniform('n_estimators', 10, 100, 5),
    'max_depth': hp.quniform('max_depth', 4, 100, 1),
    'learning_rate': hp.normal('learning_rate', 0.01, 0.9),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("XGBoost: Hyperopt estimated optimum {}".format(best))

In [None]:
xg = xgb.XGBClassifier(learning_rate= 0.22000784172473817, n_estimators = 35, max_depth = 49)
xg.fit(x_train,y_train)
xg_score=xg.score(x_test,y_test)
y_predict=xg.predict(x_test)
y_true=y_test
print('Accuracy of XGBoost: '+ str(xg_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of XGBoost: '+(str(precision)))
print('Recall of XGBoost: '+(str(recall)))
print('F1-score of XGBoost: '+(str(fscore)))
print(classification_report(y_true,y_predict))
cm=confusion_matrix(y_true,y_predict)
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

## Anomaly Based Detection

### Generate the port-scan datasets for unknown attack detection

In [None]:
df2=pd.read_csv('./data/CICIDS2017_sample_km.csv')

In [None]:
df2.Label.value_counts()

In [None]:
df3 = df2[df2['Label'] != 5]
df3['Label'][df3['Label'] > 0] = 1
df3.to_csv('./data/CICIDS2017_sample_km_without_portscan.csv',index=0)

In [None]:
df4 = df2[df2['Label'] == 5]
df4['Label'][df4['Label'] == 5] = 1
df4.to_csv('./data/CICIDS2017_sample_km_portscan.csv',index=0)

### Read the generated datasets for unknown attack detection

In [None]:
df3 = pd.read_csv('./data/CICIDS2017_sample_km_without_portscan.csv')
df4 = pd.read_csv('./data/CICIDS2017_sample_km_portscan.csv')

In [None]:
features = df3.drop(['Label'],axis=1).dtypes[df3.dtypes != 'object'].index
df3[features] = df3[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df4[features] = df4[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df3 = df3.fillna(0)
df4 = df4.fillna(0)

In [None]:
df3.Label.value_counts()

In [None]:
df4.Label.value_counts()

##### Aborting Anomaly Based here due to low sample size for sampled dataset!!!!!!