In [3]:
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Reading Data

In [5]:
df = pd.read_csv('../../Dataset/NSL_new.csv')

In [6]:
df = df.drop( df[ (df.label != "normal") & (df.label != "neptune") & (df.label != "back") & (df.label != "land") & (df.label != "pod") & (df.label != "smurf") & (df.label != "teardrop") & (df.label != "teardrop") & (df.label != "mailbomb") & (df.label != "apache2") & (df.label != "processtable") & (df.label != "udpstorm") & (df.label != "worm")].index )
print(df.shape)

(113270, 123)


In [7]:
df = df.sample(n=20000)

In [8]:
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
121227,0,306,1075,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
27715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
39292,0,105,146,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
39487,0,516,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
54680,0,178,748,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal


In [9]:
X = df.drop(['label'], axis=1)
y = df['label']

In [10]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [11]:
y = y.replace( to_replace =  'normal' , value = 0 )

y = y.replace( to_replace =  'neptune' , value = 1 )
y = y.replace( to_replace =  'back' , value = 1 )
y = y.replace( to_replace =  'land' , value = 1 )
y = y.replace( to_replace =  'pod' , value = 1 )
y = y.replace( to_replace =  'smurf' , value = 1 )
y = y.replace( to_replace =  'teardrop' , value = 1 )
y = y.replace( to_replace =  'mailbomb' , value = 1 )
y = y.replace( to_replace =  'apache2' , value = 1 )
y = y.replace( to_replace =  'processtable' , value = 1 )
y = y.replace( to_replace =  'udpstorm' , value = 1 )
y = y.replace( to_replace =  'worm' , value = 1 )

 # correlated features

In [25]:
correlation_matrix = df.corr()
correlated_features = set()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
print(correlated_features)

{'num_root', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'protocol_type_udp', 'srv_rerror_rate', 'service_domain_u', 'service_ecr_i', 'dst_host_rerror_rate', 'flag_SF', 'srv_serror_rate', 'same_srv_rate', 'dst_host_srv_rerror_rate', 'flag_S0', 'flag_REJ', 'is_guest_login', 'dst_host_srv_serror_rate'}


In [20]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# (KNN-RF-ADA)_(DT-MLP)_MLP

In [21]:
# Create Learners per layer
layer_one_estimators = [
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1)),
                        ('ada_1', AdaBoostClassifier(random_state=0, learning_rate=0.1, n_estimators=1000))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20)),
                       ]

In [22]:
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=MLPClassifier())

In [23]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

clf =  RFECV(estimator=clf, step=1, cv=StratifiedKFold(10), scoring='accuracy')
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)

RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes

In [108]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(KNN-RF-ADA)_(DT-MLP)_MLP_ChiSquared.csv')

# (KNN-RF)_(DT-MLP)_MLP

In [109]:
# Create Learners per layer
layer_one_estimators = [
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20)),
                       ]
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=MLPClassifier())

In [110]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)

  Accuracy Precision   Recall
0   0.9954  0.993652  0.99511


In [111]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(KNN-RF)_(DT-MLP)_MLP_ChiSquared.csv')

# (RF-MLP)_(DT-KNN)_MLP

In [112]:
# Create Learners per layer
layer_one_estimators = [
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, min_samples_split=0.1)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1))
                       ]
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=MLPClassifier())

In [113]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)



  Accuracy Precision    Recall
0     0.99  0.988731  0.986797


In [114]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(RF-MLP)_(DT-KNN)_MLP_ChiSquared.csv')

# (KNN-RF)_(DT-MLP)_DT

In [115]:
# Create Learners per layer
layer_one_estimators = [
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20)),
                       ]

In [116]:
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=DecisionTreeClassifier())

In [117]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)

  Accuracy Precision    Recall
0   0.9918  0.994083  0.985819


In [118]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(KNN-RF)_(DT-MLP)_DT_ChiSquared.csv')

# (ADA-MLP)_(DT-RF)_MLP

In [119]:
# Create Learners per layer
layer_one_estimators = [
                        ('ada_1', AdaBoostClassifier(random_state=0, learning_rate=0.1, n_estimators=1000)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1))
                       ]
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=MLPClassifier())

In [120]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)



  Accuracy Precision    Recall
0    0.996  0.993662  0.996577


In [121]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(ADA-MLP)_(DT-RF)_MLP_ChiSquared.csv')

# (KNN-RF)_(DT-MLP)_DT

In [122]:
# Create Learners per layer
layer_one_estimators = [
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20))
                       ]

layer_two_estimators = [
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10))
                       ]

In [123]:
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=DecisionTreeClassifier())

In [124]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)



  Accuracy Precision   Recall
0   0.9882  0.990613  0.98044


In [125]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(MLP-RF)_(DT-KNN)_DT_ChiSquared.csv')

# (KNN-RF)_(DT-MLP)_DT

In [126]:
# Create Learners per layer
layer_one_estimators = [
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20)),
                       ]

In [127]:
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=KNeighborsClassifier())

In [128]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)

  Accuracy Precision    Recall
0   0.9948  0.994126  0.993154


In [129]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(KNN-RF)_(DT-MLP)_KNN_ChiSquared.csv')

# (KNN-RF)_(DT-MLP)_DT

In [130]:
# Create Learners per layer
layer_one_estimators = [
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20)),
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1))
                       ]

In [131]:
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=KNeighborsClassifier())

In [132]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)

  Accuracy Precision    Recall
0    0.989  0.991601  0.981418


In [133]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(MLP-RF)_(DT-KNN)_KNN_ChiSquared.csv')

# (KNN-RF)_(DT-MLP)_DT

In [134]:
# Create Learners per layer
layer_one_estimators = [
                        ('knn_1', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
                        ('mlp_2', MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=20))
                       ]

layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier(max_depth=13, min_samples_split=10)),
                        ('rn_1', RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=90, max_depth=19, 
                                                        min_samples_split=0.1))
                       ]

In [135]:
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=KNeighborsClassifier())

In [136]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

acurcy = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recal = recall_score(y_test, y_pred)

clf_Score = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall'])
clf_Score.at[0, 'Accuracy'] = acurcy
clf_Score.at[0, 'Precision'] = prec
clf_Score.at[0, 'Recall'] = recal
print(clf_Score)



  Accuracy Precision    Recall
0   0.9944  0.994121  0.992176


In [137]:
clf_Score.to_csv('NSL_DOS_Stacking_level2_(KNN-MLP)_(DT-RF)_KNN_ChiSquared.csv')