In [5]:
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

In [7]:
# Read the dataset
df = pd.read_csv('./Downloads/df_all_data_preprocessed_v9_0.01_noise_40%.csv', encoding='utf-8')
# The results in this notebook are based on the original CICIDS2017 dataset. Please go to cell [10] if you work on the sampled dataset.

In [8]:
# Keep every row of the rare attack classes and draw a random fraction of each
# majority class. No seed is passed to sample(), so the drawn rows differ per run.
minority_mask = df['Label'].isin(['WebAttack', 'Bot', 'Infiltration'])
df_minor = df[minority_mask]
df_BENIGN = df[df['Label'] == 'BENIGN'].sample(frac=0.01)
df_DoS = df[df['Label'] == 'DoS'].sample(frac=0.05)
df_PortScan = df[df['Label'] == 'PortScan'].sample(frac=0.05)
df_BruteForce = df[df['Label'] == 'BruteForce'].sample(frac=0.2)

In [9]:
# Stack the sampled majority classes and the untouched minority classes row-wise.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same frame (original index labels are kept) in a single pass.
df_s = pd.concat([df_BENIGN, df_DoS, df_PortScan, df_BruteForce, df_minor])

In [10]:
# Order the sampled rows by their index labels.
df_s = df_s.sort_index()

In [11]:
# Save the sampled dataset (index=0 is falsy, so the index column is not written)
df_s.to_csv('./Downloads/CICIDS2017_sample1.csv',index=0)

In [12]:
# Min-max normalization: rescale every non-object column to the [0, 1] range.
non_object = (df.dtypes != 'object').to_numpy()
numeric_features = df.columns[non_object]
scaled = df[numeric_features].apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)
df[numeric_features] = scaled
# Constant columns divide 0 by 0 and come out as NaN; those, and any other
# missing values in the frame, are replaced by 0.
df = df.fillna(0)

In [13]:
# Encode the class names in 'Label' as integers and build the feature/target arrays.
# The column is addressed by name instead of iloc[:, -1]: the positional form
# silently encodes the wrong column if 'Label' is not last, and on recent pandas
# versions an iloc assignment writes into the existing object column, leaving y
# with object dtype, which scikit-learn classifiers reject as an unknown label type.
labelencoder = LabelEncoder()
df['Label'] = labelencoder.fit_transform(df['Label'])
X = df.drop(columns=['Label']).values
y = df['Label'].values  # already 1-D, no reshape/ravel round trip needed
# Stratified 80/20 hold-out split; unseeded, so the split changes between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=None, stratify=y)

In [14]:
# Two independent, initially empty accumulators filled by the model cells below.
train_accuracy, importance = [], []

In [15]:
# Decision tree on the full feature set; score() is computed on X_test / y_test.
dt = DecisionTreeClassifier(random_state=None).fit(X_train, y_train)
dt_score = dt.score(X_test, y_test)
train_accuracy += [dt_score]
print(f'Accuracy of DT: {dt_score!s}')

Accuracy of DT: 0.3008849557522124


In [16]:
train_accuracy

[0.3008849557522124]

In [17]:
# Random forest on the full feature set; score() is computed on X_test / y_test.
rf = RandomForestClassifier(random_state=None).fit(X_train, y_train)
rf_score = rf.score(X_test, y_test)
train_accuracy += [rf_score]
print(f'Accuracy of RF: {rf_score!s}')

Accuracy of RF: 0.40707964601769914


In [18]:
# Extra-trees ensemble on the full feature set; score() is computed on X_test / y_test.
et = ExtraTreesClassifier(random_state=None).fit(X_train, y_train)
et_score = et.score(X_test, y_test)
train_accuracy += [et_score]
print(f'Accuracy of ET: {et_score!s}')


Accuracy of ET: 0.415929203539823


In [19]:
# XGBoost classifier with n_estimators=10; score() is computed on X_test / y_test.
xg = xgb.XGBClassifier(n_estimators=10, random_state=None)
xg.fit(X_train, y_train)
xg_score = xg.score(X_test, y_test)
train_accuracy += [xg_score]
print(f'Accuracy of XGBoost: {xg_score!s}')

Accuracy of XGBoost: 0.39823008849557523


In [20]:
train_accuracy

[0.3008849557522124,
 0.40707964601769914,
 0.415929203539823,
 0.39823008849557523]

In [21]:
max(train_accuracy)

0.415929203539823

In [22]:
# Position of the highest score in train_accuracy; ties resolve to the earliest entry.
Max = max(range(len(train_accuracy)), key=train_accuracy.__getitem__)

In [23]:
Max

2

In [24]:
# Read the fitted importances of the four models and append them to `importance`
# in DT, RF, ET, XGBoost order.
dt_feature, rf_feature, et_feature, xgb_feature = (
    fitted.feature_importances_ for fitted in (dt, rf, et, xg)
)
importance.extend([dt_feature, rf_feature, et_feature, xgb_feature])

In [25]:
dt_feature

array([0.02767636, 0.01928462, 0.00764435, 0.00963662, 0.01766069,
       0.05700822, 0.00859989, 0.00808117, 0.03293729, 0.02113873,
       0.        , 0.        , 0.04830086, 0.06221928, 0.        ,
       0.03094558, 0.01866495, 0.02353871, 0.03959045, 0.01717744,
       0.00606336, 0.00767343, 0.00922022, 0.01665376, 0.00573326,
       0.01155561, 0.02145195, 0.00521206, 0.01346996, 0.00286663,
       0.        , 0.        , 0.        , 0.00286663, 0.00687991,
       0.09030984, 0.04120507, 0.00778552, 0.00831323, 0.0048489 ,
       0.00955543, 0.0197479 , 0.00429995, 0.        , 0.        ,
       0.        , 0.02291156, 0.00286663, 0.        , 0.        ,
       0.        , 0.02007076, 0.00382217, 0.        , 0.0282316 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06142262, 0.        , 0.0009923 ,
       0.06245266, 0.00402693, 0.00999981, 0.        , 0.00846339,
       0.        , 0.01516194, 0.00382217, 0.        , 0.00458

In [26]:
rf_feature

array([0.03177823, 0.012114  , 0.00969555, 0.01948033, 0.01302392,
       0.01837288, 0.007145  , 0.018066  , 0.00881191, 0.01988771,
       0.00982021, 0.01581772, 0.01694352, 0.03713032, 0.        ,
       0.04129263, 0.02383757, 0.03314341, 0.03120365, 0.02091995,
       0.02073462, 0.01992776, 0.02112428, 0.01644398, 0.01168892,
       0.0132823 , 0.01193838, 0.01428605, 0.00916294, 0.00096316,
       0.        , 0.        , 0.        , 0.015871  , 0.01219641,
       0.05628424, 0.04444723, 0.01013623, 0.01843412, 0.02171317,
       0.01965719, 0.02269024, 0.00146382, 0.00092812, 0.        ,
       0.00466503, 0.00573528, 0.00365133, 0.        , 0.        ,
       0.00266889, 0.02116458, 0.01916959, 0.01475491, 0.01517185,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0108819 , 0.01772987, 0.00828523, 0.01435587,
       0.02981721, 0.01415361, 0.01251175, 0.00688182, 0.01004138,
       0.00197868, 0.00599333, 0.00768896, 0.0060944 , 0.00191

In [27]:
et_feature

array([0.03293305, 0.0107295 , 0.00952691, 0.01585965, 0.01095097,
       0.01643299, 0.00938598, 0.01370834, 0.00916345, 0.01765375,
       0.00784536, 0.01741912, 0.01279351, 0.040648  , 0.        ,
       0.04126125, 0.02594856, 0.03452893, 0.03846678, 0.02207903,
       0.01994398, 0.01740037, 0.0222661 , 0.01897143, 0.01410815,
       0.01164299, 0.01148841, 0.01196362, 0.01123502, 0.00157063,
       0.        , 0.        , 0.        , 0.0143965 , 0.01485957,
       0.05797385, 0.04277541, 0.01210801, 0.01744637, 0.01773521,
       0.01511438, 0.01538623, 0.00195696, 0.00186836, 0.        ,
       0.01024616, 0.01165057, 0.00438816, 0.        , 0.        ,
       0.0102031 , 0.0181855 , 0.01495315, 0.01745683, 0.01380961,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0121931 , 0.01451734, 0.01050474, 0.01088544,
       0.0273284 , 0.01262056, 0.01228421, 0.01345375, 0.00610833,
       0.00169044, 0.00751617, 0.00607494, 0.00722995, 0.00170

In [28]:
importance

[array([0.02767636, 0.01928462, 0.00764435, 0.00963662, 0.01766069,
        0.05700822, 0.00859989, 0.00808117, 0.03293729, 0.02113873,
        0.        , 0.        , 0.04830086, 0.06221928, 0.        ,
        0.03094558, 0.01866495, 0.02353871, 0.03959045, 0.01717744,
        0.00606336, 0.00767343, 0.00922022, 0.01665376, 0.00573326,
        0.01155561, 0.02145195, 0.00521206, 0.01346996, 0.00286663,
        0.        , 0.        , 0.        , 0.00286663, 0.00687991,
        0.09030984, 0.04120507, 0.00778552, 0.00831323, 0.0048489 ,
        0.00955543, 0.0197479 , 0.00429995, 0.        , 0.        ,
        0.        , 0.02291156, 0.00286663, 0.        , 0.        ,
        0.        , 0.02007076, 0.00382217, 0.        , 0.0282316 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.06142262, 0.        , 0.0009923 ,
        0.06245266, 0.00402693, 0.00999981, 0.        , 0.00846339,
        0.        , 0.01516194, 0.00382217, 0.  

In [29]:
# Column names of the feature matrix, in the same order as the importance arrays.
feature = df.drop(['Label'], axis=1).columns.values

# Rank features by the raw importances of the model at index Max. Rounding is
# applied only after sorting: rounding first made near-equal importances compare
# as ties, which were then ordered by feature name instead of by importance.
ranked = sorted(zip(importance[Max], feature), key=lambda pair: pair[0], reverse=True)
f_list = [(round(score, 4), name) for score, name in ranked]

Sum = 0  # not read by any cell shown here; kept so existing references keep working
# Names of the (up to) six highest-ranked features. Slicing avoids the IndexError
# that indexing positions 0..5 raises when fewer than six features exist.
fs = [name for _, name in f_list[:6]]

In [30]:
importance[Max]

array([0.03293305, 0.0107295 , 0.00952691, 0.01585965, 0.01095097,
       0.01643299, 0.00938598, 0.01370834, 0.00916345, 0.01765375,
       0.00784536, 0.01741912, 0.01279351, 0.040648  , 0.        ,
       0.04126125, 0.02594856, 0.03452893, 0.03846678, 0.02207903,
       0.01994398, 0.01740037, 0.0222661 , 0.01897143, 0.01410815,
       0.01164299, 0.01148841, 0.01196362, 0.01123502, 0.00157063,
       0.        , 0.        , 0.        , 0.0143965 , 0.01485957,
       0.05797385, 0.04277541, 0.01210801, 0.01744637, 0.01773521,
       0.01511438, 0.01538623, 0.00195696, 0.00186836, 0.        ,
       0.01024616, 0.01165057, 0.00438816, 0.        , 0.        ,
       0.0102031 , 0.0181855 , 0.01495315, 0.01745683, 0.01380961,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0121931 , 0.01451734, 0.01050474, 0.01088544,
       0.0273284 , 0.01262056, 0.01228421, 0.01345375, 0.00610833,
       0.00169044, 0.00751617, 0.00607494, 0.00722995, 0.00170

In [31]:
fs

['Fwd Packets/s',
 'Bwd Packets/s',
 'Flow IAT Mean',
 'Flow Bytes/s',
 'Flow IAT Min',
 'Flow IAT Max']

In [32]:
# Read the dataset
df1 = pd.read_csv('./Downloads/df_noise_50%.csv', encoding='utf-8')
# The results in this notebook are based on the original CICIDS2017 dataset. Please go to cell [10] if you work on the sampled dataset.

In [33]:
# Keep every row of the rare attack classes and draw a random fraction of each
# majority class. No seed is passed to sample(), so the drawn rows differ per run.
minority_mask1 = df1['Label'].isin(['WebAttack', 'Bot', 'Infiltration'])
df1_minor = df1[minority_mask1]
df1_BENIGN = df1[df1['Label'] == 'BENIGN'].sample(frac=0.01)
df1_DoS = df1[df1['Label'] == 'DoS'].sample(frac=0.05)
df1_PortScan = df1[df1['Label'] == 'PortScan'].sample(frac=0.05)
df1_BruteForce = df1[df1['Label'] == 'BruteForce'].sample(frac=0.2)

In [34]:
# Stack the sampled majority classes and the untouched minority classes row-wise.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same frame (original index labels are kept) in a single pass.
df1_s = pd.concat([df1_BENIGN, df1_DoS, df1_PortScan, df1_BruteForce, df1_minor])

In [35]:
# Order the sampled rows by their index labels.
df1_s = df1_s.sort_index()

In [36]:
# Save the sampled dataset (index=0 is falsy, so the index column is not written)
df1_s.to_csv('./Downloads/CICIDS2017_sample2.csv',index=0)

In [37]:
# Min-max normalization: rescale every non-object column of df1 to the [0, 1] range.
non_object1 = (df1.dtypes != 'object').to_numpy()
numeric_features = df1.columns[non_object1]
scaled1 = df1[numeric_features].apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)
df1[numeric_features] = scaled1
# Constant columns divide 0 by 0 and come out as NaN; those, and any other
# missing values in the frame, are replaced by 0.
df1 = df1.fillna(0)

In [38]:
# Encode the class names in 'Label' as integers and build the feature/target arrays.
# The column is addressed by name instead of iloc[:, -1]: the positional form
# silently encodes the wrong column if 'Label' is not last, and on recent pandas
# versions an iloc assignment writes into the existing object column, leaving y
# with object dtype, which scikit-learn classifiers reject as an unknown label type.
labelencoder = LabelEncoder()
df1['Label'] = labelencoder.fit_transform(df1['Label'])
X = df1.drop(columns=['Label']).values
y = df1['Label'].values  # already 1-D, no reshape/ravel round trip needed


In [39]:
# Restrict df1 to the columns listed in fs and take the values as a NumPy array.
X_fs = df1[fs].values

In [40]:
X_fs

array([[1.07695297e-04, 2.17731797e-04, 1.11817325e-05, 0.00000000e+00,
        1.68067224e-08, 1.94941176e-04],
       [6.44946007e-05, 1.17772749e-04, 1.99786555e-05, 0.00000000e+00,
        4.20168060e-08, 1.96932773e-04],
       [1.13627141e-04, 2.05224530e-04, 1.12891699e-05, 0.00000000e+00,
        3.36134448e-08, 2.03201681e-04],
       ...,
       [3.12154062e-07, 7.80385154e-07, 3.51137706e-03, 0.00000000e+00,
        4.20168060e-08, 2.79599580e-02],
       [2.06050678e-05, 2.31807013e-05, 9.06288515e-05, 0.00000000e+00,
        1.93277308e-07, 5.27050420e-04],
       [7.70950582e-06, 1.92737646e-05, 1.55714286e-04, 0.00000000e+00,
        4.20168060e-08, 1.08077311e-03]])

In [41]:
# Repeated hold-out evaluation on the reduced feature matrix X_fs.
# The model type is the one at index Max (0=DT, 1=RF, 2=ET, 3=XGB, matching the
# branch order of the original if-chain). Each run draws a fresh, unseeded
# stratified 80/20 split, fits a new model and adds its accuracy to SUM.
N_RUNS = 500  # number of independent splits; also the divisor for MEAN

# One (print label, factory) pair per model replaces four copy-pasted branches.
# An out-of-range Max now raises IndexError instead of silently reporting MEAN = 0.
candidates = [
    ('DT', lambda: DecisionTreeClassifier(random_state=None)),
    ('RF', lambda: RandomForestClassifier(random_state=None)),
    ('ET', lambda: ExtraTreesClassifier(random_state=None)),
    ('XGB', lambda: xgb.XGBClassifier(n_estimators=10, random_state=None)),
]
model_name, make_model = candidates[Max]

SUM = 0
MEAN = 0
#Machine learning model training after feature selection
for i in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(
        X_fs, y, train_size=0.8, test_size=0.2, random_state=None, stratify=y)
    model = make_model()
    model.fit(X_train, y_train)
    y_true = y_test
    # Predict once and derive the accuracy from it; calling score() and then
    # predict() would run inference over X_test twice for the same number.
    y_predict = model.predict(X_test)
    score = accuracy_score(y_true, y_predict)
    print('Accuracy of ' + model_name + ': ' + str(score))
    SUM = SUM + score

print('Sum: '+ str(SUM))
MEAN = SUM / N_RUNS
print('Accuracy: '+ str(MEAN))

Accuracy of ET: 0.3511699779249448
Accuracy of ET: 0.3499337748344371
Accuracy of ET: 0.3528476821192053
Accuracy of ET: 0.3511699779249448
Accuracy of ET: 0.3429580573951435
Accuracy of ET: 0.35152317880794703
Accuracy of ET: 0.348962472406181
Accuracy of ET: 0.3512582781456954
Accuracy of ET: 0.3554083885209713
Accuracy of ET: 0.343841059602649
Accuracy of ET: 0.3472847682119205
Accuracy of ET: 0.3545253863134658
Accuracy of ET: 0.3493156732891832
Accuracy of ET: 0.357439293598234
Accuracy of ET: 0.3569094922737307
Accuracy of ET: 0.3462251655629139
Accuracy of ET: 0.35532008830022077
Accuracy of ET: 0.34975717439293597
Accuracy of ET: 0.34507726269315675
Accuracy of ET: 0.34816777041942604
Accuracy of ET: 0.35275938189845474
Accuracy of ET: 0.35143487858719646
Accuracy of ET: 0.3529359823399558
Accuracy of ET: 0.3494039735099338
Accuracy of ET: 0.3425165562913907
Accuracy of ET: 0.34684326710816776
Accuracy of ET: 0.35328918322295805
Accuracy of ET: 0.3559381898454746
Accuracy of ET

Accuracy of ET: 0.3494922737306843
Accuracy of ET: 0.34463576158940395
Accuracy of ET: 0.3509933774834437
Accuracy of ET: 0.35275938189845474
Accuracy of ET: 0.35479028697571746
Accuracy of ET: 0.3473730684326711
Accuracy of ET: 0.3465783664459161
Accuracy of ET: 0.3605298013245033
Accuracy of ET: 0.34631346578366445
Accuracy of ET: 0.3458719646799117
Accuracy of ET: 0.34852097130242826
Accuracy of ET: 0.3576158940397351
Accuracy of ET: 0.3504635761589404
Accuracy of ET: 0.3517880794701987
Accuracy of ET: 0.34966887417218545
Accuracy of ET: 0.3477262693156733
Accuracy of ET: 0.3471081677704194
Accuracy of ET: 0.3508167770419426
Accuracy of ET: 0.3569977924944812
Accuracy of ET: 0.3484326710816777
Accuracy of ET: 0.34410596026490065
Accuracy of ET: 0.3479028697571744
Accuracy of ET: 0.34922737306843266
Accuracy of ET: 0.3467549668874172
Accuracy of ET: 0.3586754966887417
Accuracy of ET: 0.35108167770419424
Accuracy of ET: 0.3508167770419426
Accuracy of ET: 0.35514348785871963
Accuracy o

Accuracy of ET: 0.3512582781456954
Accuracy of ET: 0.3555849889624724
Accuracy of ET: 0.35426048565121415
Accuracy of ET: 0.35143487858719646
Accuracy of ET: 0.3502869757174393
Accuracy of ET: 0.3479028697571744
Accuracy of ET: 0.34913907284768214
Accuracy of ET: 0.3458719646799117
Accuracy of ET: 0.35328918322295805
Accuracy of ET: 0.35143487858719646
Accuracy of ET: 0.3513465783664459
Accuracy of ET: 0.3569977924944812
Accuracy of ET: 0.3500220750551876
Accuracy of ET: 0.3504635761589404
Accuracy of ET: 0.3425165562913907
Accuracy of ET: 0.35602649006622517
Accuracy of ET: 0.3517880794701987
Accuracy of ET: 0.3504635761589404
Accuracy of ET: 0.35037527593818985
Accuracy of ET: 0.34966887417218545
Accuracy of ET: 0.34975717439293597
Accuracy of ET: 0.35381898454746136
Accuracy of ET: 0.35222958057395143
Accuracy of ET: 0.3499337748344371
Accuracy of ET: 0.3529359823399558
Accuracy of ET: 0.3530242825607064
Accuracy of ET: 0.3436644591611479
Accuracy of ET: 0.3524061810154525
Accuracy 

In [186]:
fs

['Fwd Packets/s',
 'Flow Bytes/s',
 'Bwd Packets/s',
 'Flow Packets/s',
 'Init_Win_bytes_forward',
 'Flow IAT Mean']

In [103]:
# XGBoost classifier with n_estimators=10, fitted and scored on whatever
# X_train / X_test currently hold in the kernel.
xg = xgb.XGBClassifier(n_estimators=10, random_state=None)
xg.fit(X_train, y_train)
xg_score = xg.score(X_test, y_test)
print(f'Accuracy of XGBoost: {xg_score!s}')

Accuracy of XGBoost: 0.7501103752759382


In [340]:
f_list

[(0.5589, 'Min Packet Length'),
 (0.0825, 'Bwd Packet Length Std'),
 (0.0503, 'Bwd Packets/s'),
 (0.0454, 'Bwd Packet Length Min'),
 (0.0336, 'Average Packet Size'),
 (0.0226, 'PSH Flag Count'),
 (0.0211, 'Fwd IAT Std'),
 (0.0155, 'Fwd Header Length'),
 (0.0145, 'Bwd IAT Std'),
 (0.0141, 'Bwd Header Length'),
 (0.0125, 'Init_Win_bytes_backward'),
 (0.0119, 'Fwd PSH Flags'),
 (0.0106, 'Fwd Packet Length Max'),
 (0.0095, 'Flow IAT Max'),
 (0.0091, 'Packet Length Std'),
 (0.007, 'FIN Flag Count'),
 (0.0059, 'Packet Length Mean'),
 (0.0058, 'Total Length of Fwd Packets'),
 (0.0055, 'Fwd IAT Max'),
 (0.0054, 'Init_Win_bytes_forward'),
 (0.0053, 'URG Flag Count'),
 (0.0052, 'Bwd IAT Min'),
 (0.005, 'Total Fwd Packets'),
 (0.0043, 'act_data_pkt_fwd'),
 (0.0042, 'Total Backward Packets'),
 (0.0035, 'min_seg_size_forward'),
 (0.0031, 'Fwd IAT Min'),
 (0.0026, 'Bwd Packet Length Mean'),
 (0.0025, 'Fwd IAT Mean'),
 (0.0024, 'Bwd IAT Total'),
 (0.0022, 'Max Packet Length'),
 (0.0021, 'Fwd Packet L

In [48]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance
    
# Load the first dataset.
df = pd.read_csv('./Downloads/df_all_data_preprocessed_v9_0.1.csv', encoding='utf-8')

# Keep every row of the rare attack classes and draw a random fraction of each
# majority class. No seed is passed to sample(), so the drawn rows differ per run.
df_minor = df[df['Label'].isin(['WebAttack', 'Bot', 'Infiltration'])]
df_BENIGN = df[df['Label'] == 'BENIGN'].sample(frac=0.01)
df_DoS = df[df['Label'] == 'DoS'].sample(frac=0.05)
df_PortScan = df[df['Label'] == 'PortScan'].sample(frac=0.05)
df_BruteForce = df[df['Label'] == 'BruteForce'].sample(frac=0.2)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# builds the same row-wise union and keeps the original index labels.
df_s = pd.concat([df_BENIGN, df_DoS, df_PortScan, df_BruteForce, df_minor])

# Order the sample by index label and write it out without the index column.
df_s = df_s.sort_index()
df_s.to_csv('./Downloads/CICIDS2017_sample1.csv', index=False)
    
# Min-max normalization: rescale every non-object column of df to the [0, 1] range.
numeric_features = df.dtypes[df.dtypes != 'object'].index
df[numeric_features] = df[numeric_features].apply(
    lambda x: (x - x.min()) / (x.max()-x.min()))
# Constant columns divide 0 by 0 and come out as NaN; those, and any other
# missing values in the frame, are replaced by 0.
df = df.fillna(0)

# Encode the class names in 'Label' as integers. The column is addressed by name
# instead of iloc[:, -1]: the positional form silently encodes the wrong column if
# 'Label' is not last, and on recent pandas versions an iloc assignment writes into
# the existing object column, leaving y with object dtype, which scikit-learn
# classifiers reject as an unknown label type.
labelencoder = LabelEncoder()
df['Label'] = labelencoder.fit_transform(df['Label'])
X = df.drop(columns=['Label']).values
y = df['Label'].values  # already 1-D, no reshape/ravel round trip needed
# Stratified 80/20 hold-out split; unseeded, so the split changes between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=None, stratify=y)
    
# Accumulators: one score and one importance array per model, in DT, RF, ET,
# XGBoost order.
train_accuracy, importance = [], []

# Fit each model on the full feature set and score it on X_test / y_test.
dt = DecisionTreeClassifier(random_state=None)
dt.fit(X_train, y_train)
dt_score = dt.score(X_test, y_test)
train_accuracy += [dt_score]

rf = RandomForestClassifier(random_state=None)
rf.fit(X_train, y_train)
rf_score = rf.score(X_test, y_test)
train_accuracy += [rf_score]

et = ExtraTreesClassifier(random_state=None)
et.fit(X_train, y_train)
et_score = et.score(X_test, y_test)
train_accuracy += [et_score]

xg = xgb.XGBClassifier(n_estimators=10, random_state=None)
xg.fit(X_train, y_train)
xg_score = xg.score(X_test, y_test)
train_accuracy += [xg_score]

# Position of the highest score; ties resolve to the earliest entry.
Max = max(range(len(train_accuracy)), key=train_accuracy.__getitem__)

# Importances of the fitted models, stored in the same order as the scores.
dt_feature, rf_feature, et_feature, xgb_feature = (
    fitted.feature_importances_ for fitted in (dt, rf, et, xg)
)
importance.extend([dt_feature, rf_feature, et_feature, xgb_feature])
    
# Column names of the feature matrix, in the same order as the importance arrays.
feature = df.drop(['Label'], axis=1).columns.values

# Rank features by the raw importances of the model at index Max. Rounding is
# applied only after sorting: rounding first made near-equal importances compare
# as ties, which were then ordered by feature name instead of by importance.
ranked = sorted(zip(importance[Max], feature), key=lambda pair: pair[0], reverse=True)
f_list = [(round(score, 4), name) for score, name in ranked]

Sum = 0  # not read by any cell shown here; kept so existing references keep working
# Names of the (up to) six highest-ranked features. Slicing avoids the IndexError
# that indexing positions 0..5 raises when fewer than six features exist.
fs = [name for _, name in f_list[:6]]
        
# Load the second dataset.
df1 = pd.read_csv('./Downloads/CICIDS2017_sample.csv', encoding='utf-8')

# Keep every row of the rare attack classes and draw a random fraction of each
# majority class. No seed is passed to sample(), so the drawn rows differ per run.
df1_minor = df1[df1['Label'].isin(['WebAttack', 'Bot', 'Infiltration'])]
df1_BENIGN = df1[df1['Label'] == 'BENIGN'].sample(frac=0.01)
df1_DoS = df1[df1['Label'] == 'DoS'].sample(frac=0.05)
df1_PortScan = df1[df1['Label'] == 'PortScan'].sample(frac=0.05)
df1_BruteForce = df1[df1['Label'] == 'BruteForce'].sample(frac=0.2)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# builds the same row-wise union and keeps the original index labels.
df1_s = pd.concat([df1_BENIGN, df1_DoS, df1_PortScan, df1_BruteForce, df1_minor])

# Order the sample by index label and write it out without the index column.
df1_s = df1_s.sort_index()
df1_s.to_csv('./Downloads/CICIDS2017_sample2.csv', index=False)
    
# Min-max normalization: rescale every non-object column of df1 to the [0, 1] range.
numeric_features = df1.dtypes[df1.dtypes != 'object'].index
df1[numeric_features] = df1[numeric_features].apply(
    lambda x: (x - x.min()) / (x.max()-x.min()))
# Constant columns divide 0 by 0 and come out as NaN; those, and any other
# missing values in the frame, are replaced by 0.
df1 = df1.fillna(0)

# Encode the class names in 'Label' as integers. The column is addressed by name
# instead of iloc[:, -1]: the positional form silently encodes the wrong column if
# 'Label' is not last, and on recent pandas versions an iloc assignment writes into
# the existing object column, leaving y with object dtype, which scikit-learn
# classifiers reject as an unknown label type.
labelencoder = LabelEncoder()
df1['Label'] = labelencoder.fit_transform(df1['Label'])
X = df1.drop(columns=['Label']).values
y = df1['Label'].values  # already 1-D, no reshape/ravel round trip needed

# Restrict df1 to the columns listed in fs and take the values as a NumPy array.
X_fs = df1[fs].values
    
# Repeated hold-out evaluation on the reduced feature matrix X_fs.
# The model type is the one at index Max (0=DT, 1=RF, 2=ET, 3=XGB, matching the
# branch order of the original if-chain). Each run draws a fresh, unseeded
# stratified 80/20 split, fits a new model and adds its accuracy to SUM.
N_RUNS = 500  # number of independent splits; also the divisor for MEAN

# One (print label, factory) pair per model replaces four copy-pasted branches.
# An out-of-range Max now raises IndexError instead of silently reporting MEAN = 0.
candidates = [
    ('DT', lambda: DecisionTreeClassifier(random_state=None)),
    ('RF', lambda: RandomForestClassifier(random_state=None)),
    ('ET', lambda: ExtraTreesClassifier(random_state=None)),
    ('XGB', lambda: xgb.XGBClassifier(n_estimators=10, random_state=None)),
]
model_name, make_model = candidates[Max]

SUM = 0
MEAN = 0
#Machine learning model training after feature selection
for i in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(
        X_fs, y, train_size=0.8, test_size=0.2, random_state=None, stratify=y)
    model = make_model()
    model.fit(X_train, y_train)
    y_true = y_test
    # Predict once and derive the accuracy from it; calling score() and then
    # predict() would run inference over X_test twice for the same number.
    y_predict = model.predict(X_test)
    score = accuracy_score(y_true, y_predict)
    print('Accuracy of ' + model_name + ': ' + str(score))
    SUM = SUM + score

MEAN = SUM / N_RUNS
print('Accuracy: '+ str(MEAN))

Accuracy of RF: 0.8841501103752759
Accuracy of RF: 0.8898896247240619
Accuracy of RF: 0.8858278145695364
Accuracy of RF: 0.8847682119205298
Accuracy of RF: 0.8858278145695364
Accuracy of RF: 0.888476821192053
Accuracy of RF: 0.8844150110375276
Accuracy of RF: 0.887682119205298
Accuracy of RF: 0.887682119205298
Accuracy of RF: 0.8890066225165563
Accuracy of RF: 0.8841501103752759
Accuracy of RF: 0.8890066225165563
Accuracy of RF: 0.8865342163355409
Accuracy of RF: 0.885916114790287
Accuracy of RF: 0.8860927152317881
Accuracy of RF: 0.885916114790287
Accuracy of RF: 0.8878587196467991
Accuracy of RF: 0.8861810154525386
Accuracy of RF: 0.8847682119205298
Accuracy of RF: 0.8862693156732891
Accuracy of RF: 0.8837969094922737
Accuracy of RF: 0.883532008830022
Accuracy of RF: 0.8855629139072848
Accuracy of RF: 0.8879470198675496
Accuracy of RF: 0.8889183222958057
Accuracy of RF: 0.8854746136865342
Accuracy of RF: 0.8852097130242825
Accuracy of RF: 0.8841501103752759
Accuracy of RF: 0.88618101

Accuracy of RF: 0.8866225165562914
Accuracy of RF: 0.8897130242825607
Accuracy of RF: 0.8836203090507726
Accuracy of RF: 0.8873289183222958
Accuracy of RF: 0.8852980132450331
Accuracy of RF: 0.8846799116997792
Accuracy of RF: 0.8856512141280353
Accuracy of RF: 0.890242825607064
Accuracy of RF: 0.8852980132450331
Accuracy of RF: 0.887682119205298
Accuracy of RF: 0.8860927152317881
Accuracy of RF: 0.8864459161147903
Accuracy of RF: 0.8856512141280353
Accuracy of RF: 0.8837086092715232
Accuracy of RF: 0.8867108167770419
Accuracy of RF: 0.8842384105960265
Accuracy of RF: 0.8889183222958057
Accuracy of RF: 0.8895364238410596
Accuracy of RF: 0.889271523178808
Accuracy of RF: 0.8803532008830022
Accuracy of RF: 0.8890066225165563
Accuracy of RF: 0.8863576158940397
Accuracy of RF: 0.8830905077262693
Accuracy of RF: 0.888476821192053
Accuracy of RF: 0.8943929359823399
Accuracy of RF: 0.885916114790287
Accuracy of RF: 0.8890066225165563
Accuracy of RF: 0.8858278145695364
Accuracy of RF: 0.8877704

Accuracy of RF: 0.8836203090507726
Accuracy of RF: 0.8853863134657837
Accuracy of RF: 0.8877704194260486
Accuracy of RF: 0.8857395143487858
Accuracy of RF: 0.8854746136865342
Accuracy of RF: 0.8886534216335541
Accuracy of RF: 0.8853863134657837
Accuracy of RF: 0.890242825607064
Accuracy of RF: 0.8866225165562914
Accuracy of RF: 0.884326710816777
Accuracy of RF: 0.8845916114790286
Accuracy of RF: 0.8866225165562914
Accuracy of RF: 0.8864459161147903
Accuracy of RF: 0.8877704194260486
Accuracy of RF: 0.884326710816777
Accuracy of RF: 0.8875938189845475
Accuracy of RF: 0.8875938189845475
Accuracy of RF: 0.8904194260485652
Accuracy of RF: 0.8838852097130243
Accuracy of RF: 0.8875055187637969
Accuracy of RF: 0.8879470198675496
Accuracy of RF: 0.886887417218543
Accuracy of RF: 0.89280353200883
Accuracy of RF: 0.8860927152317881
Accuracy of RF: 0.8867991169977925
Accuracy of RF: 0.8837969094922737
Accuracy of RF: 0.8893598233995585
Accuracy of RF: 0.8837969094922737
Accuracy of RF: 0.88459161

In [2]:
MEAN

0.9947950551876383