In [1]:
# Importing required libraries for the project
import sys # for python library version
import numpy as np # for scientific computing
import pandas as pd # for data anaysis
import matplotlib # for visualization
import seaborn as sns # for visualization
import sklearn # ML Library

In [2]:
# importing the dataset to a variable
data = pd.read_csv("K:/CIC-2017-dataset/CIC-IDS-2017/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv")

# displaying first 3 observations
data.head(3)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.0,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN



# Getting Started

First thing first, we need to import/read the dataset and have a peak at it....

We can see that the data has been imported successfully. Now we need to know the number of observations and features we have.

In [3]:
# we can delete the redundant feature columns whose values are always zero
data = data.drop([" Bwd PSH Flags",  # 0
           " Fwd URG Flags",  # 0
           " Bwd URG Flags",
           " CWE Flag Count",
           "Fwd Avg Bytes/Bulk",  # 0
           " Fwd Avg Packets/Bulk", # 0
           " Fwd Avg Bulk Rate",  # 0
           " Bwd Avg Bytes/Bulk",  # 0
           " Bwd Avg Packets/Bulk", # 0
           "Bwd Avg Bulk Rate"], axis=1)

In [4]:
# we can also delete NAN values
data = data.drop(['Flow Bytes/s',' Flow Packets/s'], axis=1)

In [5]:
# dimensions of the data
# where x will be no. of observation
# and y will be features including 1 target variable
x, y = data.shape   # x=692703     y=67

print('We have ', x, ' number of observations and ', y-1, ' features for this dataset to predict type of traffic.')  # removing count of a target variable in 'y'

We have  692703  number of observations and  66  features for this dataset to predict type of traffic.


In [6]:
# grouping by forest cover type and calculating total occurance
data.groupby(' Label').size()

 Label
BENIGN              440031
DoS GoldenEye        10293
DoS Hulk            231073
DoS Slowhttptest      5499
DoS slowloris         5796
Heartbleed              11
dtype: int64

In [5]:

data[' Label'] = data[' Label'].map({'BENIGN': 0, 
                                     'DoS Hulk': 1,
                                     'DoS GoldenEye':2,
                                     'DoS slowloris':3,
                                     'DoS Slowhttptest':4,
                                     'Heartbleed':5})

In [27]:
# Attack Class Distribution
data[' Label'].value_counts()

0    440031
1    231073
2     10293
3      5796
4      5499
5        11
Name:  Label, dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692703 entries, 0 to 692702
Data columns (total 67 columns):
 Destination Port               692703 non-null int64
 Flow Duration                  692703 non-null int64
 Total Fwd Packets              692703 non-null int64
 Total Backward Packets         692703 non-null int64
Total Length of Fwd Packets     692703 non-null int64
 Total Length of Bwd Packets    692703 non-null int64
 Fwd Packet Length Max          692703 non-null int64
 Fwd Packet Length Min          692703 non-null int64
 Fwd Packet Length Mean         692703 non-null float64
 Fwd Packet Length Std          692703 non-null float64
Bwd Packet Length Max           692703 non-null int64
 Bwd Packet Length Min          692703 non-null int64
 Bwd Packet Length Mean         692703 non-null float64
 Bwd Packet Length Std          692703 non-null float64
 Flow IAT Mean                  692703 non-null float64
 Flow IAT Std                   692703 non-null float64
 Flow IAT Max

In [28]:
data.isnull().values.any()

False

In [10]:
# will delete observation if it has any missing values in any of the features.
data.dropna()

# shape of the data after deleting missing entries
data.shape

(692703, 67)

In [11]:
# deleting duplicates, except the first observation
data.drop_duplicates(keep='first')

# shape of the data after deleting duplicate entries
data.shape

(692703, 67)

No missing values and no duplicates

In [6]:
X = data.drop([' Label'], axis=1)    # 692703 × 66
# feeding our target variable to var 'y'
y = data[' Label']

In [None]:
X.shape

# Feature Engineering

# 1. Extra-Tree Classifier

In [None]:
# importing model for feature importance
from sklearn.ensemble import ExtraTreesClassifier

# passing the model
model = ExtraTreesClassifier(random_state = 53)


# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ETC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ETC']).sort_values('ETC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
ETC_feature_importances.head(25)

# 2. Randon-Forest Classifier

In [None]:
# importing model for feature importance
from sklearn.ensemble import RandomForestClassifier

# passing the model
model = RandomForestClassifier(random_state = 53)

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
RFC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['RFC']).sort_values('RFC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
RFC_feature_importances.head(25)

# 3. Gradient Boosting Classifier

In [None]:
# importing model for feature importance
from sklearn.ensemble import GradientBoostingClassifier

# passing the model
model = GradientBoostingClassifier(random_state = 53)

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
GBC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['GBC']).sort_values('GBC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
GBC_feature_importances.head(25)

# 4. XGBoost feature selection

In [None]:
# plot feature importance manually
from numpy import loadtxt
from xgboost import XGBClassifier
from matplotlib import pyplot

In [None]:
# fit model no training data
model = XGBClassifier()
model.fit(X, y)
# feature importance
print(model.feature_importances_)
# plot
pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
pyplot.show()

In [None]:
# plot feature importance using built-in function
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot

# fit model no training data
model = XGBClassifier()
model.fit(X, y)
# plot feature importance
plot_importance(model)
pyplot.show()

# 5. Adaboost Classifier

In [None]:
# importing model for feature importance
from sklearn.ensemble import AdaBoostClassifier

# passing the model
model = AdaBoostClassifier(random_state = 53)

model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ADB_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ADB']).sort_values('ADB', ascending=False)

# removing traces of this model
model = None

ADB_feature_importances.head(25)


In [7]:
## feeding top 18 features in a variable as dataframe including target variable

sample = data[[' Destination Port', 'Init_Win_bytes_forward', ' Average Packet Size', ' Fwd IAT Std', 
               ' Bwd Packet Length Std', ' Packet Length Std', ' Fwd Packet Length Mean',  
               ' Packet Length Mean', ' Bwd Packets/s', 'Bwd Packet Length Max', 'Idle Mean', ' Avg Bwd Segment Size', 
               'FIN Flag Count', ' Flow IAT Max', ' Flow Duration', ' Total Length of Bwd Packets', ' Min Packet Length', ' Label']]

In [30]:
sample.shape     # (692703, 18)

(692703, 18)

In [8]:
# importing feature scaling function
from sklearn.preprocessing import MinMaxScaler

# passing range to the function and then save it
scaler = MinMaxScaler(feature_range = (0,1))

# feeding sample features to var 'X'
X = sample.iloc[:,:-1]

# feeding our target variable to var 'y'
y = sample[' Label']

# apply feature scaling to all features
X_scaled = scaler.fit_transform(X)
#s_sample_2 = scaler.fit_transform(X2)

  return self.partial_fit(X, y)


In [32]:
X_scaled

array([[1.22161650e-03, 3.90625000e-03, 3.44563553e-03, ...,
        3.19241669e-04, 9.56937799e-09, 4.14364641e-03],
       [5.94011025e-03, 4.45571899e-01, 1.19161562e-02, ...,
        4.00000003e-06, 5.19936204e-07, 0.00000000e+00],
       [1.34377815e-03, 4.45571899e-01, 1.50746554e-01, ...,
        9.13333341e-06, 5.02392344e-06, 0.00000000e+00],
       ...,
       [8.86130072e-01, 1.53656006e-02, 8.67789689e-03, ...,
        6.91666672e-07, 9.56937799e-09, 0.00000000e+00],
       [8.09320934e-04, 0.00000000e+00, 2.29709035e-02, ...,
        8.73863341e-03, 4.08293461e-07, 2.20994475e-02],
       [8.09320934e-04, 0.00000000e+00, 2.94155181e-02, ...,
        7.91166673e-04, 3.60446571e-07, 3.24585635e-02]])

# Train-Test Split

In [9]:
# importing train-test function
from sklearn.model_selection import train_test_split

# split the data in 75%-25% train-test respectively with fixed state
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state = 53)

In [10]:
# number of training observation
print(X_train.shape, X_test.shape)

(519527, 17) (173176, 17)


In [11]:
y_test.shape

(173176,)

# Classifier/ generating classification model

# 1.Xgboost classifier

Train-Test Split

In [39]:
# use feature importance for feature selection
from numpy import loadtxt
from numpy import sort
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

# fit model on all training data
model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data and evaluate
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Fit model using each importance as a threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
	# select features using threshold
	selection = SelectFromModel(model, threshold=thresh, prefit=True)
	select_X_train = selection.transform(X_train)
	# train model
	selection_model = XGBClassifier()
	selection_model.fit(select_X_train, y_train)
	# eval model
	select_X_test = selection.transform(X_test)
	y_pred = selection_model.predict(select_X_test)
	predictions = [round(value) for value in y_pred]
	accuracy = accuracy_score(y_test, predictions)
	print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))

Accuracy: 99.88%
Thresh=0.019, n=17, Accuracy: 99.88%
Thresh=0.019, n=16, Accuracy: 99.89%
Thresh=0.023, n=15, Accuracy: 99.87%
Thresh=0.028, n=14, Accuracy: 99.86%
Thresh=0.030, n=13, Accuracy: 99.88%
Thresh=0.045, n=12, Accuracy: 99.86%
Thresh=0.048, n=11, Accuracy: 99.86%
Thresh=0.049, n=10, Accuracy: 99.85%
Thresh=0.052, n=9, Accuracy: 99.85%
Thresh=0.055, n=8, Accuracy: 99.85%
Thresh=0.056, n=7, Accuracy: 99.84%
Thresh=0.061, n=6, Accuracy: 99.80%
Thresh=0.066, n=5, Accuracy: 99.39%
Thresh=0.079, n=4, Accuracy: 99.19%
Thresh=0.088, n=3, Accuracy: 98.59%
Thresh=0.103, n=2, Accuracy: 95.28%
Thresh=0.176, n=1, Accuracy: 93.67%


In [None]:
# select features using threshold
selection = SelectFromModel(model, threshold=thresh, prefit=True)
select_X_train = selection.transform(X_train)
# train model
selection_model = XGBClassifier()
selection_model.fit(select_X_train, y_train)
# eval model
select_X_test = selection.transform(X_test)
y_pred = selection_model.predict(select_X_test)

# Fitting other Models

In [12]:
from sklearn.svm import SVC 
from sklearn.naive_bayes import BernoulliNB 
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Train KNeighborsClassifier Model
KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
KNN_Classifier.fit(X_train, y_train); 

# Train LogisticRegression Model
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier.fit(X_train, y_train);

# Train Gaussian Naive Baye Model
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(X_train, y_train)
            
# Train Decision Tree Model
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC_Classifier.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

# Evaluate previous models

In [14]:
from sklearn import metrics

models = []
models.append(('Naive Baye Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('KNeighborsClassifier', KNN_Classifier))
models.append(('LogisticRegression', LGR_Classifier))

for i, v in models:
    scores = cross_val_score(v, X_train, y_train, cv=10)
    accuracy = metrics.accuracy_score(y_train, v.predict(X_train))
    confusion_matrix = metrics.confusion_matrix(y_train, v.predict(X_train))
    classification = metrics.classification_report(y_train, v.predict(X_train))
    print()
    print('============================== {} Model Evaluation =============================='.format(i))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()

  'precision', 'predicted', average, warn_for)




Cross Validation Mean Score:
 0.6684272265376705

Model Accuracy:
 0.671204768953298

Confusion matrix:
 [[222926  76395      0      0  30947      0]
 [   130 123666      0      0  49347      0]
 [   262   5502      0      0   1931      0]
 [   758   2315      0      0   1223      0]
 [  1071    928      0      0   2117      0]
 [     0      9      0      0      0      0]]

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.67      0.80    330268
           1       0.59      0.71      0.65    173143
           2       0.00      0.00      0.00      7695
           3       0.00      0.00      0.00      4296
           4       0.02      0.51      0.05      4116
           5       0.00      0.00      0.00         9

   micro avg       0.67      0.67      0.67    519527
   macro avg       0.27      0.32      0.25    519527
weighted avg       0.83      0.67      0.73    519527








Cross Validation Mean Score:
 0.9994725948617257

Model Accuracy:
 0.9998652620556776

Confusion matrix:
 [[330220     46      0      0      2      0]
 [     2 173137      4      0      0      0]
 [     0      3   7691      0      1      0]
 [     2      0      0   4293      1      0]
 [     2      0      0      7   4107      0]
 [     0      0      0      0      0      9]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    330268
           1       1.00      1.00      1.00    173143
           2       1.00      1.00      1.00      7695
           3       1.00      1.00      1.00      4296
           4       1.00      1.00      1.00      4116
           5       1.00      1.00      1.00         9

   micro avg       1.00      1.00      1.00    519527
   macro avg       1.00      1.00      1.00    519527
weighted avg       1.00      1.00      1.00    519527








Cross Validation Mean Score:
 0.9991376752060294

Model Accuracy:
 0.9993609571783565

Confusion matrix:
 [[330093    103     31     19     21      1]
 [    34 173095     12      2      0      0]
 [    14     12   7662      0      7      0]
 [    20      3      1   4263      9      0]
 [    18      1      3     20   4074      0]
 [     1      0      0      0      0      8]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    330268
           1       1.00      1.00      1.00    173143
           2       0.99      1.00      0.99      7695
           3       0.99      0.99      0.99      4296
           4       0.99      0.99      0.99      4116
           5       0.89      0.89      0.89         9

   micro avg       1.00      1.00      1.00    519527
   macro avg       0.98      0.98      0.98    519527
weighted avg       1.00      1.00      1.00    519527




  'precision', 'predicted', average, warn_for)




Cross Validation Mean Score:
 0.9582485640587771

Model Accuracy:
 0.9585084124597951

Confusion matrix:
 [[321907   7954    132    133    142      0]
 [  1281 171577    106     54    125      0]
 [  2397   1428   3611    259      0      0]
 [  2825    451    764    138    118      0]
 [  2931    447      0      0    738      0]
 [     9      0      0      0      0      0]]

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97    330268
           1       0.94      0.99      0.97    173143
           2       0.78      0.47      0.59      7695
           3       0.24      0.03      0.06      4296
           4       0.66      0.18      0.28      4116
           5       0.00      0.00      0.00         9

   micro avg       0.96      0.96      0.96    519527
   macro avg       0.60      0.44      0.48    519527
weighted avg       0.95      0.96      0.95    519527




# Validating Models

In [15]:
for i, v in models:
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    classification = metrics.classification_report(y_test, v.predict(X_test))
    print()
    print('============================== {} Model Test Results =============================='.format(i))
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()  

  'precision', 'predicted', average, warn_for)




Model Accuracy:
 0.6697001894026886

Confusion matrix:
 [[73961 25502     0     0 10300     0]
 [   40 41309     0     0 16581     0]
 [   88  1862     0     0   648     0]
 [  293   783     0     0   424     0]
 [  373   304     0     0   706     0]
 [    0     2     0     0     0     0]]

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.67      0.80    109763
           1       0.59      0.71      0.65     57930
           2       0.00      0.00      0.00      2598
           3       0.00      0.00      0.00      1500
           4       0.02      0.51      0.05      1383
           5       0.00      0.00      0.00         2

   micro avg       0.67      0.67      0.67    173176
   macro avg       0.27      0.32      0.25    173176
weighted avg       0.83      0.67      0.72    173176




Model Accuracy:
 0.9994571996119554

Confusion matrix:
 [[109721     26      9      4      3      0]
 [    13  57912      5      0      

  'precision', 'predicted', average, warn_for)


In [17]:
# PREDICTING FOR TEST DATA using KNN
pred_knn = KNN_Classifier.predict(X_scaled)
pred_NB = BNB_Classifier.predict(X_scaled)
pred_log = LGR_Classifier.predict(X_scaled)
pred_dt = DTC_Classifier.predict(X_scaled)

In [None]:
# Attack Class Distribution
data[' Label'].value_counts()