In [1]:
!pip install scikit-plot



In [2]:
import numpy as np
import pandas as pd
import pickle
%matplotlib inline

# Load Dataset.csv (path is relative to the kernel's working directory) and
# preview the first five rows.
df = pd.read_csv('Dataset.csv')
df.head()

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Original_label
0,200926,12,40474,2014,443,6,07-05-2017 08:53,70130200,11,12,...,617212.0,0.0,617212,617212,69371900,0.0,69371900,69371900,37,SMSMALWARE_FAKEMART
1,255471,13,34095,2894,443,6,28-08-2017 10:16,20002200,8,13,...,162421.0,0.0,162421,162421,19839800,0.0,19839800,19839800,20,RANSOMWARE_SVPENG
2,57419,11,60228,5590,443,6,30-06-2017 01:11,63,1,1,...,0.0,0.0,0,0,0,0.0,0,0,36,SMSMALWARE_FAKEINST
3,106254,12,54188,698,80,6,20-06-2017 03:31,23445200,2,0,...,0.0,0.0,0,0,0,0.0,0,0,10,BENIGN
4,188352,12,46137,1934,443,6,27-06-2017 06:02,36938,1,1,...,0.0,0.0,0,0,0,0.0,0,0,33,SCAREWARE_VIRUSSHIELD


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(18851, 74)

In [4]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

Flow ID             0
Source IP           0
Source Port         0
Destination IP      0
Destination Port    0
                   ..
Idle Std            0
Idle Max            0
Idle Min            0
Label               0
Original_label      0
Length: 74, dtype: int64

In [5]:
from scikitplot.metrics import plot_precision_recall
import matplotlib.pyplot as plt

In [6]:
def save_model(model, frs, label, out_dir):
    """Fit ``model`` on the given data and pickle the fitted model to disk.

    Parameters
    ----------
    model : object exposing ``fit(frs, label)``
        Fitted in place, then serialized.
    frs : features, passed to ``model.fit`` unchanged.
    label : targets, passed to ``model.fit`` unchanged.
    out_dir : path of the pickle file to write (a file path, despite the
        name). An existing file at that path is overwritten.
    """
    model.fit(frs, label)
    # The context manager closes (and flushes) the file even if pickling
    # raises, instead of relying on garbage collection to release the handle.
    with open(out_dir, 'wb') as out_file:
        pickle.dump(model, out_file)

In [7]:
def validate(in_dir, X_test, y_test, title):
    """Load a pickled classifier and plot its precision-recall curve.

    Parameters
    ----------
    in_dir : path of the pickle file to read (a file path, despite the name).
    X_test : features passed to the loaded model's ``predict_proba``.
    y_test : true labels corresponding to ``X_test``.
    title : str
        Appended to the fixed figure-title prefix.

    The figure is shown immediately; nothing is returned.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files written by
    # a trusted source such as save_model in this notebook.
    with open(in_dir, 'rb') as model_file:
        model = pickle.load(model_file)
    y_probas = model.predict_proba(X_test)
    # classes_to_plot=[] requests no per-class curves, matching the title's
    # "micro-averaged over all classes".
    plot_precision_recall(
        y_test, y_probas, classes_to_plot=[],
        title='Precision-recall curve micro-averaged over all classes for ' + title)
    plt.show()

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [9]:
# Feature matrix: every column except the two label columns and the timestamp.
# Index.difference returns the remaining column names in sorted order.
non_feature_cols = ['Label', 'Original_label', 'Timestamp']
feature_cols = df.columns.difference(non_feature_cols)
all_frs = df[feature_cols]

In [10]:
# Standardize the feature columns with sklearn.preprocessing.scale.
# NOTE(review): the scaling statistics are computed over the full dataset,
# before the train/test split in a later cell, so test rows influence the
# scaling of the training data. Consider fitting a scaler on the training
# split only.
X_scaled = preprocessing.scale(all_frs)

In [11]:
# Target vector: the numeric 'Label' column.
label = df['Label']

In [12]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every downstream score and plot, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, label, test_size=0.3, random_state=42)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.dummy import DummyClassifier

In [14]:
# Hyper-parameter grid searched for the bagging classifier:
# 3 x 2 x 2 = 12 candidate combinations.
param_grid = dict(
    max_features=[0.5, 0.75, 1.0],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid. The base
# estimator is seeded so that repeated runs rank the candidates identically;
# an unseeded BaggingClassifier resamples differently on every fit.
CV_bagging = GridSearchCV(
    estimator=BaggingClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
CV_bagging.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
CV_bagging.best_params_

In [None]:
# Each entry is (estimator, pickle file path, display title).
# The dummy "most frequent class" classifier serves as the baseline.
models = [
    (
        DummyClassifier(strategy="most_frequent"),
        'baseline.pkl',
        'Zero R',
    ),
    (
        BaggingClassifier(bootstrap=False, bootstrap_features=True),
        'bagging.pkl',
        'Bagging classifier',
    ),
]

In [None]:
# Fit every model on the training split and pickle it to its file path.
for estimator, pickle_path, _title in models:
    save_model(estimator, X_train, y_train, pickle_path)

In [None]:
# Reload each pickled model and plot its precision-recall curve on the
# held-out split.
for _estimator, pickle_path, display_name in models:
    validate(pickle_path, X_test, y_test, display_name)

In [None]:
from scikitplot.estimators import plot_learning_curve

In [None]:
# Learning curve of each model on the training split. A per-model title is
# passed because the default title is the same for every figure, which makes
# the plots indistinguishable.
for clr, _, title in models:
    plot_learning_curve(clr, X_train, y_train,
                        title='Learning curve (training split) for ' + title)
    plt.show()


In [None]:
# Learning curve of each model on the held-out split, titled per model so the
# figures can be told apart.
# NOTE(review): this refits and cross-validates on the test data, so the split
# no longer acts as unseen data for these plots. Confirm this is intended.
for clr, _, title in models:
    plot_learning_curve(clr, X_test, y_test,
                        title='Learning curve (test split) for ' + title)
    plt.show()