In [1]:
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join

# Machine learning algorithms from sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_roc_curve
from sklearn.preprocessing import  MinMaxScaler 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import random
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFECV

# from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Apply seaborn's "darkgrid" theme to the figures drawn later in the notebook.
sns.set(style="darkgrid")

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# NOTE(review): Python's built-in `random` module is imported above but is not seeded in this cell.
np.random.seed(1234)

import warnings
# Suppress every warning for the rest of the session. This also hides
# deprecation and convergence warnings raised while fitting the models below.
warnings.filterwarnings('ignore')
# Alternative kept for reference: show each distinct warning once instead of hiding them all.
# warnings.filterwarnings(action='once')

In [2]:
# Directory holding the training CSV files (relative to the notebook's working directory).
trainigRootPath = "../../Feature_Slection(20features)/ADASYN/Training Data"
# Collect the regular files in that directory.
# - Hidden entries (e.g. macOS's ".DS_Store") are skipped in the filter itself, so the
#   cell no longer raises ValueError on machines where ".DS_Store" does not exist.
# - sorted() fixes the load order, which os.listdir leaves arbitrary; the row order of
#   the concatenated frame (and therefore the seeded train/test split) is then repeatable.
trainingFiles = sorted(
    join(trainigRootPath, f)
    for f in listdir(trainigRootPath)
    if not f.startswith(".") and isfile(join(trainigRootPath, f))
)

In [4]:
# Read each training file into its own frame, then stack them row-wise.
# ignore_index=True discards the per-file row numbers and renumbers from 0.
training_frames = [pd.read_csv(csv_path) for csv_path in trainingFiles]
df = pd.concat(training_frames, ignore_index=True)

In [5]:
# Encode the class names in the "Label" column as integer codes (0 .. n_classes - 1).
# `encoder` is kept so the codes can be mapped back to names via encoder.classes_.
encoder = LabelEncoder()
y = encoder.fit_transform(df["Label"])
# Features are every column except the target. Dropping "Label" by name, rather than
# slicing off whichever column happens to be last, keeps the target out of X (and keeps
# every real feature in) regardless of the column order in the CSV files.
X = df.drop(columns="Label")

In [6]:
# Lookup table from encoded class id back to the original label text.
classDict = {
    class_id: class_name
    for class_id, class_name in zip(np.unique(y), encoder.classes_)
}
classDict

{0: 'BENIGN',
 1: 'BotnetARES',
 2: 'BruteForce',
 3: 'DoS',
 4: 'Infiltration',
 5: 'PortScan',
 6: 'WebAttack'}

In [7]:
def prediction(X_train, X_test, y_train, y_test):
    """Fit four baseline classifiers and score each one on the test split.

    Every model is trained on (X_train, y_train) and evaluated on
    (X_test, y_test) with the micro-averaged F1 score, rounded to two
    decimals. Note that for single-label multiclass targets the
    micro-averaged F1 is the same number as plain accuracy.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Class labels matching the rows of X_train and X_test.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Classifier' and 'f1', in the
        order: Support Vector Machine, KNeighbors Classifier,
        Logistic Classifier, Random Forest.
    """
    # k for KNN is one fifth of the training-set size. Clamp to 1 so a training
    # set with fewer than five rows does not request zero neighbours, which
    # KNeighborsClassifier rejects.
    n_neighbors = max(1, len(X_train) // 5)

    # Display name -> unfitted estimator. Insertion order is the row order of the result.
    # XGBoost is currently disabled (its import is commented out at the top of the
    # file); to re-enable it, restore the import and add 'Xgboost': XGBClassifier() here.
    models = {
        'Support Vector Machine': SVC(C=0.4, kernel='rbf', gamma='auto'),
        'KNeighbors Classifier': KNeighborsClassifier(n_neighbors=n_neighbors),
        'Logistic Classifier': LogisticRegression(multi_class='ovr', solver='liblinear'),
        'Random Forest': RandomForestClassifier(n_estimators=1000, min_samples_split=15, random_state=42),
    }

    prediction_f1 = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        prediction_f1[name] = round(f1_score(y_test, y_pred, average='micro'), 2)

    # F1 score DataFrame: one (classifier name, score) pair per row.
    prediction_accuracy_df = pd.DataFrame(list(prediction_f1.items()), columns=['Classifier', 'f1'], index=None)
    return prediction_accuracy_df


In [8]:
def accuracy_plot(a, title):
    """Draw a horizontal bar chart of F1 score per classifier.

    Parameters
    ----------
    a : pandas.DataFrame
        Frame with a 'Classifier' column (bar labels) and an 'f1' column
        (bar lengths), such as the frame returned by `prediction`.
    title : str
        Text used as the figure title.

    Returns
    -------
    None
        The figure is created through pyplot and left for the notebook to render.
    """
    names = list(a.Classifier)
    scores = list(a.f1)
    positions = np.arange(len(names))

    fig, ax1 = plt.subplots(1, 1, figsize=(8, 4))
    fig.suptitle(title)
    ax1.barh(positions, scores, color='#488cfa', height=0.9)
    # Label every bar with its score. enumerate() gives each bar its own row, so
    # classifiers with tied scores are no longer all annotated on the first matching
    # bar (which is what list.index() did). The x offset is small because the axis
    # spans 0..1; the previous offset of 1 pushed the labels a full axis width off-chart.
    for row, score in enumerate(scores):
        ax1.text(score + 0.01, row, score, fontsize=12)
    ax1.set_yticks(positions)
    ax1.set_yticklabels(names)
    ax1.set_xlim((0, 1))
    ax1.set_xlabel("f1", fontsize=15)
    ax1.grid(False)
    ax1.set_ylabel("Models", fontsize=15)

In [None]:
# Hold out 30% of the rows as the test split; the fixed random_state makes
# the shuffle, and therefore the split, repeatable between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Train and score every baseline model; the result is a DataFrame
# with one row per classifier.
p_df = prediction(
    X_train,
    X_test,
    y_train,
    y_test,
)