In [13]:
import warnings

import tensorflow as tf
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np

from joblib import load
from numpy import genfromtxt

import pickle
import pandas as pd
import os

In [20]:
# Import Model
def import_model():
    list_model = []
    path = './models'
    for i in os.listdir(path):
        list_model.append(i)
    print(list_model)
    return list_model

In [4]:
# Import normal Dataset
def import_dataset():
    with open('./Dataset/csv/Attack_merge.csv', newline='') as csvfile:
        rows = pd.read_csv(csvfile,header=None)
        y = rows[10]
        x = rows.drop([10], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(x, y , test_size=0.2)
    return X_train, X_test, y_train, y_test

In [89]:
import itertools
def combinations(list_model, num):
    return list(x for x in itertools.combinations(list_model, num))

In [91]:
combination_model = combinations(['CNN.h5', 'DNN4.h5', 'DT.joblib', 'KNN.joblib', 'LR.joblib', 'LSTM.h5', 'RF.joblib', 'SVM.joblib', 'XGB.joblib'], 3)
print(combination_model[0][0])

CNN.h5


In [53]:
def kappa_statistics(pred_labels, y_test):
    ks = 0
    length = len(pred_labels)
    for n in range(len(pred_labels[0])):
        # value_b = ''
        for o in range(0, length):
            if o == 0:
                continue
            elif pred_labels[o][n] != pred_labels[o-1][n]:
                break
            elif o == length-1:
                ks += 1
            else:
                continue
    ks_percentage = ks/len(y_test) * 100
    print("KAPPA STATS: " + str(ks_percentage))

    return ks_percentage

In [54]:
def double_fault(pred_labels, y_test):
    wrong_gt = ([l for l, m in enumerate(y_test) if m != pred_labels[0][l]])
    df = 0
    length = len(pred_labels)
    for n, o in enumerate(wrong_gt):
        for p in range(0, length):
            if p == 0:
                continue
            elif pred_labels[p][o] != pred_labels[p-1][o]:
                break
            elif p == length-1:
                df += 1
            else:
                continue
    df_percentage = df/len(y_test) * 100
    print("DOUBLE FAULT: " + str(df_percentage))
    
    return df_percentage

In [68]:
def most_common(most):
    # common = []
    for b in range(0, len(most)):
        if len(most[b]) == 0:
            most[b] = b
        else:
            most[b] = max(set(most[b]), key=most[b].count)
    return max(set(most), key=most.count)

In [69]:
def prediction_test(clf, x_test):
    if str(clf)[0:3] == 'Lin':
        temp = (clf.predict(x_test))
        score_temp = []
        for i in range(0, len(temp)):
            if temp[i] == 0:
                score_temp.append(np.array([1.0, 0.0], dtype=float))
            else:
                score_temp.append(np.array([0.0, 1.0], dtype=float))
        return score_temp
    else:
        return clf.predict_proba(x_test)

In [70]:
def voting_average(list_model, X_test, y_test):
    #For soft voting:
    pred = np.asarray([prediction_test(clf, X_test) for clf in list_model])
    pred = np.average(pred, axis=0, weights=None)
    pred = np.argmax(pred, axis=1)
    pred_labels = np.rint(pred)
    f1 = sklearn.metrics.f1_score(y_test, pred_labels, average="weighted")
    
    return f1

In [95]:
def ensemble_learning(num_teams):
    good_team = []
    path = "./models/"
    list_model = import_model()
    X_train, X_test, y_train, y_test = import_dataset()
    y_train_ex = tf.expand_dims(y_train, axis=1)
    X_train_ex = tf.expand_dims(X_train, axis=2)
    y_test_ex = tf.expand_dims(y_test, axis=1)
    X_test_ex = tf.expand_dims(X_test, axis=2)

    combination_model = combinations(list_model=list_model, num = num_teams)
    print(combination_model)
    for j in range(0, len(combination_model)):
        pred_labels = []
        for k in range(0, len(combination_model[j])):
            if combination_model[j][k] == "CNN.h5" or combination_model[j][k] == "LSTM.h5":
                prediction = tf.keras.models.load_model(path + combination_model[j][k]).predict(X_test_ex)

            elif combination_model[j][k] == "DNN4.h5":
                prediction = tf.keras.models.load_model(path + combination_model[j][k]).predict(X_test)
                prediction = prediction[0:,1]

            else:
                prediction = load(path + combination_model[j][k]).predict(X_test)

            pred_labels.append(np.rint(prediction))
        #print(pred_labels)
        print(combination_model[j])
        ks_percentage = kappa_statistics(pred_labels, y_test)
        df_percentage = double_fault(pred_labels, y_test)
        
        if ks_percentage > 50 and ks_percentage < 91 and df_percentage < 50:
            good_team.append(combination_model[j])
    
    best_team = most_common(good_team)
    print("good_team", good_team)
    print("best_team", best_team)

    # f1_score = voting_average(best_team, X_test, y_test)
    # print("f1_score", f1_score)
    
    # preds = model.predict(X_test_ex, steps=1)
    # pred_labels = np.rint(preds)
    # accuracy = sklearn.metrics.accuracy_score(y_test_ex, pred_labels)
    # f1 = sklearn.metrics.f1_score(y_test_ex, pred_labels, average="binary")
    # prec = sklearn.metrics.precision_score(y_test_ex, pred_labels, average="binary")
    # recall = sklearn.metrics.recall_score(y_test_ex, pred_labels, average="binary")
    # print("accuracy: " + str(accuracy))
    # print("f1: " + str(f1))
    # print("prec: " + str(prec))
    # print("recall: " + str(recall))


In [84]:
most_common(['CNN.h5', 'DNN4.h5'])

'N'

In [85]:
ensemble_learning(2)

['CNN.h5', 'DNN4.h5', 'DT.joblib', 'KNN.joblib', 'LR.joblib', 'LSTM.h5', 'RF.joblib', 'SVM.joblib', 'XGB.joblib']
(('CNN.h5', 'DNN4.h5'), ('CNN.h5', 'DT.joblib'), ('CNN.h5', 'KNN.joblib'), ('CNN.h5', 'LR.joblib'), ('CNN.h5', 'LSTM.h5'), ('CNN.h5', 'RF.joblib'), ('CNN.h5', 'SVM.joblib'), ('CNN.h5', 'XGB.joblib'), ('DNN4.h5', 'DT.joblib'), ('DNN4.h5', 'KNN.joblib'), ('DNN4.h5', 'LR.joblib'), ('DNN4.h5', 'LSTM.h5'), ('DNN4.h5', 'RF.joblib'), ('DNN4.h5', 'SVM.joblib'), ('DNN4.h5', 'XGB.joblib'), ('DT.joblib', 'KNN.joblib'), ('DT.joblib', 'LR.joblib'), ('DT.joblib', 'LSTM.h5'), ('DT.joblib', 'RF.joblib'), ('DT.joblib', 'SVM.joblib'), ('DT.joblib', 'XGB.joblib'), ('KNN.joblib', 'LR.joblib'), ('KNN.joblib', 'LSTM.h5'), ('KNN.joblib', 'RF.joblib'), ('KNN.joblib', 'SVM.joblib'), ('KNN.joblib', 'XGB.joblib'), ('LR.joblib', 'LSTM.h5'), ('LR.joblib', 'RF.joblib'), ('LR.joblib', 'SVM.joblib'), ('LR.joblib', 'XGB.joblib'), ('LSTM.h5', 'RF.joblib'), ('LSTM.h5', 'SVM.joblib'), ('LSTM.h5', 'XGB.joblib'

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KAPPA STATS: 92.22669349429913
DOUBLE FAULT: 0.5857366420746702
KAPPA STATS: 92.50167672702884
DOUBLE FAULT: 0.9233176838810642
KAPPA STATS: 91.58953722334005
DOUBLE FAULT: 0.7578806170355467
KAPPA STATS: 97.75542141739325
DOUBLE FAULT: 7.49832327297116
KAPPA STATS: 92.45249273418288
DOUBLE FAULT: 1.7057902973395929
KAPPA STATS: 96.55712050078247
DOUBLE FAULT: 4.744019673597139
KAPPA STATS: 97.96109993293092
DOUBLE FAULT: 7.49832327297116


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KAPPA STATS: 91.32349653476413
DOUBLE FAULT: 0.5857366420746702
KAPPA STATS: 98.2472613458529
DOUBLE FAULT: 0.9188464118041582
KAPPA STATS: 90.94567404426559
DOUBLE FAULT: 0.9255533199195171
KAPPA STATS: 97.04896042924211
DOUBLE FAULT: 0.8361278783813995
KAPPA STATS: 95.39011848871004
DOUBLE FAULT: 0.9926224010731053
KAPPA STATS: 91.15135255980327
DOUBLE FAULT: 0.9255533199195171
KAPPA STATS: 98.22714062150682


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


DOUBLE FAULT: 0.8696624189581935
KAPPA STATS: 90.02906326849988
DOUBLE FAULT: 0.7578806170355467
KAPPA STATS: 97.50055890900961
DOUBLE FAULT: 1.3525598032640287
KAPPA STATS: 94.50033534540577
DOUBLE FAULT: 0.8383635144198525
KAPPA STATS: 90.23474178403755
DOUBLE FAULT: 0.7578806170355467


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KAPPA STATS: 98.97831433042701
DOUBLE FAULT: 1.5358819584171697
KAPPA STATS: 90.89201877934272
DOUBLE FAULT: 1.7057902973395929
KAPPA STATS: 95.01006036217305
DOUBLE FAULT: 4.750726581712497
KAPPA STATS: 99.45003353454057
DOUBLE FAULT: 9.023027051196065
KAPPA STATS: 89.7674938520009


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


DOUBLE FAULT: 0.5879722781131232
KAPPA STATS: 95.39906103286386
DOUBLE FAULT: 1.8041582830315224
KAPPA STATS: 91.09769729488039
DOUBLE FAULT: 1.7057902973395929


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KAPPA STATS: 97.71294433266264
DOUBLE FAULT: 1.4196288844176168
KAPPA STATS: 95.2157388777107
DOUBLE FAULT: 4.750726581712497
KAPPA STATS: 94.23429465682986
DOUBLE FAULT: 0.6662195394589762
KAPPA STATS: 89.97317236753857
DOUBLE FAULT: 0.5879722781131232
good_team ['DT.joblib', 'KNN.joblib', 'KNN.joblib', 'LR.joblib', 'XGB.joblib', 'XGB.joblib']
best_team XGB.joblib


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
