In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timezone
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
"""
This code is taken straight from: https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file/32216025
"""

def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb+') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [3]:
data_full_arr = np.load('radiflow_datafull.npy')

In [4]:
data_full_rowless_arr = np.load('radiflow_rowless.npy')

In [5]:
unique_vals = load_obj("unique_vals_radiflow")
features = load_obj("radiflow_features")
features_rowless = load_obj("radiflow_rowless_features")

In [6]:
y = data_full_arr[:, -1]
X = data_full_arr[:, 2:]
X = X[:, :-1]
features_red = features[2:-1]

print(X.shape)
print(y.shape)
print("features shape: " + str(len(features_red)))

(40980, 54)
(40980,)
features shape: 54


In [7]:
features_to_remove = ["SrcAddr", "DstAddr"]
indexes_to_remove = []
for f in features_to_remove:
    indexes_to_remove.append(features_red.index(f))
    
removed_features = [x for x in features_red if x not in features_to_remove]

In [8]:
X_red = np.delete(X, indexes_to_remove, 1)
print(X_red.shape)

(40980, 52)


In [9]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
=====================
Classifier comparison
=====================

A comparison of a several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.
"""

# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import matthews_corrcoef, make_scorer
h = .02  # step size in the mesh

names = ["Nearest Neighbors", 
         #"Linear SVM", "RBF SVM", 
         #"Gaussian Process",
         "Decision Tree", 
         "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"
        ]

classifiers = [
    KNeighborsClassifier(3),
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
    ]


# iterate over classifiers
mcc = make_scorer(matthews_corrcoef)
scores = []
for name, clf in zip(names, classifiers):
    
    rcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2)
    scores.extend(cross_val_score(clf, X_red, y, cv=rcv, scoring="accuracy"))
    
    #clf.fit(X_data2019, y_data2019)
    #score = clf.score(X_data2017, y_data2017)
    
    print(name + ": " + str(np.mean(scores)) + " +- " + str(np.var(scores)))
    #print(scores)



Nearest Neighbors: 0.8779892630551489 +- 9.092747453137553e-06
[0.872132747681796, 0.8735968765251342, 0.8804294777940459, 0.878721327476818, 0.881161542215715, 0.8823816495851635, 0.8765251342118107, 0.87701317715959, 0.8823816495851635, 0.879453391898487, 0.8775012201073694, 0.8755490483162518, 0.8726207906295754, 0.8799414348462665, 0.8806734992679356, 0.8804294777940459, 0.8782332845290386, 0.8757930697901415, 0.8799414348462665, 0.8753050268423621]
Decision Tree: 0.8964677891654466 +- 0.0003538519411914202
[0.872132747681796, 0.8735968765251342, 0.8804294777940459, 0.878721327476818, 0.881161542215715, 0.8823816495851635, 0.8765251342118107, 0.87701317715959, 0.8823816495851635, 0.879453391898487, 0.8775012201073694, 0.8755490483162518, 0.8726207906295754, 0.8799414348462665, 0.8806734992679356, 0.8804294777940459, 0.8782332845290386, 0.8757930697901415, 0.8799414348462665, 0.8753050268423621, 0.9109321620302586, 0.9163006344558321, 0.9136163982430454, 0.917276720351391, 0.9114202



QDA: 0.7569999302795788 +- 0.08597841483129469
[0.872132747681796, 0.8735968765251342, 0.8804294777940459, 0.878721327476818, 0.881161542215715, 0.8823816495851635, 0.8765251342118107, 0.87701317715959, 0.8823816495851635, 0.879453391898487, 0.8775012201073694, 0.8755490483162518, 0.8726207906295754, 0.8799414348462665, 0.8806734992679356, 0.8804294777940459, 0.8782332845290386, 0.8757930697901415, 0.8799414348462665, 0.8753050268423621, 0.9109321620302586, 0.9163006344558321, 0.9136163982430454, 0.917276720351391, 0.9114202049780381, 0.915568570034163, 0.9148365056124939, 0.9184968277208394, 0.9143484626647145, 0.9133723767691557, 0.915568570034163, 0.9133723767691557, 0.9109321620302586, 0.9233772571986335, 0.9070278184480234, 0.9141044411908248, 0.9248413860419717, 0.9165446559297218, 0.9123962908735969, 0.9145924841386042, 0.9009272816007808, 0.8992191312835529, 0.9048316251830161, 0.9011713030746705, 0.9014153245485603, 0.9070278184480234, 0.9033674963396779, 0.9033674963396779, 0



In [16]:
from sklearn.tree import DecisionTreeClassifier
"""
Get inbuilt feature importance 100 times due to small variations in importance
"""
def use_inbuilt_feature_imp(claf, names, X, y):
    feature_imp_dict = {}
    for name, clf in zip(names, claf):
        for ip in range(0,1000):
            clf.fit(X, y)
            sorted_importances = []
            index =0
            try:
                for i in clf.feature_importances_:
                    sorted_importances.append((index, i))
                    index+=1
            except AttributeError:
                for i in clf.coef_[0]:
                    sorted_importances.append((index, i))
                    index+=1
            sorted_importances.sort(key = lambda tup : tup[1], reverse=True)
            feature_imp_dict[name + str(ip)] = sorted_importances
    return feature_imp_dict

clf=[DecisionTreeClassifier(), RandomForestClassifier()]
names = ["Decision Tree", "Random Forest"]

feature_imp_dict = use_inbuilt_feature_imp(clf, names, X_red, y)

In [17]:
import math

"""
Sort by feature importance for each classifier
"""
def sort_inbuilt_feature_imp(feature_ranks):
    for clf, res in feature_ranks.items():
        res.sort(key=lambda tup: abs(tup[1]), reverse=True)
        #print(abs(res[0][1]))
        feature_ranks[clf] = res
    return feature_ranks
        #print(res)
"""
Takes sorted feature importance dicts and replaces the score with a rank 
"""
def replace_score_with_rank(feature_ranks):
    ranks = {}
    for clf, result in feature_ranks.items():
        result_ranks = []
        for i in range(0, len(result)):
            if result[i][1] == 0.0:
                element = (result[i][0], len(result)-1)
            else:   
                element = (result[i][0], i)
            result_ranks.append(element)
        ranks[clf] = result_ranks
    return ranks

"""
Gets averages for inbuilt feature classification (as that was run 100 times for each classifier to accomadate for
any variations)
"""
def get_averages(feature_imp_dict, classifier):
    average_feature_imp = {}
    average_feature_list = []
    keysSeen = 0
    firstTime = True
    for key, value in feature_imp_dict.items():
        if classifier in key:
            if firstTime:
                #print(key)
                firstTime=False
            keysSeen +=1
            for tup in value:
                if tup[0] in average_feature_imp: 
                    average_feature_imp[tup[0]] += tup[1]
                else:
                    average_feature_imp[tup[0]] = tup[1]
    for key, value in average_feature_imp.items():
        average_feature_list.append((key, value/keysSeen))
    average_feature_list.sort(key=lambda tup : abs(tup[1]), reverse=True)
    return average_feature_list

def getAveragesForAllClfs(feature_imp_dict, clfs):
    average_feature_imp_dict = {}
    for clf in clfs:
        average_feature_imp_dict[clf] = get_averages(feature_imp_dict, clf)
    return average_feature_imp_dict

feature_imp_dict = sort_inbuilt_feature_imp(feature_imp_dict)
#feature_rank_dict =  replace_score_with_rank(feature_imp_dict)
average_feature_rank_dict = getAveragesForAllClfs(feature_imp_dict, ["Decision Tree", "Random Forest"])

index = 0
for name, features in average_feature_rank_dict.items():
    index = 0
    print("")
    print(name)
    for feat in features:
        #print(hook_vocab_combined[feat[0]] + ": " + str(feat[1]))
        print(removed_features[feat[0]] + ": " +  str(feat[1]))
        index+=1



Decision Tree
Sport: 0.26655126003466945
Dport: 0.12086191097009007
Offset: 0.11666953760949991
dMeanPktSz: 0.08143735660937144
SrcRate: 0.03974675311924701
Load: 0.03684986009234983
DstLoad: 0.031697273096533723
Rate: 0.027492564384940778
DstLoss: 0.026089771380736964
SrcLoad: 0.0213997588826786
pLoss: 0.020880179392896096
sMeanPktSz: 0.02044797395121968
TotBytes: 0.018524864503283653
Sum: 0.016651632533014864
RunTime: 0.016644653001193402
Mean: 0.016605451360213134
Min: 0.016595226917715342
Max: 0.016592253008901048
Dur: 0.016580666601479818
DstRate: 0.011703621530310656
SynAck: 0.010036977025197631
AckDat: 0.009650021780046055
TotPkts: 0.008040528137737014
SrcBytes: 0.0076807713807397855
TcpRtt: 0.006633811835045655
INT: 0.004184602642789478
DstBytes: 0.0021830151663465715
Proto: 0.001931427134576
SrcPkts: 0.0017488454316505167
Loss: 0.0015501660197747404
DstPkts: 0.0012652639859799233
FIN: 0.001198638871115868
Flg-d: 0.0009020746709179322
SrcLoss: 0.0008482901424179219
CON: 0.0005