In [1]:
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import scipy.linalg as la
import matplotlib.pyplot as plt

from sklearn import svm, model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin
from sklearn.cluster import KMeans

In [2]:
np.random.seed(7)

Anonimization algorithm

In [3]:
def anonimization(data):
    #calculate the mean of each column
    mean = np.array(np.mean(data, axis=0).T)

    # center data
    data_centered = data - mean

    # calculate the covariance matrix
    cov_matrix = np.cov(data_centered, rowvar=False)
   
    # calculate the eignvalues and eignvectors
    evals, evecs = la.eigh(cov_matrix)

    # sort them
    idx = np.argsort(evals)[::-1]

    # Each columns of this matrix is an eingvector
    evecs = evecs[:,idx]
    evals = evals[idx]

    # explained variance
    variance_retained=np.cumsum(evals)/np.sum(evals)

    # calculate the transformed data
    data_transformed=np.dot(evecs.T, data_centered.T).T

    # randomize eignvectors
    new_evecs = evecs.copy().T
    for i in range(len(new_evecs)):
        np.random.shuffle(new_evecs[i])
    new_evecs = np.array(new_evecs).T

    # go back to the original dimension
    data_original_dimension = np.dot(data_transformed, new_evecs.T) 
    data_original_dimension += mean

    return data_original_dimension

Clusterization Algorithms


In [4]:
def find_clusters(X, k):   
    Kmean = KMeans(n_clusters=k)
    Kmean.fit(X)
    return Kmean.labels_

In [5]:
def anonimization_clustering(data, y, k):
    # generate K data clusters
    clusters = find_clusters(data, k)

    # bucketize the index of each cluster
    indices = dict()
    for i in range(len(clusters)):
        if clusters[i] not in indices.keys():
            indices[ clusters[i] ] = []    
        indices[ clusters[i] ].append(i)

    data_anonymized, y_in_new_order = None, None

    # anonymize each cluster individually
    for k in indices.keys():
        if data_anonymized is None and y_in_new_order is None:
            data_anonymized = anonimization(data[ indices[k] ])
            y_in_new_order = y[ indices[k] ]
            empty_flag = False
        else:
            data_anonymized = np.concatenate((data_anonymized, anonimization(data[ indices[k] ]) ), axis=0)
            y_in_new_order = np.concatenate((y_in_new_order, y[ indices[k] ]), axis=0)

    # data_anonymized = np.concatenate((data_anonymized, np.array([y_in_new_order]).T), axis=1)
    # print(pd.DataFrame(data_anonymized))

    return data_anonymized, y_in_new_order

Cross Validation

In [6]:
def cross_validate_k_fold(X, y, anon_training, anon_test, model, model_name):
    kf = StratifiedKFold(n_splits=10)
    scaler = StandardScaler()

    accuracy, precision, recall, f1 = [], [], [], []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if anon_training == True:
            X_train, y_train = anonimization_clustering(X_train, y_train, 3)

        if anon_test == True:
            X_test, y_test = anonimization_clustering(X_test, y_test, 3)

        scaler.fit(X_train)
        scaler.fit(X_test)
            
        model.fit(X_train,y_train)

        y_pred = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    results = {'accuracy' : np.array(accuracy), 
           'precision' : np.array(precision),
           'recall' : np.array(recall), 
           'f1_score' : np.array(f1)}

    print(model_name, anon_training, anon_test)
    for k in results.keys():
        if k != 'fit_time' and k != 'score_time':
            print(k, '---> mean:', results[k].mean(), ' |  std:', results[k].std())

    return [ anon_training, anon_test, results['accuracy'].mean(), results['precision'].mean(), results['recall'].mean(), results['f1_score'].mean() ]

Read data

In [7]:
dataset = pd.read_csv('df_original_100000.csv')
# dataset = pd.DataFrame([[71, 29, 33, 1], [75, 19, 43, 1], [7, 9, 3, 1], [13, 21, 7, 0], [3, 2, 17, 1]])
# dataset.columns = ['A', 'B', 'C', 'Label']
# print(dataset)
# dataset = anonimization(dataset)

print(dataset)

y = np.array(dataset['Label'])
del dataset['Label']
X = np.array(dataset)

print(X.shape)

       Src Port  Dst Port  Protocol  Flow Duration  Total Fwd Packet  \
0          4425     49153         6           6434                 3   
1          4425     49153         6           1173                 2   
2          4426     49153         6           6498                 3   
3          4426     49153         6           1217                 2   
4          4977     49152         6           9928                 3   
...         ...       ...       ...            ...               ...   
99995      9616        80         6       23061097                 7   
99996     51882        80         6       13414419                 3   
99997     51297        80         6       16747410                 6   
99998     30418        80         6       15701229                 4   
99999     28432        80         6       19826733                 6   

       Total Bwd packets  Total Length of Fwd Packet  \
0                      4                       196.0   
1                      

In [8]:
def get_results(model, X, y, model_name):    
    bol = [True, False]
    results = pd.DataFrame(columns=['anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score'])
    for i in range(0, 2):
        for j in range(0, 2):
            new_df = pd.DataFrame([cross_validate_k_fold(X, y, bol[i], bol[j], model, model_name)], columns=results.columns)
            results = pd.concat([results, new_df], ignore_index=True)

    for i in range(2, len(results.columns)):
        col_name = results.columns[i]
        results[col_name] = results[col_name].apply(lambda row:"{:.3%}".format(float(row)))

    return results

In [9]:
# cross_validate_k_fold_knn(X, y, True, True)
# cross_validate_k_fold_knn(X, y, True, False)
# cross_validate_k_fold_knn(X, y, False, True)
# cross_validate_k_fold_knn(X, y, False, False)

In [10]:
results_knn = get_results(KNeighborsClassifier(n_neighbors=5), X, y, 'KNN')
results_knn

KNN True True
accuracy ---> mean: 0.79502  |  std: 0.09604782975163988
precision ---> mean: 0.7463966238899781  |  std: 0.1316608404022366
recall ---> mean: 0.94608  |  std: 0.02177157780226322
f1_score ---> mean: 0.8281995361351561  |  std: 0.07485243785893606
KNN True False
accuracy ---> mean: 0.76621  |  std: 0.03777135025386305
precision ---> mean: 0.6932343451861119  |  std: 0.035309150317108746
recall ---> mean: 0.96286  |  std: 0.022348521203873872
f1_score ---> mean: 0.805350763991572  |  std: 0.024182185277977886
KNN False True
accuracy ---> mean: 0.79581  |  std: 0.0683956497154607
precision ---> mean: 0.7347211807573197  |  std: 0.0884757058986303
recall ---> mean: 0.9445600000000001  |  std: 0.022405856377295665
f1_score ---> mean: 0.8247374852726784  |  std: 0.058761952303709786
KNN False False
accuracy ---> mean: 0.99999  |  std: 2.9999999999996697e-05
precision ---> mean: 1.0  |  std: 0.0
recall ---> mean: 0.9999800000000001  |  std: 5.9999999999993395e-05
f1_score ---> 

Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,79.502%,74.640%,94.608%,82.820%
1,True,False,76.621%,69.323%,96.286%,80.535%
2,False,True,79.581%,73.472%,94.456%,82.474%
3,False,False,99.999%,100.000%,99.998%,99.999%


In [11]:
results_dtree = get_results(DecisionTreeClassifier(), X, y, 'Decision Tree')
results_dtree

Decision Tree True True
accuracy ---> mean: 0.6003299999999999  |  std: 0.15174126696452747
precision ---> mean: 0.6697858624670632  |  std: 0.17049533259231178
recall ---> mean: 0.5473600000000001  |  std: 0.3071196483457221
f1_score ---> mean: 0.5342776741638764  |  std: 0.22636929191850566


  _warn_prf(average, modifier, msg_start, len(result))


Decision Tree True False
accuracy ---> mean: 0.55085  |  std: 0.15296194461368487
precision ---> mean: 0.5803120261917069  |  std: 0.29940354555431714
recall ---> mean: 0.67472  |  std: 0.3227137208114957
f1_score ---> mean: 0.5592531741964047  |  std: 0.2244406583691458
Decision Tree False True
accuracy ---> mean: 0.75001  |  std: 0.04141481498208098
precision ---> mean: 0.7168467232803224  |  std: 0.09809209884854839
recall ---> mean: 0.8735799999999999  |  std: 0.10046497698203091
f1_score ---> mean: 0.7769707256591067  |  std: 0.03248608878427555
Decision Tree False False
accuracy ---> mean: 0.99999  |  std: 2.9999999999996697e-05
precision ---> mean: 1.0  |  std: 0.0
recall ---> mean: 0.9999800000000001  |  std: 5.9999999999993395e-05
f1_score ---> mean: 0.9999899989998999  |  std: 3.000300030003844e-05


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,60.033%,66.979%,54.736%,53.428%
1,True,False,55.085%,58.031%,67.472%,55.925%
2,False,True,75.001%,71.685%,87.358%,77.697%
3,False,False,99.999%,100.000%,99.998%,99.999%


In [12]:
results_rfc = get_results(RandomForestClassifier(n_estimators=100), X, y, 'Random Forest')
results_rfc

Random Forest True True
accuracy ---> mean: 0.71377  |  std: 0.06441172331183197
precision ---> mean: 0.6423758388294296  |  std: 0.05414992366732891
recall ---> mean: 0.98942  |  std: 0.015157163322996825
f1_score ---> mean: 0.777588024373091  |  std: 0.03942376065869718
Random Forest True False
accuracy ---> mean: 0.6577900000000001  |  std: 0.14393319596257148
precision ---> mean: 0.6358467531053485  |  std: 0.13752926204555915
recall ---> mean: 0.8765799999999999  |  std: 0.10117511354083078
f1_score ---> mean: 0.7268680362544926  |  std: 0.09712742284320093
Random Forest False True
accuracy ---> mean: 0.5626800000000001  |  std: 0.04694969222476331
precision ---> mean: 0.5355871383037124  |  std: 0.02918709099617974
recall ---> mean: 0.9922599999999999  |  std: 0.02026584318502441
f1_score ---> mean: 0.6949143694971337  |  std: 0.02118587696918723
Random Forest False False
accuracy ---> mean: 0.9999800000000001  |  std: 3.99999999999956e-05
precision ---> mean: 0.9999600079984002 

Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,71.377%,64.238%,98.942%,77.759%
1,True,False,65.779%,63.585%,87.658%,72.687%
2,False,True,56.268%,53.559%,99.226%,69.491%
3,False,False,99.998%,99.996%,100.000%,99.998%


In [13]:
results_gnb = get_results(GaussianNB(var_smoothing=1e-02), X, y, 'GaussianNB')
results_gnb

GaussianNB True True
accuracy ---> mean: 0.77655  |  std: 0.013055822455900654
precision ---> mean: 0.7068511590053486  |  std: 0.0071728925206414815
recall ---> mean: 0.9447599999999999  |  std: 0.022041923691003024
f1_score ---> mean: 0.8086305193454439  |  std: 0.012503738356563859
GaussianNB True False
accuracy ---> mean: 0.77655  |  std: 0.013055822455900654
precision ---> mean: 0.7068511590053486  |  std: 0.0071728925206414815
recall ---> mean: 0.9447599999999999  |  std: 0.022041923691003024
f1_score ---> mean: 0.8086305193454439  |  std: 0.012503738356563859
GaussianNB False True
accuracy ---> mean: 0.64108  |  std: 0.0605414205317318
precision ---> mean: 0.9980593043818443  |  std: 0.0014991799227678214
recall ---> mean: 0.28264  |  std: 0.1210265028826331
f1_score ---> mean: 0.42908444626490505  |  std: 0.12186315645664125
GaussianNB False False
accuracy ---> mean: 0.64108  |  std: 0.0605414205317318
precision ---> mean: 0.9980593043818443  |  std: 0.0014991799227678214
recal

Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,77.655%,70.685%,94.476%,80.863%
1,True,False,77.655%,70.685%,94.476%,80.863%
2,False,True,64.108%,99.806%,28.264%,42.908%
3,False,False,64.108%,99.806%,28.264%,42.908%


In [14]:
results_knn.insert(0, "model", ['KNN', 'KNN', 'KNN', 'KNN'], True)
results_dtree.insert(0, "model", ['Decision Tree', 'Decision Tree', 'Decision Tree', 'Decision Tree'], True)
results_rfc.insert(0, "model", ['Random Forest', 'Random Forest', 'Random Forest', 'Random Forest'], True)
results_gnb.insert(0, "model", ['Gaussian NB', 'Gaussian NB', 'Gaussian NB', 'Gaussian NB'], True)


In [15]:
results = pd.DataFrame(columns=['model', 'anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score'])

results = pd.concat([results, results_knn], ignore_index=True)
results = pd.concat([results, results_dtree], ignore_index=True)
results = pd.concat([results, results_rfc], ignore_index=True)
results = pd.concat([results, results_gnb], ignore_index=True)

In [16]:
results

Unnamed: 0,model,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,KNN,True,True,79.502%,74.640%,94.608%,82.820%
1,KNN,True,False,76.621%,69.323%,96.286%,80.535%
2,KNN,False,True,79.581%,73.472%,94.456%,82.474%
3,KNN,False,False,99.999%,100.000%,99.998%,99.999%
4,Decision Tree,True,True,60.033%,66.979%,54.736%,53.428%
5,Decision Tree,True,False,55.085%,58.031%,67.472%,55.925%
6,Decision Tree,False,True,75.001%,71.685%,87.358%,77.697%
7,Decision Tree,False,False,99.999%,100.000%,99.998%,99.999%
8,Random Forest,True,True,71.377%,64.238%,98.942%,77.759%
9,Random Forest,True,False,65.779%,63.585%,87.658%,72.687%
