In [1]:
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import scipy.linalg as la
import matplotlib.pyplot as plt

from sklearn import svm, model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin
from sklearn.cluster import KMeans

Anonimization algorithm

In [2]:
def anonimization(data):
    # center data
    data_centered = data - np.mean(data, axis=0)

    # calculate the covariance matrix
    cov_matrix = np.cov(data_centered, rowvar=False)
   
    # calculate the eignvalues and eignvectors
    evals, evecs = la.eigh(cov_matrix)

    # sort them
    idx = np.argsort(evals)[::-1]

    # Each columns of this matrix is an eingvector
    evecs = evecs[:,idx]
    evals = evals[idx]

    # explained variance
    variance_retained=np.cumsum(evals)/np.sum(evals)

    # calculate the transformed data
    data_transformed=np.dot(evecs.T, data_centered.T).T

    # generate random eignvectors
    new_evecs = []
    for j in range(len(evecs[0])):
        v = np.array(evecs[ : , j:j+1 ])
        u = np.random.normal(loc=v.mean(axis=0), scale=v.std(axis=0), size=len(v))
        new_evecs.append(u)
    new_evecs = np.array(new_evecs).T

    # go back to the original dimension
    data_original_dimension = np.dot(data_transformed, new_evecs.T) 
    data_original_dimension += np.mean(X, axis=0)

    return data_original_dimension

Clusterization Algorithms


In [3]:
def find_clusters(X, k):   
    Kmean = KMeans(n_clusters=k)
    Kmean.fit(X)
    return Kmean.labels_

In [4]:
def anonimization_clustering(data, y, k):
    # generate K data clusters
    clusters = find_clusters(data, k)

    # bucketize the index of each cluster
    indices = dict()
    for i in range(len(clusters)):
        if clusters[i] not in indices.keys():
            indices[ clusters[i] ] = []    
        indices[ clusters[i] ].append(i)

    data_anonymized, y_in_new_order = None, None

    # anonymize each cluster individually
    for k in indices.keys():
        if data_anonymized is None and y_in_new_order is None:
            data_anonymized = anonimization(data[ indices[k] ])
            y_in_new_order = y[ indices[k] ]
            empty_flag = False
        else:
            data_anonymized = np.concatenate((data_anonymized, anonimization(data[ indices[k] ]) ), axis=0)
            y_in_new_order = np.concatenate((y_in_new_order, y[ indices[k] ]), axis=0)

    # data_anonymized = np.concatenate((data_anonymized, np.array([y_in_new_order]).T), axis=1)
    # print(pd.DataFrame(data_anonymized))

    return data_anonymized, y_in_new_order

Cross Validation KNN

In [5]:
# def cross_validate_k_fold_knn(X, y, anon_training, anon_test):
#     kf = StratifiedKFold(n_splits=10)

#     accuracy, precision, recall, f1 = [], [], [], []

#     for train_index, test_index in kf.split(X, y):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         if anon_training == True:
#             X_train, y_train = anonimization_clustering(X_train, y_train, 5)

#         if anon_test == True:
#             X_test, y_test = anonimization_clustering(X_test, y_test, 5)

#         knn_model = KNeighborsClassifier(n_neighbors=3)
#         knn_model.fit(X_train,y_train)

#         y_pred = knn_model.predict(X_test)

#         accuracy.append(accuracy_score(y_test, y_pred))
#         precision.append(precision_score(y_test, y_pred))
#         recall.append(recall_score(y_test, y_pred))
#         f1.append(f1_score(y_test, y_pred))

#     results = {'accuracy' : np.array(accuracy), 
#            'precision' : np.array(precision),
#            'recall' : np.array(recall), 
#            'f1_score' : np.array(f1)}

#     print('KNN', anon_training, anon_test)
#     for k in results.keys():
#         if k != 'fit_time' and k != 'score_time':
#             print(k, '---> mean:', results[k].mean(), ' |  std:', results[k].std())
#     print()

Cross Validation

In [6]:
def cross_validate_k_fold(X, y, anon_training, anon_test, model, model_name):
    kf = StratifiedKFold(n_splits=10)
    scaler = StandardScaler()

    accuracy, precision, recall, f1 = [], [], [], []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if anon_training == True:
            X_train, y_train = anonimization_clustering(X_train, y_train, 5)

        if anon_test == True:
            X_test, y_test = anonimization_clustering(X_test, y_test, 5)

        scaler.fit(X_train)
        scaler.fit(X_test)
            
        model.fit(X_train,y_train)

        y_pred = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    results = {'accuracy' : np.array(accuracy), 
           'precision' : np.array(precision),
           'recall' : np.array(recall), 
           'f1_score' : np.array(f1)}

    print(model_name, anon_training, anon_test)
    for k in results.keys():
        if k != 'fit_time' and k != 'score_time':
            print(k, '---> mean:', results[k].mean(), ' |  std:', results[k].std())

    return [ anon_training, anon_test, results['accuracy'].mean(), results['precision'].mean(), results['recall'].mean(), results['f1_score'].mean() ]

Read data

In [7]:
dataset = pd.read_csv('df_original_100000.csv')
# dataset = pd.DataFrame([[71, 29, 33, 1], [75, 19, 43, 1], [7, 9, 3, 1], [13, 21, 7, 0], [3, 2, 17, 1]])
# dataset.columns = ['A', 'B', 'C', 'Label']
# print(dataset)

y = np.array(dataset['Label'])
del dataset['Label']
X = np.array(dataset)

print(X.shape)

(100000, 79)


In [8]:
def get_results(model, X, y, model_name):    
    bol = [True, False]
    results = pd.DataFrame(columns=['anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score'])
    for i in range(0, 2):
        for j in range(0, 2):
            new_df = pd.DataFrame([cross_validate_k_fold(X, y, bol[i], bol[j], model, model_name)], columns=results.columns)
            results = pd.concat([results, new_df], ignore_index=True)

    for i in range(2, len(results.columns)):
        col_name = results.columns[i]
        results[col_name] = results[col_name].apply(lambda row:"{:.3%}".format(float(row)))

    return results

In [9]:
# cross_validate_k_fold_knn(X, y, True, True)
# cross_validate_k_fold_knn(X, y, True, False)
# cross_validate_k_fold_knn(X, y, False, True)
# cross_validate_k_fold_knn(X, y, False, False)

In [10]:
results_knn = get_results(KNeighborsClassifier(n_neighbors=5), X, y, 'KNN')
results_knn

KNN True True
accuracy ---> mean: 0.72027  |  std: 0.1685726789844665
precision ---> mean: 0.6719051312790343  |  std: 0.15030519390079422
recall ---> mean: 0.7942199999999999  |  std: 0.284635351985659
f1_score ---> mean: 0.7164787338493651  |  std: 0.21344639537405838
KNN True False
accuracy ---> mean: 0.5027699999999999  |  std: 0.1467384751862987
precision ---> mean: 0.5038311045100249  |  std: 0.20959451740066928
recall ---> mean: 0.35374  |  std: 0.23084692850458283
f1_score ---> mean: 0.39709734465394786  |  std: 0.1932024212230583
KNN False True
accuracy ---> mean: 0.5  |  std: 0.0
precision ---> mean: 0.5  |  std: 0.0
recall ---> mean: 1.0  |  std: 0.0
f1_score ---> mean: 0.6666666666666667  |  std: 1.1102230246251565e-16
KNN False False
accuracy ---> mean: 0.99999  |  std: 2.9999999999996697e-05
precision ---> mean: 1.0  |  std: 0.0
recall ---> mean: 0.9999800000000001  |  std: 5.9999999999993395e-05
f1_score ---> mean: 0.9999899989998999  |  std: 3.0003000300038437e-05


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,72.027%,67.191%,79.422%,71.648%
1,True,False,50.277%,50.383%,35.374%,39.710%
2,False,True,50.000%,50.000%,100.000%,66.667%
3,False,False,99.999%,100.000%,99.998%,99.999%


In [11]:
results_dtree = get_results(DecisionTreeClassifier(), X, y, 'Decision Tree')
results_dtree

Decision Tree True True
accuracy ---> mean: 0.4849400000000001  |  std: 0.26304513757148223
precision ---> mean: 0.48781332628620006  |  std: 0.2621864888285082
recall ---> mean: 0.48636000000000007  |  std: 0.33412887094652566
f1_score ---> mean: 0.46735035836948935  |  std: 0.27300980075287123
Decision Tree True False
accuracy ---> mean: 0.48617  |  std: 0.042026040736667075
precision ---> mean: 0.4919473529464374  |  std: 0.02442620344644373
recall ---> mean: 0.97202  |  std: 0.08394000000000003
f1_score ---> mean: 0.6530231758683092  |  std: 0.04116830963219659
Decision Tree False True
accuracy ---> mean: 0.63085  |  std: 0.18217473068458206
precision ---> mean: 0.585077407806742  |  std: 0.1385510700758563
recall ---> mean: 0.9143399999999999  |  std: 0.20921956027102248
f1_score ---> mean: 0.7102352940772563  |  std: 0.16048446998059496
Decision Tree False False
accuracy ---> mean: 0.99999  |  std: 2.9999999999996697e-05
precision ---> mean: 1.0  |  std: 0.0
recall ---> mean: 0.9

Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,48.494%,48.781%,48.636%,46.735%
1,True,False,48.617%,49.195%,97.202%,65.302%
2,False,True,63.085%,58.508%,91.434%,71.024%
3,False,False,99.999%,100.000%,99.998%,99.999%


In [12]:
results_rfc = get_results(RandomForestClassifier(n_estimators=100), X, y, 'Random Forest')
results_rfc

Random Forest True True
accuracy ---> mean: 0.6041400000000001  |  std: 0.17084727799997285
precision ---> mean: 0.5674091762824853  |  std: 0.15805496068857902
recall ---> mean: 0.53562  |  std: 0.347278095479689
f1_score ---> mean: 0.5274481995554255  |  std: 0.25237360972616935
Random Forest True False
accuracy ---> mean: 0.5  |  std: 0.0
precision ---> mean: 0.5  |  std: 0.0
recall ---> mean: 1.0  |  std: 0.0
f1_score ---> mean: 0.6666666666666667  |  std: 1.1102230246251565e-16
Random Forest False True
accuracy ---> mean: 0.5  |  std: 0.0
precision ---> mean: 0.5  |  std: 0.0
recall ---> mean: 1.0  |  std: 0.0
f1_score ---> mean: 0.6666666666666667  |  std: 1.1102230246251565e-16
Random Forest False False
accuracy ---> mean: 0.9999800000000001  |  std: 3.99999999999956e-05
precision ---> mean: 0.9999600079984002  |  std: 7.99840031993515e-05
recall ---> mean: 1.0  |  std: 0.0
f1_score ---> mean: 0.9999800019998  |  std: 3.999600039996665e-05


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,60.414%,56.741%,53.562%,52.745%
1,True,False,50.000%,50.000%,100.000%,66.667%
2,False,True,50.000%,50.000%,100.000%,66.667%
3,False,False,99.998%,99.996%,100.000%,99.998%


In [13]:
results_gnb = get_results(GaussianNB(var_smoothing=1e-02), X, y, 'GaussianNB')
results_gnb

GaussianNB True True
accuracy ---> mean: 0.5589299999999999  |  std: 0.20251609343457128
precision ---> mean: 0.5206977004604523  |  std: 0.16082838837122718
recall ---> mean: 0.73594  |  std: 0.325820564728502
f1_score ---> mean: 0.6009758394284055  |  std: 0.22412666462220265


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GaussianNB True False
accuracy ---> mean: 0.5331300000000001  |  std: 0.07885713728002051
precision ---> mean: 0.27359979089814246  |  std: 0.27884685685181915
recall ---> mean: 0.4648  |  std: 0.46909481344393483
f1_score ---> mean: 0.342259670395587  |  std: 0.3450584877596488
GaussianNB False True
accuracy ---> mean: 0.49992000000000003  |  std: 0.0002400000000000069
precision ---> mean: 0.4999599679743795  |  std: 0.00012009607686148672
recall ---> mean: 0.9998400000000001  |  std: 0.0004800000000000138
f1_score ---> mean: 0.6665955176093916  |  std: 0.00021344717182496533
GaussianNB False False
accuracy ---> mean: 0.64108  |  std: 0.0605414205317318
precision ---> mean: 0.9980593043818443  |  std: 0.0014991799227678214
recall ---> mean: 0.28264  |  std: 0.1210265028826331
f1_score ---> mean: 0.42908444626490505  |  std: 0.12186315645664125


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,55.893%,52.070%,73.594%,60.098%
1,True,False,53.313%,27.360%,46.480%,34.226%
2,False,True,49.992%,49.996%,99.984%,66.660%
3,False,False,64.108%,99.806%,28.264%,42.908%


In [14]:
results_knn.insert(0, "model", ['KNN', 'KNN', 'KNN', 'KNN'], True)
results_dtree.insert(0, "model", ['Decision Tree', 'Decision Tree', 'Decision Tree', 'Decision Tree'], True)
results_rfc.insert(0, "model", ['Random Forest', 'Random Forest', 'Random Forest', 'Random Forest'], True)
results_gnb.insert(0, "model", ['Gaussian NB', 'Gaussian NB', 'Gaussian NB', 'Gaussian NB'], True)


In [15]:
results = pd.DataFrame(columns=['model', 'anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score'])

results = pd.concat([results, results_knn], ignore_index=True)
results = pd.concat([results, results_dtree], ignore_index=True)
results = pd.concat([results, results_rfc], ignore_index=True)
results = pd.concat([results, results_gnb], ignore_index=True)

In [16]:
results


Unnamed: 0,model,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,KNN,True,True,72.027%,67.191%,79.422%,71.648%
1,KNN,True,False,50.277%,50.383%,35.374%,39.710%
2,KNN,False,True,50.000%,50.000%,100.000%,66.667%
3,KNN,False,False,99.999%,100.000%,99.998%,99.999%
4,Decision Tree,True,True,48.494%,48.781%,48.636%,46.735%
5,Decision Tree,True,False,48.617%,49.195%,97.202%,65.302%
6,Decision Tree,False,True,63.085%,58.508%,91.434%,71.024%
7,Decision Tree,False,False,99.999%,100.000%,99.998%,99.999%
8,Random Forest,True,True,60.414%,56.741%,53.562%,52.745%
9,Random Forest,True,False,50.000%,50.000%,100.000%,66.667%
