In [1]:
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import scipy.linalg as la
import matplotlib.pyplot as plt
from tqdm import tqdm

from joblib import Parallel, delayed
from sklearn import svm, model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin
from sklearn.cluster import KMeans

In [2]:
np.random.seed(7)

Anonimization algorithm

In [3]:
def anonimization(data):
    #calculate the mean of each column
    mean = np.array(np.mean(data, axis=0).T)

    # center data
    data_centered = data - mean

    # calculate the covariance matrix
    cov_matrix = np.cov(data_centered, rowvar=False)
   
    # calculate the eignvalues and eignvectors
    evals, evecs = la.eigh(cov_matrix)

    # sort them
    idx = np.argsort(evals)[::-1]

    # Each columns of this matrix is an eingvector
    evecs = evecs[:,idx]
    evals = evals[idx]

    # explained variance
    variance_retained=np.cumsum(evals)/np.sum(evals)

    # calculate the transformed data
    data_transformed=np.dot(evecs.T, data_centered.T).T

    # randomize eignvectors
    new_evecs = evecs.copy().T
    for i in range(len(new_evecs)):
        np.random.shuffle(new_evecs[i])
    new_evecs = np.array(new_evecs).T

    # go back to the original dimension
    data_original_dimension = np.dot(data_transformed, new_evecs.T) 
    data_original_dimension += mean

    return data_original_dimension

Clusterization Algorithms


In [4]:
def find_clusters(X, k):   
    Kmean = KMeans(n_clusters=k)
    Kmean.fit(X)
    return Kmean.labels_

In [5]:
def anonimization_clustering(data, y, k):
    # generate K data clusters
    clusters = find_clusters(data, k)

    # bucketize the index of each cluster
    indices = dict()
    for i in range(len(clusters)):
        if clusters[i] not in indices.keys():
            indices[ clusters[i] ] = []    
        indices[ clusters[i] ].append(i)

    data_anonymized, y_in_new_order = None, None

    # anonymize each cluster individually
    for k in indices.keys():
        if data_anonymized is None and y_in_new_order is None:
            data_anonymized = anonimization(data[ indices[k] ])
            y_in_new_order = y[ indices[k] ]
            empty_flag = False
        else:
            data_anonymized = np.concatenate((data_anonymized, anonimization(data[ indices[k] ]) ), axis=0)
            y_in_new_order = np.concatenate((y_in_new_order, y[ indices[k] ]), axis=0)

    # data_anonymized = np.concatenate((data_anonymized, np.array([y_in_new_order]).T), axis=1)
    # print(pd.DataFrame(data_anonymized))

    return data_anonymized, y_in_new_order

Cross Validation

In [6]:
def cross_validate_k_fold(X, y, anon_training, anon_test, model, model_name, n_clusters):
    kf = StratifiedKFold(n_splits=3)
    scaler = StandardScaler()

    accuracy, precision, recall, f1 = [], [], [], []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if anon_training == True:
            X_train, y_train = anonimization_clustering(X_train, y_train, n_clusters)

        if anon_test == True:
            X_test, y_test = anonimization_clustering(X_test, y_test, n_clusters)

        scaler.fit(X_train)
        scaler.fit(X_test)
            
        model.fit(X_train,y_train)

        y_pred = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    results = {'accuracy' : np.array(accuracy), 
           'precision' : np.array(precision),
           'recall' : np.array(recall), 
           'f1_score' : np.array(f1)}

    print(model_name, anon_training, anon_test)
    for k in results.keys():
        if k != 'fit_time' and k != 'score_time':
            print(k, '---> mean:', results[k].mean(), ' |  std:', results[k].std())

    return [ anon_training, anon_test, results['accuracy'].mean(), results['precision'].mean(), results['recall'].mean(), results['f1_score'].mean() ]

Read data

In [7]:
dataset = pd.read_csv('df_original_100000.csv')
# dataset = pd.DataFrame([[71, 29, 33, 1], [75, 19, 43, 1], [7, 9, 3, 1], [13, 21, 7, 0], [3, 2, 17, 1]])
# dataset.columns = ['A', 'B', 'C', 'Label']
# print(dataset)
# dataset = anonimization(dataset)

# print(dataset)

y = np.array(dataset['Label'])
del dataset['Label']
X = np.array(dataset)

print(X.shape)

(100000, 79)


In [8]:
def get_results(model, X, y, model_name, n_clusters):
    bol = [True, False]
    results = pd.DataFrame(columns=['anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score'])
    for i in range(0, 2):
        for j in range(0, 2):
            new_df = pd.DataFrame([cross_validate_k_fold(X, y, bol[i], bol[j], model, model_name, n_clusters)], columns=results.columns)
            results = pd.concat([results, new_df], ignore_index=True)

    for i in range(2, len(results.columns)):
        col_name = results.columns[i]
        results[col_name] = results[col_name].apply(lambda row:"{:.3%}".format(float(row)))

    return results

In [9]:
# cross_validate_k_fold_knn(X, y, True, True)
# cross_validate_k_fold_knn(X, y, True, False)
# cross_validate_k_fold_knn(X, y, False, True)
# cross_validate_k_fold_knn(X, y, False, False)

In [10]:
results_knn = get_results(KNeighborsClassifier(n_neighbors=5), X, y, 'KNN', 3)
results_knn



KNN True True
accuracy ---> mean: 0.7416797390509454  |  std: 0.04459695912898772
precision ---> mean: 0.6749383140961692  |  std: 0.04241488366069623
recall ---> mean: 0.9450402207604475  |  std: 0.009540616579889616
f1_score ---> mean: 0.7864925648321469  |  std: 0.027087184013988828




KNN True False
accuracy ---> mean: 0.7755500726543835  |  std: 0.005153314511514292
precision ---> mean: 0.7064881453272092  |  std: 0.0035046500687878693
recall ---> mean: 0.9427600959561272  |  std: 0.006326352218465132
f1_score ---> mean: 0.8076992866370057  |  std: 0.004610872502751813




KNN False True
accuracy ---> mean: 0.7873198627588364  |  std: 0.009784493659131208
precision ---> mean: 0.7115239572831635  |  std: 0.002937176879201799
recall ---> mean: 0.9662997175673911  |  std: 0.023444953077220738
f1_score ---> mean: 0.8194873262664583  |  std: 0.010306664199726185
KNN False False
accuracy ---> mean: 0.999990000199996  |  std: 1.4141852786687718e-05
precision ---> mean: 1.0  |  std: 0.0
recall ---> mean: 0.9999800003999919  |  std: 2.8283705573375435e-05
f1_score ---> mean: 0.999989999899999  |  std: 1.4142277046499948e-05


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,74.168%,67.494%,94.504%,78.649%
1,True,False,77.555%,70.649%,94.276%,80.770%
2,False,True,78.732%,71.152%,96.630%,81.949%
3,False,False,99.999%,100.000%,99.998%,99.999%


In [11]:
results_dtree = get_results(DecisionTreeClassifier(), X, y, 'Decision Tree', 3)
results_dtree



Decision Tree True True
accuracy ---> mean: 0.619630790416022  |  std: 0.16317337657250375
precision ---> mean: 0.6272677695430229  |  std: 0.12880562637224313
recall ---> mean: 0.6984644442476565  |  std: 0.32171218771837923
f1_score ---> mean: 0.6135057450384848  |  std: 0.20973543117445076




Decision Tree True False
accuracy ---> mean: 0.5898915712022662  |  std: 0.18397478692102595
precision ---> mean: 0.6082751294474873  |  std: 0.18861008674495922
recall ---> mean: 0.8312396394577809  |  std: 0.11293060402905283
f1_score ---> mean: 0.6842094252148532  |  std: 0.11964972974416459




Decision Tree False True
accuracy ---> mean: 0.7325398147483604  |  std: 0.047288974852970185
precision ---> mean: 0.6705655230549151  |  std: 0.04366560731674457
recall ---> mean: 0.9267601351441109  |  std: 0.020169644366720953
f1_score ---> mean: 0.7771515769631577  |  std: 0.030258940665762495
Decision Tree False False
accuracy ---> mean: 1.0  |  std: 0.0
precision ---> mean: 1.0  |  std: 0.0
recall ---> mean: 1.0  |  std: 0.0
f1_score ---> mean: 1.0  |  std: 0.0


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,61.963%,62.727%,69.846%,61.351%
1,True,False,58.989%,60.828%,83.124%,68.421%
2,False,True,73.254%,67.057%,92.676%,77.715%
3,False,False,100.000%,100.000%,100.000%,100.000%


In [12]:
results_rfc = get_results(RandomForestClassifier(n_estimators=100), X, y, 'Random Forest', 3)
results_rfc



Random Forest True True
accuracy ---> mean: 0.6430185127434763  |  std: 0.14804656901924326
precision ---> mean: 0.6602499060702441  |  std: 0.14897929577074537
recall ---> mean: 0.6926964892839429  |  std: 0.19002080563468216
f1_score ---> mean: 0.6568998438862126  |  std: 0.12465225706011512




Random Forest True False
accuracy ---> mean: 0.7511692699575342  |  std: 0.055866354217251564
precision ---> mean: 0.6929446990948156  |  std: 0.056999813384306416
recall ---> mean: 0.9207207935524476  |  std: 0.04476353196372176
f1_score ---> mean: 0.7886900809960636  |  std: 0.03871791602665235




Random Forest False True
accuracy ---> mean: 0.5089598220035719  |  std: 0.012588399148174341
precision ---> mean: 0.5046025486454638  |  std: 0.006467696081929061
recall ---> mean: 0.999859994399776  |  std: 0.00019799781864499082
f1_score ---> mean: 0.6706897070051272  |  std: 0.005719609473246974
Random Forest False False
accuracy ---> mean: 0.999980000099995  |  std: 1.4142064921366732e-05
precision ---> mean: 0.9999600031997439  |  std: 2.828200868679184e-05
recall ---> mean: 1.0  |  std: 0.0
f1_score ---> mean: 0.9999800009999499  |  std: 1.4141428552310986e-05


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,64.302%,66.025%,69.270%,65.690%
1,True,False,75.117%,69.294%,92.072%,78.869%
2,False,True,50.896%,50.460%,99.986%,67.069%
3,False,False,99.998%,99.996%,100.000%,99.998%


In [13]:
results_gnb = get_results(GaussianNB(var_smoothing=1e-02), X, y, 'GaussianNB', 3)
results_gnb



GaussianNB True True
accuracy ---> mean: 0.7256811000341359  |  std: 0.07779735787003975
precision ---> mean: 0.8035151539935632  |  std: 0.1328112691748339
recall ---> mean: 0.7112848358657452  |  std: 0.33792691272486214
f1_score ---> mean: 0.6675951465557536  |  std: 0.2048959257450276




GaussianNB True False
accuracy ---> mean: 0.7334096512501693  |  std: 0.05701233565499425
precision ---> mean: 0.8028134512345781  |  std: 0.1382187977368999
recall ---> mean: 0.7292515460143215  |  std: 0.298944788696969
f1_score ---> mean: 0.6936860026302316  |  std: 0.1588831098588801




GaussianNB False True
accuracy ---> mean: 0.6410802759254568  |  std: 0.019569893578705
precision ---> mean: 0.9982426695128609  |  std: 0.0004989224914812132
recall ---> mean: 0.2826404782356771  |  std: 0.039093089002359124
f1_score ---> mean: 0.4390730414411534  |  std: 0.048593635632050054
GaussianNB False False
accuracy ---> mean: 0.6410802759254568  |  std: 0.019569893578705
precision ---> mean: 0.9982426695128609  |  std: 0.0004989224914812132
recall ---> mean: 0.2826404782356771  |  std: 0.039093089002359124
f1_score ---> mean: 0.4390730414411534  |  std: 0.048593635632050054


Unnamed: 0,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,True,True,72.568%,80.352%,71.128%,66.760%
1,True,False,73.341%,80.281%,72.925%,69.369%
2,False,True,64.108%,99.824%,28.264%,43.907%
3,False,False,64.108%,99.824%,28.264%,43.907%


In [14]:
results_knn.insert(0, "model", ['KNN', 'KNN', 'KNN', 'KNN'], True)
results_dtree.insert(0, "model", ['Decision Tree', 'Decision Tree', 'Decision Tree', 'Decision Tree'], True)
results_rfc.insert(0, "model", ['Random Forest', 'Random Forest', 'Random Forest', 'Random Forest'], True)
results_gnb.insert(0, "model", ['Gaussian NB', 'Gaussian NB', 'Gaussian NB', 'Gaussian NB'], True)


In [15]:
results = pd.DataFrame(columns=['model', 'anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score'])

results = pd.concat([results, results_knn], ignore_index=True)
results = pd.concat([results, results_dtree], ignore_index=True)
results = pd.concat([results, results_rfc], ignore_index=True)
results = pd.concat([results, results_gnb], ignore_index=True)

In [16]:
results

Unnamed: 0,model,anonymized train,anonymized test,accuracy,precision,recall,f1_score
0,KNN,True,True,74.168%,67.494%,94.504%,78.649%
1,KNN,True,False,77.555%,70.649%,94.276%,80.770%
2,KNN,False,True,78.732%,71.152%,96.630%,81.949%
3,KNN,False,False,99.999%,100.000%,99.998%,99.999%
4,Decision Tree,True,True,61.963%,62.727%,69.846%,61.351%
5,Decision Tree,True,False,58.989%,60.828%,83.124%,68.421%
6,Decision Tree,False,True,73.254%,67.057%,92.676%,77.715%
7,Decision Tree,False,False,100.000%,100.000%,100.000%,100.000%
8,Random Forest,True,True,64.302%,66.025%,69.270%,65.690%
9,Random Forest,True,False,75.117%,69.294%,92.072%,78.869%


In [17]:
results_acuracy = dict()
x = []



for k in tqdm(range(3, 105, 1)):
    knn_resul = get_results(KNeighborsClassifier(n_neighbors=k), X, y, 'KNN', k)
    dtree_resul = get_results(DecisionTreeClassifier(), X, y, 'Decision Tree', k)
    rfc_resul = get_results(RandomForestClassifier(n_estimators=100), X, y, 'Random Forest', k)
    gnb_resul = get_results(GaussianNB(var_smoothing=1e-02), X, y, 'GaussianNB', k)

    if 'KNN' not in results_acuracy.keys():
        results_acuracy['KNN'] = []
    if 'Decision Tree' not in results_acuracy.keys():
        results_acuracy['Decision Tree'] = []
    if 'Random Forest' not in results_acuracy.keys():
        results_acuracy['Random Forest'] = []
    if 'GaussianNB' not in results_acuracy.keys():
        results_acuracy['GaussianNB'] = []
    
    results_acuracy['KNN'].append(knn_resul.iloc[0, 2])
    results_acuracy['Decision Tree'].append(dtree_resul.iloc[0, 2])
    results_acuracy['Random Forest'].append(rfc_resul.iloc[0, 2])
    results_acuracy['GaussianNB'].append(gnb_resul.iloc[0, 2])

    x.append(k)
    




KNN True True
accuracy ---> mean: 0.8409807272609235  |  std: 0.09369668454341967
precision ---> mean: 0.8037623547331729  |  std: 0.13853657249166695
recall ---> mean: 0.9439001427579753  |  std: 0.007365476790462604
f1_score ---> mean: 0.8622741405095319  |  std: 0.07812684778536273




KNN True False
accuracy ---> mean: 0.8157404823583244  |  std: 0.11433197686565447
precision ---> mean: 0.777085797435184  |  std: 0.1599861386751312
recall ---> mean: 0.9587607575821603  |  std: 0.02748662016071933
f1_score ---> mean: 0.8485053130427409  |  std: 0.08863324088117874




KNN False True
accuracy ---> mean: 0.7875698622588914  |  std: 0.009844615588334384
precision ---> mean: 0.7116294551146725  |  std: 0.002968591215028512
recall ---> mean: 0.9667997255679511  |  std: 0.023548850722078244
f1_score ---> mean: 0.8197366205097648  |  std: 0.010359432097865233


  0%|          | 0/102 [00:58<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# from joblib import Parallel, delayed

# def process(i):
#     return [i * i, i * i]
    
# results = Parallel(n_jobs=2)(delayed(process)(i) for i in range(10))
# print(results)  # prints [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [None]:
print(results_acuracy)

In [None]:
def plot_resul(results, x):
    # color = ['chartreuse', 'orange', 'firebrick', 'blue']
    color = {'KNN': 'chartreuse', 'GaussianNB': 'blue', 'Decision Tree': 'firebrick', 'Random Forest': 'orange'}
    # marker = ['^', '*', '.', 'o']
    marker = {'KNN': 'o', 'GaussianNB': '.', 'Decision Tree': '*', 'Random Forest': '^'}

    for key in results.keys():
        y = []
        for i in range(len(results[key])):
            y.append(float(results[key][i][0:len(results[key][i])-1]))
        print(key, y)
        plt.plot(x, y, color=color[key], marker=marker[key], label=key)

    plt.ylabel('Accuracy')
    plt.xlabel( 'K of K-Anonymity')
    plt.title('Results')
    plt.legend()
    plt.show()
    # plt.savefig('k_by_accuracy' + '.pdf')

plot_resul(results_acuracy, x)