In [None]:
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import scipy.linalg as la
import matplotlib.pyplot as plt
from tqdm import tqdm

from joblib import Parallel, delayed
from sklearn import svm, model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin
from sklearn.cluster import KMeans

In [None]:
# Seed NumPy's global random generator so that the np.random.shuffle calls
# made during anonymization are repeatable on a fresh Restart & Run All.
np.random.seed(7)

Anonymization algorithm

In [None]:
def anonimization(data):
    """Anonymize ``data`` by scrambling the components of its principal axes.

    The rows are centered, projected onto the eigenvectors of their covariance
    matrix (ordered by decreasing eigenvalue) and then mapped back to the
    original feature space through a perturbed basis in which the components
    of every eigenvector have been randomly permuted. The column means are
    added back at the end, so the column means of the output match the input.

    Randomness comes from NumPy's global generator (``np.random.shuffle``);
    call ``np.random.seed`` beforehand for repeatable output.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric observations, one per row. The input is not modified.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features)
        The perturbed observations, in the same row order as ``data``.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("anonimization expects a 2-D array of shape (n_samples, n_features)")

    # The covariance of fewer than two rows is undefined (np.cov yields NaN and
    # eigh then raises). A lone row is its own mean, so its centered form is
    # all zeros and the reconstruction is the row itself: return it as is.
    if data.shape[0] < 2:
        return data.copy()

    # center the data on the column means
    mean = data.mean(axis=0)
    data_centered = data - mean

    # np.cov squeezes the single-feature case to a 0-d array; eigh needs 2-D
    cov_matrix = np.atleast_2d(np.cov(data_centered, rowvar=False))

    # eigen-decomposition, eigenvectors (columns) sorted by decreasing eigenvalue
    evals, evecs = la.eigh(cov_matrix)
    order = np.argsort(evals)[::-1]
    evecs = evecs[:, order]

    # project the centered data onto the principal axes
    data_transformed = np.dot(evecs.T, data_centered.T).T

    # each row of `shuffled` is one eigenvector; permute its components in place
    shuffled = evecs.copy().T
    for i in range(len(shuffled)):
        np.random.shuffle(shuffled[i])

    # map back to the original dimension through the perturbed basis
    data_original_dimension = np.dot(data_transformed, shuffled)
    data_original_dimension += mean

    return data_original_dimension

Clustering algorithms


In [None]:
def find_clusters(X, k):
    """Fit a K-Means model with ``k`` clusters on ``X``.

    Returns the fitted model's ``labels_`` attribute (the cluster label
    assigned to each row of ``X``).
    """
    return KMeans(n_clusters=k).fit(X).labels_

In [None]:
def anonimization_clustering(data, y, k):
    """Anonymize ``data`` one cluster at a time and reorder ``y`` to match.

    The rows are split into ``k`` clusters with ``find_clusters`` and each
    cluster is passed through ``anonimization`` on its own. The per-cluster
    results are stacked cluster after cluster: clusters in order of first
    appearance in the label array, rows in their original relative order
    inside a cluster. The output rows are therefore generally NOT in input
    order, and ``y`` is permuted the same way so rows and labels stay aligned.

    Parameters
    ----------
    data : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix; it is indexed with lists of row positions.
    y : numpy.ndarray of shape (n_samples,)
        Labels aligned with the rows of ``data``.
    k : int
        Number of clusters requested from ``find_clusters``.

    Returns
    -------
    (data_anonymized, y_in_new_order)
        The stacked anonymized rows and the correspondingly reordered labels.
        ``(None, None)`` when no cluster labels were produced.
    """
    cluster_labels = find_clusters(data, k)

    # Bucket the row positions by cluster label. Dict insertion order keeps
    # the clusters in order of first appearance.
    rows_by_cluster = dict()
    for position, label in enumerate(cluster_labels):
        rows_by_cluster.setdefault(label, []).append(position)

    if not rows_by_cluster:
        return None, None

    # Anonymize each cluster individually, then stack everything once instead
    # of re-copying the accumulated result on every iteration.
    anonymized_parts = []
    label_parts = []
    for rows in rows_by_cluster.values():
        anonymized_parts.append(anonimization(data[rows]))
        label_parts.append(y[rows])

    data_anonymized = np.concatenate(anonymized_parts, axis=0)
    y_in_new_order = np.concatenate(label_parts, axis=0)

    return data_anonymized, y_in_new_order

Cross Validation

In [None]:
def cross_validate_k_fold(X, y, anon_training, anon_test, model, model_name, n_clusters):
    """Evaluate ``model`` with 3-fold stratified cross-validation.

    For every fold the training and/or test split may first be anonymized
    with ``anonimization_clustering``. Both splits are then standardized with
    a ``StandardScaler`` fitted on the training split only, the model is
    fitted on the training split and scored on the test split.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix (indexed with the fold index arrays).
    y : numpy.ndarray of shape (n_samples,)
        Labels. NOTE(review): precision/recall/F1 are called with their
        default arguments, which presumably requires binary labels - confirm.
    anon_training : bool
        Anonymize the training split of every fold.
    anon_test : bool
        Anonymize the test split of every fold.
    model : estimator with ``fit`` and ``predict``
        Re-fitted on every fold.
    model_name : str
        Only used in the printed summary.
    n_clusters : int
        Number of clusters passed to ``anonimization_clustering``.

    Returns
    -------
    list
        ``[anon_training, anon_test, mean accuracy, mean precision,
        mean recall, mean f1]`` over the three folds.
    """
    kf = StratifiedKFold(n_splits=3)

    accuracy, precision, recall, f1 = [], [], [], []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if anon_training:
            X_train, y_train = anonimization_clustering(X_train, y_train, n_clusters)

        if anon_test:
            X_test, y_test = anonimization_clustering(X_test, y_test, n_clusters)

        # Learn the scaling from the training split only and apply it to both
        # splits. The previous code only called fit() (on train, then on test)
        # and never transform(), so the data reached the model unscaled.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    results = {'accuracy' : np.array(accuracy),
               'precision' : np.array(precision),
               'recall' : np.array(recall),
               'f1_score' : np.array(f1)}

    # Per-metric summary across the folds.
    print(model_name, anon_training, anon_test)
    for metric_name, values in results.items():
        print(metric_name, '---> mean:', values.mean(), ' |  std:', values.std())

    return [ anon_training, anon_test, results['accuracy'].mean(), results['precision'].mean(), results['recall'].mean(), results['f1_score'].mean() ]

Read data

In [None]:
# Load the data set: the 'Label' column is the target, every remaining column
# is a feature.
# (For a quick smoke test, a tiny hand-made DataFrame with a 'Label' column,
# e.g. columns ['A', 'B', 'C', 'Label'], can be substituted for the CSV.)
dataset = pd.read_csv('df_original_100000.csv')

# pop() removes the target column from the frame and hands it back
y = np.array(dataset.pop('Label'))
X = np.array(dataset)

print(X.shape)

In [None]:
def get_results(model, X, y, model_name, n_clusters):
    """Cross-validate ``model`` under all four anonymization settings.

    ``cross_validate_k_fold`` is run for every (anonymized train, anonymized
    test) combination in the order (True, True), (True, False),
    (False, True), (False, False).

    Parameters
    ----------
    model, X, y, model_name, n_clusters
        Forwarded unchanged to ``cross_validate_k_fold``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns 'anonymized train',
        'anonymized test', 'accuracy', 'precision', 'recall' and 'f1_score'.
        The four metric columns hold strings formatted as percentages with
        three decimals (``"{:.3%}"``), not floats.
    """
    columns = ['anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score']
    flags = [True, False]

    # Collect all rows first and build the frame once; growing a DataFrame
    # with pd.concat inside the loop re-copies it on every iteration.
    rows = [
        cross_validate_k_fold(X, y, anon_train, anon_test, model, model_name, n_clusters)
        for anon_train in flags
        for anon_test in flags
    ]
    results = pd.DataFrame(rows, columns=columns)

    # Render the metric columns (everything after the two flags) as percentages.
    for col_name in columns[2:]:
        results[col_name] = results[col_name].apply(lambda value: "{:.3%}".format(float(value)))

    return results

In [None]:
# cross_validate_k_fold_knn(X, y, True, True)
# cross_validate_k_fold_knn(X, y, True, False)
# cross_validate_k_fold_knn(X, y, False, True)
# cross_validate_k_fold_knn(X, y, False, False)

In [None]:
# K-nearest-neighbours (5 neighbours), anonymization with 3 clusters.
results_knn = get_results(KNeighborsClassifier(n_neighbors=5), X, y, 'KNN', 3)
results_knn

In [None]:
# Decision tree with default settings, anonymization with 3 clusters.
results_dtree = get_results(DecisionTreeClassifier(), X, y, 'Decision Tree', 3)
results_dtree

In [None]:
# Random forest (100 trees), anonymization with 3 clusters.
results_rfc = get_results(RandomForestClassifier(n_estimators=100), X, y, 'Random Forest', 3)
results_rfc

In [None]:
# Gaussian naive Bayes (var_smoothing=1e-02), anonymization with 3 clusters.
results_gnb = get_results(GaussianNB(var_smoothing=1e-02), X, y, 'GaussianNB', 3)
results_gnb

In [None]:
# Tag every per-model table with its model name in a leading "model" column.
# The guard keeps this cell safe to re-run: an unconditional insert with
# allow_duplicates=True adds one more "model" column on every execution,
# leaving the frames with duplicate column names.
for frame, label in (
    (results_knn, 'KNN'),
    (results_dtree, 'Decision Tree'),
    (results_rfc, 'Random Forest'),
    (results_gnb, 'Gaussian NB'),
):
    if "model" not in frame.columns:
        # a scalar value is broadcast to every row of the frame
        frame.insert(0, "model", label)


In [None]:
# Stack the four per-model tables into one summary table with a single concat
# (no empty seed frame, no repeated re-copying). reindex() pins the column
# order and, like the previous seed-frame approach, fills a column that is
# missing from the inputs with NaN instead of failing.
results = pd.concat(
    [results_knn, results_dtree, results_rfc, results_gnb],
    ignore_index=True,
).reindex(columns=['model', 'anonymized train', 'anonymized test' , 'accuracy', 'precision', 'recall', 'f1_score'])

In [None]:
# Display the combined per-model summary table.
results

In [None]:
# Sweep the number of anonymization clusters k = 3..104 and record, for every
# model, the accuracy of the first get_results row (anonymized train AND
# anonymized test), which is a percentage string.
# NOTE(review): KNN's n_neighbors is tied to the same k as the cluster count,
# so for KNN two things vary at once - confirm this is intended.
# NOTE(review): only row 0 of each get_results table is used; the other three
# train/test combinations are computed and discarded.
results_acuracy = {
    'KNN': [],
    'Decision Tree': [],
    'Random Forest': [],
    'GaussianNB': [],
}
x = []

for k in tqdm(range(3, 105)):
    knn_resul = get_results(KNeighborsClassifier(n_neighbors=k), X, y, 'KNN', k)
    dtree_resul = get_results(DecisionTreeClassifier(), X, y, 'Decision Tree', k)
    rfc_resul = get_results(RandomForestClassifier(n_estimators=100), X, y, 'Random Forest', k)
    gnb_resul = get_results(GaussianNB(var_smoothing=1e-02), X, y, 'GaussianNB', k)

    # row 0 -> (anonymized train, anonymized test); column 2 -> 'accuracy'
    results_acuracy['KNN'].append(knn_resul.iloc[0, 2])
    results_acuracy['Decision Tree'].append(dtree_resul.iloc[0, 2])
    results_acuracy['Random Forest'].append(rfc_resul.iloc[0, 2])
    results_acuracy['GaussianNB'].append(gnb_resul.iloc[0, 2])

    x.append(k)
    


In [2]:
# from joblib import Parallel, delayed

# def process(i):
#     return [i * i, i * i]
    
# results = Parallel(n_jobs=2)(delayed(process)(i) for i in range(10))
# print(results)  # prints [[0, 0], [1, 1], [4, 4], ..., [81, 81]]

[[0, 0], [1, 1], [4, 4], [9, 9], [16, 16], [25, 25], [36, 36], [49, 49], [64, 64], [81, 81]]


In [None]:
# Dump the raw sweep results: model name -> list of accuracy strings, one per k.
print(results_acuracy)

In [None]:
def plot_resul(results, x):
    """Plot one accuracy curve per model against ``x``.

    Parameters
    ----------
    results : dict
        Maps a model name ('KNN', 'GaussianNB', 'Decision Tree' or
        'Random Forest') to a sequence of strings. The last character of
        every string is dropped before the rest is parsed as a float
        (presumably the trailing '%' of a formatted percentage - confirm
        against the caller).
    x : sequence
        X-axis values, one per entry of each sequence in ``results``.

    The parsed values are printed per model and the figure is shown with
    ``plt.show()``; nothing is returned.
    """
    # fixed colour / marker per model so the curves are easy to tell apart
    color = {'KNN': 'chartreuse', 'GaussianNB': 'blue', 'Decision Tree': 'firebrick', 'Random Forest': 'orange'}
    marker = {'KNN': 'o', 'GaussianNB': '.', 'Decision Tree': '*', 'Random Forest': '^'}

    for key, raw_values in results.items():
        # strip the final character, then convert to a number
        y = [float(text[:-1]) for text in raw_values]
        print(key, y)
        plt.plot(x, y, color=color[key], marker=marker[key], label=key)

    plt.ylabel('Accuracy')
    plt.xlabel('K of K-Anonymity')
    plt.title('Results')
    plt.legend()
    plt.show()
    # plt.savefig('k_by_accuracy' + '.pdf')

# Plot the recorded accuracy of every model against the swept k values.
plot_resul(results_acuracy, x)