In [64]:
import pandas as pd
import numpy as np

In [65]:
def mondrian_k_anonymity(df, k, sensitive_attribute):
    # Sort the dataframe by the sensitive attribute
    df = df.sort_values(by=sensitive_attribute)

    # Calculate the number of records in each partition
    partition_size = len(df) // k

    # Initialize a list to store the anonymized partitions
    partitions = []

    # Split the dataset into partitions
    for i in range(0, len(df), partition_size):
        partition = df.iloc[i:i+partition_size]
        partitions.append(partition)

    # Generalize the quasi-identifiers within each partition
    for partition in partitions:
        for column in df.columns:
            if column != sensitive_attribute:
                # Generalization strategy
                if df[column].dtype == 'int64':
                    partition[column] = round(partition[column].mean())
                else:
                    # For categorical attributes, select the most common value
                    mode_value = partition[column].mode().iloc[0]
                    partition[column] = mode_value

    return partitions


In [66]:
df = pd.read_csv('Datasets/darwin.csv')
df.head()

Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
0,id_1,5160,1.3e-05,120.804174,86.853334,957,6601,0.3618,0.217459,103.828754,...,0.141434,0.024471,5.596487,3.184589,71,40120,1749.278166,296102.7676,144605,P
1,id_2,51980,1.6e-05,115.318238,83.448681,1694,6998,0.272513,0.14488,99.383459,...,0.049663,0.018368,1.665973,0.950249,129,126700,1504.768272,278744.285,298640,P
2,id_3,2600,1e-05,229.933997,172.761858,2333,5802,0.38702,0.181342,201.347928,...,0.178194,0.017174,4.000781,2.392521,74,45480,1431.443492,144411.7055,79025,P
3,id_4,2130,1e-05,369.403342,183.193104,1756,8159,0.556879,0.164502,276.298223,...,0.113905,0.01986,4.206746,1.613522,123,67945,1465.843329,230184.7154,181220,P
4,id_5,2310,7e-06,257.997131,111.275889,987,4732,0.266077,0.145104,184.63651,...,0.121782,0.020872,3.319036,1.680629,92,37285,1841.702561,158290.0255,72575,P


In [67]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_data(dataframe):
    encoded_df = dataframe.copy()
    label_encoders = {}

    for column in encoded_df.columns:
        if encoded_df[column].dtype == 'object':
            label_encoder = LabelEncoder()
            encoded_df[column] = label_encoder.fit_transform(encoded_df[column])
            label_encoders[column] = label_encoder

    return encoded_df, label_encoders

encoded_df, encoders = encode_categorical_data(df)

In [68]:
from sklearn.model_selection import train_test_split

features_train,features_test,labels_train,labels_test = train_test_split(
    encoded_df,np.ones(len(encoded_df)),random_state=17,test_size=0.2
)
features_train.head()

Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
80,154,3490,1e-05,274.903867,250.472402,1073,7853,0.322473,0.227919,262.688134,...,0.205982,0.018896,4.806361,3.335828,60,35920,1929.296214,94373.02807,59175,1
127,32,1870,6e-06,595.572014,498.890817,2316,8467,1.043333,0.313464,547.231416,...,0.133741,0.020917,3.709496,2.166313,124,38330,1739.271067,264041.3716,77215,0
81,155,3235,8e-06,285.310405,306.741968,2710,5757,0.550117,0.237772,296.026187,...,0.205982,0.018896,4.806361,3.335828,60,35920,1929.296214,94373.02807,59175,1
38,107,2635,7e-06,261.166647,147.334293,2099,3936,0.620047,0.149459,204.25047,...,0.09191,0.022776,4.458446,3.211986,93,27320,1491.778917,235001.2465,50155,1
139,46,2975,9e-06,396.225929,153.49731,1628,7466,0.60264,0.148833,274.861619,...,0.375078,0.022148,8.964604,5.292562,68,27560,1718.044086,142453.2145,37840,0


In [69]:
features_test.head()

Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
124,29,5120,8e-06,444.560149,206.012058,1883,7998,2.311424,0.207807,325.286103,...,0.212966,0.019805,5.226698,2.923295,74,34915,1532.649434,165058.1907,59585,0
151,59,6690,1.2e-05,194.082327,245.322474,2289,9325,0.446294,0.180476,219.702401,...,0.142036,0.02057,4.583994,3.178104,79,38095,1775.257777,192209.318,68875,0
85,159,26050,2e-06,37.102751,113.153186,1319,561,0.180408,0.11772,75.127969,...,0.205982,0.018896,4.806361,3.335828,60,35920,1929.296214,94373.02807,59175,1
167,76,2039,1.3e-05,323.101817,206.158451,2703,10615,0.232597,0.161851,264.630134,...,0.16179,0.025171,6.093337,3.149393,108,35380,1706.291549,202363.7529,66470,0
122,27,900,7e-06,668.842045,279.907566,2690,6963,0.460393,0.182907,474.374806,...,0.140552,0.020247,4.012503,2.442364,58,33475,1866.19761,177863.2486,55795,0


In [70]:
k = 5
sensitive_attribute = df.columns[-1]

partitioned_data = mondrian_k_anonymity(encoded_df, k, sensitive_attribute)

for idx, partition in enumerate(partitioned_data):
    print(f"Partition {idx + 1} (Sensitive Value: {partition[sensitive_attribute].iloc[0]}):")
    print(partition,end='\n\n')

anonymized_df = pd.concat(partitioned_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partition[column] = mode_value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partition[column] = round(partition[column].mean())


Partition 1 (Sensitive Value: 0):
     ID  air_time1  disp_index1  gmrt_in_air1  gmrt_on_paper1  \
173   2       2479     0.000011    137.550578       97.210977   
111   2       2479     0.000011    137.550578       97.210977   
112   2       2479     0.000011    137.550578       97.210977   
113   2       2479     0.000011    137.550578       97.210977   
114   2       2479     0.000011    137.550578       97.210977   
115   2       2479     0.000011    137.550578       97.210977   
116   2       2479     0.000011    137.550578       97.210977   
117   2       2479     0.000011    137.550578       97.210977   
118   2       2479     0.000011    137.550578       97.210977   
119   2       2479     0.000011    137.550578       97.210977   
120   2       2479     0.000011    137.550578       97.210977   
121   2       2479     0.000011    137.550578       97.210977   
122   2       2479     0.000011    137.550578       97.210977   
123   2       2479     0.000011    137.550578       97.2

In [71]:
anonymized_df.head()

Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
173,2,2479,1.1e-05,137.550578,97.210977,1870,7351,0.191476,0.118069,142.936372,...,0.041426,0.01608,1.724092,1.420393,79,35497,1245.312029,84093.12944,92147,0
111,2,2479,1.1e-05,137.550578,97.210977,1870,7351,0.191476,0.118069,142.936372,...,0.041426,0.01608,1.724092,1.420393,79,35497,1245.312029,84093.12944,92147,0
112,2,2479,1.1e-05,137.550578,97.210977,1870,7351,0.191476,0.118069,142.936372,...,0.041426,0.01608,1.724092,1.420393,79,35497,1245.312029,84093.12944,92147,0
113,2,2479,1.1e-05,137.550578,97.210977,1870,7351,0.191476,0.118069,142.936372,...,0.041426,0.01608,1.724092,1.420393,79,35497,1245.312029,84093.12944,92147,0
114,2,2479,1.1e-05,137.550578,97.210977,1870,7351,0.191476,0.118069,142.936372,...,0.041426,0.01608,1.724092,1.420393,79,35497,1245.312029,84093.12944,92147,0


In [72]:
label = encoded_df.iloc[:,-1]
encoded_df = encoded_df.iloc[:,:-1]
encoded_df.head()

Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,...,mean_gmrt25,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25
0,0,5160,1.3e-05,120.804174,86.853334,957,6601,0.3618,0.217459,103.828754,...,249.729085,0.141434,0.024471,5.596487,3.184589,71,40120,1749.278166,296102.7676,144605
1,86,51980,1.6e-05,115.318238,83.448681,1694,6998,0.272513,0.14488,99.383459,...,77.258394,0.049663,0.018368,1.665973,0.950249,129,126700,1504.768272,278744.285,298640
2,97,2600,1e-05,229.933997,172.761858,2333,5802,0.38702,0.181342,201.347928,...,193.667018,0.178194,0.017174,4.000781,2.392521,74,45480,1431.443492,144411.7055,79025
3,108,2130,1e-05,369.403342,183.193104,1756,8159,0.556879,0.164502,276.298223,...,163.065803,0.113905,0.01986,4.206746,1.613522,123,67945,1465.843329,230184.7154,181220
4,119,2310,7e-06,257.997131,111.275889,987,4732,0.266077,0.145104,184.63651,...,147.094679,0.121782,0.020872,3.319036,1.680629,92,37285,1841.702561,158290.0255,72575


In [73]:
label.head()

0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int32

In [74]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
model.fit(encoded_df,label)

In [75]:
test_feature = features_test.iloc[:,:-1]
test_label = features_test.iloc[:,-1]
test_feature.head()

Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,...,mean_gmrt25,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25
124,29,5120,8e-06,444.560149,206.012058,1883,7998,2.311424,0.207807,325.286103,...,251.18636,0.212966,0.019805,5.226698,2.923295,74,34915,1532.649434,165058.1907,59585
151,59,6690,1.2e-05,194.082327,245.322474,2289,9325,0.446294,0.180476,219.702401,...,227.042337,0.142036,0.02057,4.583994,3.178104,79,38095,1775.257777,192209.318,68875
85,159,26050,2e-06,37.102751,113.153186,1319,561,0.180408,0.11772,75.127969,...,264.310776,0.205982,0.018896,4.806361,3.335828,60,35920,1929.296214,94373.02807,59175
167,76,2039,1.3e-05,323.101817,206.158451,2703,10615,0.232597,0.161851,264.630134,...,259.406572,0.16179,0.025171,6.093337,3.149393,108,35380,1706.291549,202363.7529,66470
122,27,900,7e-06,668.842045,279.907566,2690,6963,0.460393,0.182907,474.374806,...,186.790329,0.140552,0.020247,4.012503,2.442364,58,33475,1866.19761,177863.2486,55795


In [76]:
test_label.head()

124    0
151    0
85     1
167    0
122    0
Name: class, dtype: int32

In [77]:
pred = model.predict(test_feature)
pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1])

In [78]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=test_label,y_pred=pred)
accuracy

0.8285714285714286