In [345]:
import pandas as pd
import numpy as np

In [346]:
def mondrian_k_anonymity(df, k, sensitive_attribute):
    # Sort the dataframe by the sensitive attribute
    df = df.sort_values(by=sensitive_attribute)

    # Calculate the number of records in each partition
    partition_size = len(df) // k

    # Initialize a list to store the anonymized partitions
    partitions = []

    # Split the dataset into partitions
    for i in range(0, len(df), partition_size):
        partition = df.iloc[i:i+partition_size]
        partitions.append(partition)

    # Generalize the quasi-identifiers within each partition
    for partition in partitions:
        for column in df.columns:
            if column != sensitive_attribute:
                # Generalization strategy
                if df[column].dtype == 'int64':
                    partition[column] = round(partition[column].mean())
                else:
                    # For categorical attributes, select the most common value
                    mode_value = partition[column].mode().iloc[0]
                    partition[column] = mode_value

    return partitions


In [347]:
df = pd.read_csv('Datasets/ObesityDataSet_raw_and_data_sinthetic.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [348]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_data(dataframe):
    encoded_df = dataframe.copy()
    label_encoders = {}

    for column in encoded_df.columns:
        if encoded_df[column].dtype == 'object':
            label_encoder = LabelEncoder()
            encoded_df[column] = label_encoder.fit_transform(encoded_df[column])
            label_encoders[column] = label_encoder

    return encoded_df, label_encoders

encoded_df, encoders = encode_categorical_data(df)

In [349]:
from sklearn.model_selection import train_test_split

features_train,features_test,labels_train,labels_test = train_test_split(
    encoded_df,np.ones(len(encoded_df)),random_state=17,test_size=0.2
)
features_train.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
1685,1,25.015173,1.788239,115.382519,1,1,1.735664,3.0,2,0,2.041536,0,1.392406,0.39174,2,3,3
1790,1,23.147644,1.815514,120.337664,1,1,2.996717,2.791366,2,0,2.626309,0,1.194898,0.034897,2,3,3
1102,1,17.894784,1.731389,84.064875,1,0,2.019674,2.843319,2,0,2.832004,0,1.0,0.608607,2,3,6
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,1
1351,0,39.213399,1.586301,80.0,1,1,2.020502,1.237454,2,0,1.93142,0,1.967973,0.0,2,0,2


In [350]:
features_test.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
885,0,18.871917,1.755254,80.0,1,1,2.0,1.095223,2,0,2.474132,0,2.876696,0.7939,3,3,5
712,0,19.054938,1.585886,42.541794,0,0,2.910345,3.0,1,0,1.0,1,1.461005,0.0,2,3,0
62,1,22.0,1.67,62.0,0,1,2.0,1.0,3,0,2.0,0,0.0,0.0,2,3,1
1208,0,45.821267,1.687326,80.413997,1,1,2.076689,3.0,2,0,1.026729,0,0.647798,0.0,3,0,6
690,1,17.521754,1.757958,52.09432,0,1,2.21498,2.64155,2,0,2.121251,0,0.998391,0.85882,2,3,0


In [351]:
k = 5
sensitive_attribute = df.columns[-1]

partitioned_data = mondrian_k_anonymity(encoded_df, k, sensitive_attribute)

for idx, partition in enumerate(partitioned_data):
    print(f"Partition {idx + 1} (Sensitive Value: {partition[sensitive_attribute].iloc[0]}):")
    print(partition,end='\n\n')

anonymized_df = pd.concat(partitioned_data)

Partition 1 (Sensitive Value: 0):
     Gender   Age  Height  Weight  family_history_with_overweight  FAVC  FCVC  \
529       0  18.0     1.7    50.0                               0     1   3.0   
582       0  18.0     1.7    50.0                               0     1   3.0   
581       0  18.0     1.7    50.0                               0     1   3.0   
580       0  18.0     1.7    50.0                               0     1   3.0   
579       0  18.0     1.7    50.0                               0     1   3.0   
..      ...   ...     ...     ...                             ...   ...   ...   
208       0  18.0     1.7    50.0                               0     1   3.0   
207       0  18.0     1.7    50.0                               0     1   3.0   
204       0  18.0     1.7    50.0                               0     1   3.0   
199       0  18.0     1.7    50.0                               0     1   3.0   
196       0  18.0     1.7    50.0                               0     1   3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partition[column] = mode_value


In [352]:
anonymized_df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
529,0,18.0,1.7,50.0,0,1,3.0,3.0,2,0,2.0,0,2.0,1.0,2,3,0
582,0,18.0,1.7,50.0,0,1,3.0,3.0,2,0,2.0,0,2.0,1.0,2,3,0
581,0,18.0,1.7,50.0,0,1,3.0,3.0,2,0,2.0,0,2.0,1.0,2,3,0
580,0,18.0,1.7,50.0,0,1,3.0,3.0,2,0,2.0,0,2.0,1.0,2,3,0
579,0,18.0,1.7,50.0,0,1,3.0,3.0,2,0,2.0,0,2.0,1.0,2,3,0


In [353]:
label = encoded_df.iloc[:,-1]
encoded_df = encoded_df.iloc[:,:-1]
encoded_df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3
1,0,21.0,1.52,56.0,1,0,3.0,3.0,2,1,3.0,1,3.0,0.0,2,3
2,1,23.0,1.8,77.0,1,0,2.0,3.0,2,0,2.0,0,2.0,1.0,1,3
3,1,27.0,1.8,87.0,0,0,3.0,3.0,2,0,2.0,0,2.0,0.0,1,4
4,1,22.0,1.78,89.8,0,0,2.0,1.0,2,0,2.0,0,0.0,0.0,2,3


In [354]:
label.head()

0    1
1    1
2    1
3    5
4    6
Name: NObeyesdad, dtype: int32

In [355]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)
model.fit(encoded_df,label)

In [356]:
test_feature = features_test.iloc[:,:-1]
test_label = features_test.iloc[:,-1]
test_feature.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
885,0,18.871917,1.755254,80.0,1,1,2.0,1.095223,2,0,2.474132,0,2.876696,0.7939,3,3
712,0,19.054938,1.585886,42.541794,0,0,2.910345,3.0,1,0,1.0,1,1.461005,0.0,2,3
62,1,22.0,1.67,62.0,0,1,2.0,1.0,3,0,2.0,0,0.0,0.0,2,3
1208,0,45.821267,1.687326,80.413997,1,1,2.076689,3.0,2,0,1.026729,0,0.647798,0.0,3,0
690,1,17.521754,1.757958,52.09432,0,1,2.21498,2.64155,2,0,2.121251,0,0.998391,0.85882,2,3


In [357]:
test_label.head()

885     5
712     0
62      1
1208    6
690     0
Name: NObeyesdad, dtype: int32

In [358]:
pred = model.predict(test_feature)
pred

array([5, 0, 1, 6, 0, 4, 4, 1, 2, 2, 0, 5, 1, 0, 3, 2, 1, 5, 3, 6, 4, 1,
       6, 6, 1, 5, 5, 3, 2, 2, 5, 3, 0, 2, 2, 3, 3, 0, 0, 5, 0, 1, 0, 2,
       4, 6, 0, 1, 2, 1, 6, 5, 6, 3, 0, 0, 2, 2, 2, 0, 1, 6, 1, 4, 2, 4,
       6, 4, 0, 0, 4, 4, 6, 3, 2, 3, 3, 5, 4, 1, 1, 4, 5, 5, 6, 2, 2, 6,
       1, 0, 3, 2, 5, 3, 2, 2, 0, 0, 5, 2, 6, 2, 5, 0, 5, 5, 4, 6, 1, 0,
       3, 0, 1, 0, 1, 1, 5, 0, 0, 1, 3, 3, 6, 1, 6, 4, 6, 0, 3, 1, 1, 2,
       4, 2, 2, 3, 1, 2, 2, 3, 0, 3, 2, 2, 6, 0, 0, 6, 3, 4, 2, 5, 5, 3,
       6, 6, 0, 0, 6, 0, 5, 5, 6, 6, 0, 0, 4, 4, 3, 3, 2, 4, 3, 2, 3, 2,
       5, 1, 6, 5, 0, 4, 5, 1, 3, 5, 2, 6, 6, 6, 4, 3, 1, 3, 0, 6, 5, 2,
       2, 3, 2, 0, 0, 5, 2, 4, 3, 2, 4, 5, 2, 4, 3, 2, 6, 0, 1, 2, 2, 4,
       4, 6, 1, 4, 2, 4, 5, 4, 2, 4, 0, 6, 1, 5, 1, 3, 5, 1, 1, 5, 2, 5,
       5, 0, 3, 4, 2, 6, 2, 3, 3, 4, 4, 6, 0, 6, 5, 3, 4, 5, 3, 3, 2, 6,
       1, 5, 3, 2, 4, 6, 6, 5, 6, 3, 0, 4, 6, 5, 0, 3, 2, 5, 4, 4, 5, 0,
       0, 3, 6, 2, 0, 2, 3, 6, 5, 2, 4, 4, 4, 4, 0,

In [359]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=test_label,y_pred=pred)
accuracy

0.9314420803782506