In [45]:
from ucimlrepo import fetch_ucirepo

cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

print(cdc_diabetes_health_indicators.metadata)

print(cdc_diabetes_health_indicators.variables) 

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Life Science', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_data

In [46]:
import pandas as pd
import numpy as np

In [47]:
def mondrian_k_anonymity(df, k, sensitive_attribute):
    # Sort the dataframe by the sensitive attribute
    df = df.sort_values(by=sensitive_attribute)

    # Calculate the number of records in each partition
    partition_size = len(df) // k

    # Initialize a list to store the anonymized partitions
    partitions = []

    # Split the dataset into partitions
    for i in range(0, len(df), partition_size):
        partition = df.iloc[i:i+partition_size]
        partitions.append(partition)

    # Generalize the quasi-identifiers within each partition
    for partition in partitions:
        for column in df.columns:
            if column != sensitive_attribute:
                # Generalization strategy
                if df[column].dtype == 'int64':
                    partition[column] = round(partition[column].mean())
                else:
                    # For categorical attributes, select the most common value
                    mode_value = partition[column].mode().iloc[0]
                    partition[column] = mode_value

    return partitions


In [48]:
encoded_df = X.copy()
encoded_df['Diabetes'] = y
encoded_df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


In [49]:
from sklearn.model_selection import train_test_split

features_train,features_test,labels_train,labels_test = train_test_split(
    encoded_df,np.ones(len(encoded_df)),random_state=17,test_size=0.2
)
features_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes
213499,1,1,1,25,1,0,1,0,0,0,...,0,3,0,0,1,1,8,2,2,0
22469,1,0,1,21,0,0,0,1,1,1,...,0,2,0,0,0,0,12,4,7,0
187030,1,0,1,23,1,0,0,1,1,1,...,1,2,30,14,0,1,2,6,6,0
59104,0,0,1,20,1,0,0,0,0,1,...,0,3,10,2,0,1,2,4,8,0
250454,1,1,1,27,1,0,0,1,0,1,...,0,1,0,3,0,0,13,5,5,0


In [50]:
features_test.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes
81110,1,1,1,37,0,0,0,1,0,1,...,0,3,0,0,0,1,5,5,7,0
199303,0,0,1,26,0,0,0,0,0,1,...,0,2,0,0,0,1,6,4,6,0
217732,1,1,1,30,1,1,0,0,1,1,...,0,2,0,30,1,0,10,6,5,0
123241,1,0,1,27,0,0,0,1,1,1,...,1,3,0,0,0,0,5,4,6,0
114291,0,0,1,22,0,0,0,1,0,1,...,0,2,30,0,0,0,9,6,8,0


In [51]:
k = 5
sensitive_attribute = encoded_df.columns[-1]

partitioned_data = mondrian_k_anonymity(encoded_df, k, sensitive_attribute)

for idx, partition in enumerate(partitioned_data):
    print(f"Partition {idx + 1} (Sensitive Value: {partition[sensitive_attribute].iloc[0]}):")
    print(partition,end='\n\n')

anonymized_df = pd.concat(partitioned_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partition[column] = round(partition[column].mean())


Partition 1 (Sensitive Value: 0):
        HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  \
0            0         0          1   28       0       0   
162451       0         0          1   28       0       0   
162452       0         0          1   28       0       0   
162453       0         0          1   28       0       0   
162454       0         0          1   28       0       0   
...        ...       ...        ...  ...     ...     ...   
148840       0         0          1   28       0       0   
148841       0         0          1   28       0       0   
148842       0         0          1   28       0       0   
148843       0         0          1   28       0       0   
148844       0         0          1   28       0       0   

        HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  ...  NoDocbcCost  \
0                          0             1       1        1  ...            0   
162451                     0             1       1        1  ...            0   
16

In [52]:
anonymized_df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes
0,0,0,1,28,0,0,0,1,1,1,...,0,2,3,4,0,0,8,5,6,0
162451,0,0,1,28,0,0,0,1,1,1,...,0,2,3,4,0,0,8,5,6,0
162452,0,0,1,28,0,0,0,1,1,1,...,0,2,3,4,0,0,8,5,6,0
162453,0,0,1,28,0,0,0,1,1,1,...,0,2,3,4,0,0,8,5,6,0
162454,0,0,1,28,0,0,0,1,1,1,...,0,2,3,4,0,0,8,5,6,0


In [53]:
label = encoded_df.iloc[:,-1]
encoded_df = encoded_df.iloc[:,:-1]
encoded_df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1,1,1,40,1,0,0,0,0,1,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,25,1,0,0,1,0,0,...,0,1,3,0,0,0,0,7,6,1
2,1,1,1,28,0,0,0,0,1,0,...,1,1,5,30,30,1,0,9,4,8
3,1,0,1,27,0,0,0,1,1,1,...,1,0,2,0,0,0,0,11,3,6
4,1,1,1,24,0,0,0,1,1,1,...,1,0,2,3,0,0,0,11,5,4


In [54]:
label.head()

0    0
1    0
2    0
3    0
4    0
Name: Diabetes, dtype: int64

In [55]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
model.fit(encoded_df,label)

In [56]:
test_feature = features_test.iloc[:,:-1]
test_label = features_test.iloc[:,-1]
test_feature.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
81110,1,1,1,37,0,0,0,1,0,1,...,1,0,3,0,0,0,1,5,5,7
199303,0,0,1,26,0,0,0,0,0,1,...,1,0,2,0,0,0,1,6,4,6
217732,1,1,1,30,1,1,0,0,1,1,...,1,0,2,0,30,1,0,10,6,5
123241,1,0,1,27,0,0,0,1,1,1,...,0,1,3,0,0,0,0,5,4,6
114291,0,0,1,22,0,0,0,1,0,1,...,1,0,2,30,0,0,0,9,6,8


In [57]:
test_label.head()

81110     0
199303    0
217732    0
123241    0
114291    0
Name: Diabetes, dtype: int64

In [59]:
pred = model.predict(test_feature.iloc[0:1000])

In [60]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=test_label.iloc[0:1000],y_pred=pred)
accuracy

0.906