### Autor: Manuelle Ndamtang, 66 73 20 00

In [1]:
import pandas as pd
import numpy as np
import itertools
import math
from datetime import date
# for the zipcode generalization
!pip install -U uszipcode 
from uszipcode import SearchEngine

df = pd.read_csv("dataset_HW3.csv")
dff = df.copy()

#Check if k-anonymous
def is_k_anonymous(df, k):
    tmp = df.groupby(df.columns.to_list()).size()
    return all([y >= 2 for y in tmp])

#This Calculate the entropy of the dataset
def entropy(df):
    cols = df.columns.to_list()
    N = len(df)
    tmp = list(df.groupby(cols).size())
    return -sum([c/N * math.log(c/N, 2) for c in tmp])

# this function checks if the datset is l-diverse
def is_l_diverse(df, quasi_feats, sensitive, l):
    k_anonymous_class_counts = pd.DataFrame(df.groupby(quasi_feats).size())
    for i in range(k_anonymous_class_counts.shape[0]):
        cls = pd.DataFrame(k_anonymous_class_counts.iloc[[i]])
        tmp = pd.merge(df, cls, how="inner", indicator=True, on=quasi_feats) # get data for one class
        l_diversity_counts = tmp.groupby([sensitive]).size()
        if len(l_diversity_counts) < l:
            return False
    return True



In [2]:
dff['disease'].value_counts() # visualize the values in disease

gastritis              255
hypertension           251
heart disease          208
HIV/AIDS               159
skin cancer            155
diabetes               149
Alzheimer's disease    147
endometriosis          143
multiple sclerosis     133
kidney disease         131
schizophrenia          129
breast cancer           70
prostate cancer         70
Name: disease, dtype: int64

# Data pseudonymization

In [3]:
import hashlib
import os

salt = os.urandom(256) # Remember this

dff.id = dff.apply(lambda x: hashlib.pbkdf2_hmac(
                'sha256', # The hash digest algorithm for HMAC
                x.id.encode('utf-8') + ':'.encode('utf-8') + x.dob.encode('utf-8'), # add to the id the dob attributed in order to ensure the unicity
                salt, # Provide the salt
                1#00000 # It is recommended to use at least 100,000 iterations of SHA-256 but for the seek of computation we only run once
            ), 1)

print(f'the ids are unique: { len(dff["id"]) == dff["id"].nunique()}')# check if the ids are unique
dff.head()

the ids are unique: True


Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,ancestry,number_vehicles,commute_time,accommodation,disease
0,b'L\x1b\xef\x80\x0e\xc3\xb0\x1f\x90\xb4\xbc\x0...,female,12/1950,26904,High School,Retired,2,married,Africa,2,0.0,Own house,Alzheimer's disease
1,b'\x98i`\x1d\xe1U\x95\xb8 X\xc7b\xad(\x08\x81D...,female,4/1940,78088,PhD/md,Retired,1,married,Africa,1,0.0,Rent flat,multiple sclerosis
2,b'n\xe18!\x90\\\xe5v\x0eiO\xdf\xdf\x05\x0bvD\x...,female,12/1966,51502,Bachelor,Employed,1,married,Asia,0,0.1,Rent flat,heart disease
3,b'f\x0c\xa5\x9a\x1d\x83<\xb70\xaa?\x83\xdb\x00...,female,4/1943,54080,Bachelor,Retired,1,married,Africa,0,0.0,Rent room,endometriosis
4,b'\x12\xd8\xcaT\xcdn\xbd\x13\xa8\\\xd2J\xed\x9...,female,10/1947,68785,High School,Retired,1,married,Europe,0,0.0,Rent flat,gastritis


# Selection of interesting features for this use case

In [4]:
feats = ["id", "gender", "zipcode", "dob", "employment", "children", "commute_time", "disease"] # The chosen features
dff = dff[feats]
dff.head()

Unnamed: 0,id,gender,zipcode,dob,employment,children,commute_time,disease
0,b'L\x1b\xef\x80\x0e\xc3\xb0\x1f\x90\xb4\xbc\x0...,female,26904,12/1950,Retired,2,0.0,Alzheimer's disease
1,b'\x98i`\x1d\xe1U\x95\xb8 X\xc7b\xad(\x08\x81D...,female,78088,4/1940,Retired,1,0.0,multiple sclerosis
2,b'n\xe18!\x90\\\xe5v\x0eiO\xdf\xdf\x05\x0bvD\x...,female,51502,12/1966,Employed,1,0.1,heart disease
3,b'f\x0c\xa5\x9a\x1d\x83<\xb70\xaa?\x83\xdb\x00...,female,54080,4/1943,Retired,1,0.0,endometriosis
4,b'\x12\xd8\xcaT\xcdn\xbd\x13\xa8\\\xd2J\xed\x9...,female,68785,10/1947,Retired,1,0.0,gastritis


# Generalization of the dataset

## Generalize zipcode

In [5]:
print(f'Is the dataset with only zipcode as quasi-identifier, 2-anonymous: {is_k_anonymous(dff[["zipcode"]], 2)}')# check the 2-anonymosity for this attribute

Is the dataset with only zipcode as quasi-identifier, 2-anonymous: False


In [6]:
search = SearchEngine(simple_zipcode=True)

state = [search.by_zipcode(zipcode).to_dict()["state"] for zipcode in dff['zipcode']]
dff['state'] = state
dff.drop('zipcode', axis=1, inplace=True)
dff['state'].value_counts() # check the 2-anonymosity for this attribute

TX    63
CA    57
PA    52
NY    52
IL    37
FL    32
OH    31
IA    28
MN    27
NC    26
MI    26
AL    24
VA    24
IN    23
MO    23
WV    22
AR    20
KS    20
OK    19
CO    17
GA    17
KY    16
MS    15
WI    15
AZ    15
MT    14
WA    14
LA    14
NE    14
OR    12
TN    11
SC    11
NM     9
AK     9
UT     8
MD     7
HI     7
SD     7
DC     6
NV     5
ND     5
ID     5
WY     3
DE     1
Name: state, dtype: int64

## Generalize the children attribute

In [7]:
dff['children'] = dff['children'].apply(lambda x: True if x > 0 else False)
dff.head()

Unnamed: 0,id,gender,dob,employment,children,commute_time,disease,state
0,b'L\x1b\xef\x80\x0e\xc3\xb0\x1f\x90\xb4\xbc\x0...,female,12/1950,Retired,True,0.0,Alzheimer's disease,
1,b'\x98i`\x1d\xe1U\x95\xb8 X\xc7b\xad(\x08\x81D...,female,4/1940,Retired,True,0.0,multiple sclerosis,
2,b'n\xe18!\x90\\\xe5v\x0eiO\xdf\xdf\x05\x0bvD\x...,female,12/1966,Employed,True,0.1,heart disease,IA
3,b'f\x0c\xa5\x9a\x1d\x83<\xb70\xaa?\x83\xdb\x00...,female,4/1943,Retired,True,0.0,endometriosis,
4,b'\x12\xd8\xcaT\xcdn\xbd\x13\xa8\\\xd2J\xed\x9...,female,10/1947,Retired,True,0.0,gastritis,NE


## Generalize the dob attribute to use group of age instead

In [8]:
# Function to calculate the age
def get_age(m):
    today = date.today()
    return today.year - m.year - (today.month < m.month)

dff['dob'] = pd.to_datetime(df['dob'])
dff["age"] = dff["dob"].apply(get_age)
dff["age_group"] = list(pd.cut(dff["age"], bins=[0, 13, 19, 61, 100], labels=["Child", "Teen", "Adult", "Older"], include_lowest=False))
dff.drop(['age', 'dob'], axis=1, inplace=True)

# distribution of the year attribute over the dataset
print(f'The age_group attribute is 2 anonymized: {is_k_anonymous(dff[["age_group"]], 2)}')
dff.head()

The age_group attribute is 2 anonymized: True


Unnamed: 0,id,gender,employment,children,commute_time,disease,state,age_group
0,b'L\x1b\xef\x80\x0e\xc3\xb0\x1f\x90\xb4\xbc\x0...,female,Retired,True,0.0,Alzheimer's disease,,Older
1,b'\x98i`\x1d\xe1U\x95\xb8 X\xc7b\xad(\x08\x81D...,female,Retired,True,0.0,multiple sclerosis,,Older
2,b'n\xe18!\x90\\\xe5v\x0eiO\xdf\xdf\x05\x0bvD\x...,female,Employed,True,0.1,heart disease,IA,Adult
3,b'f\x0c\xa5\x9a\x1d\x83<\xb70\xaa?\x83\xdb\x00...,female,Retired,True,0.0,endometriosis,,Older
4,b'\x12\xd8\xcaT\xcdn\xbd\x13\xa8\\\xd2J\xed\x9...,female,Retired,True,0.0,gastritis,NE,Older


## Choose the set of features with the best entropy

In [9]:
#the quasi-identifier used for the calculation of the entropy
# state in not present because the measure doesn't concern it. At the end, we want to have the state even if we don't get the best entropy with it
feats = ["age_group", "dob", "zipcode", "gender", "employment", "children", "commute_time"]
init_entropy = entropy(df[feats[2:]])
print(f'The initial entropy: {init_entropy}')

The initial entropy: 10.965784284661922


In [10]:
feats.pop(2) # remove the 'zipcode' attributed not used for the transformed dataset
feats.pop(1) # remove the 'dob' attributed not used for the transformed dataset

# we are trying to find the set of features with the best entropy
choice = {}
choice["entropy"] = 0

for L in range(2, len(feats)+1):
    for subset in itertools.combinations(feats, L):
        if not subset: continue
        e = entropy(dff[list(subset)])
        if e >= choice["entropy"]:
            min_entropy = e
            choice["subset"] = list(subset)
            choice["nb_feats"] = len(subset)
            choice["entropy"] = e
            print(f'The subset are {subset}')
            print(f'The entropy for these features: {choice["nb_feats"]},\nThe number of features: {e}')

#The best set of features
print(f'\n\n******The best subset is {choice["subset"]}')
print(f'******The best entropy for these features: {choice["entropy"]},\n******The number of features: {choice["nb_feats"]}')


The subset are ('age_group', 'gender')
The entropy for these features: 2,
The number of features: 2.0366285399013697
The subset are ('age_group', 'employment')
The entropy for these features: 2,
The number of features: 2.085147025670713
The subset are ('age_group', 'commute_time')
The entropy for these features: 2,
The number of features: 3.3806215958469816
The subset are ('gender', 'commute_time')
The entropy for these features: 2,
The number of features: 3.619512967758485
The subset are ('age_group', 'gender', 'commute_time')
The entropy for these features: 3,
The number of features: 4.36600437481468
The subset are ('gender', 'employment', 'commute_time')
The entropy for these features: 3,
The number of features: 4.51352171448308
The subset are ('age_group', 'gender', 'employment', 'commute_time')
The entropy for these features: 4,
The number of features: 4.958186310508735
The subset are ('age_group', 'gender', 'employment', 'children', 'commute_time')
The entropy for these features:

In [11]:
choice["subset"].append("state") # we add state
# the set of dataset with the best entropy
copy_feats = choice["subset"].copy()
copy_feats.append("disease") # add sensitive data
copy_feats.insert(0, "id") # add the identifier
copy_dff = dff[copy_feats] # copy th set of data interesting for the use case
copy_dff.head()

Unnamed: 0,id,age_group,gender,employment,children,commute_time,state,disease
0,b'L\x1b\xef\x80\x0e\xc3\xb0\x1f\x90\xb4\xbc\x0...,Older,female,Retired,True,0.0,,Alzheimer's disease
1,b'\x98i`\x1d\xe1U\x95\xb8 X\xc7b\xad(\x08\x81D...,Older,female,Retired,True,0.0,,multiple sclerosis
2,b'n\xe18!\x90\\\xe5v\x0eiO\xdf\xdf\x05\x0bvD\x...,Adult,female,Employed,True,0.1,IA,heart disease
3,b'f\x0c\xa5\x9a\x1d\x83<\xb70\xaa?\x83\xdb\x00...,Older,female,Retired,True,0.0,,endometriosis
4,b'\x12\xd8\xcaT\xcdn\xbd\x13\xa8\\\xd2J\xed\x9...,Older,female,Retired,True,0.0,NE,gastritis


# check the k-diversity

In [12]:
# check if the dataset is 2-anonymous
print(f'The dataset is 2-anonymous: {is_k_anonymous(copy_dff[choice["subset"]], 2)}')

The dataset is 2-anonymous: False


In [13]:
# select classes which are not 2-anonymous and delete them
tmp = pd.DataFrame(dff.groupby(choice["subset"]).size().reset_index(name='counts'))
# The data to delete in order to have a 2-anonymous dataset
to_exclude = tmp.drop(tmp[tmp.counts >= 2].index)
to_exclude

Unnamed: 0,age_group,gender,employment,children,commute_time,state,counts
0,Adult,female,Employed,False,0.00,FL,1
1,Adult,female,Employed,False,0.43,IL,1
2,Adult,female,Employed,False,0.43,NE,1
3,Adult,female,Employed,False,1.10,GA,1
4,Adult,female,Employed,False,1.10,KY,1
...,...,...,...,...,...,...,...
551,Teen,female,Student,False,1.10,TX,1
552,Teen,male,Student,False,0.10,WA,1
553,Teen,male,Student,False,0.43,DC,1
554,Teen,male,Student,False,0.43,MD,1


In [14]:
copy_dff = pd.merge(copy_dff, to_exclude, how="outer", indicator=True, on=choice["subset"])
copy_dff = copy_dff.loc[copy_dff._merge == 'left_only'].drop(["_merge", "counts"], axis=1)
copy_dff.head()

Unnamed: 0,id,age_group,gender,employment,children,commute_time,state,disease
0,b'L\x1b\xef\x80\x0e\xc3\xb0\x1f\x90\xb4\xbc\x0...,Older,female,Retired,True,0.0,,Alzheimer's disease
1,b'\x98i`\x1d\xe1U\x95\xb8 X\xc7b\xad(\x08\x81D...,Older,female,Retired,True,0.0,,multiple sclerosis
2,b'f\x0c\xa5\x9a\x1d\x83<\xb70\xaa?\x83\xdb\x00...,Older,female,Retired,True,0.0,,endometriosis
3,"b'\x81\xf6\xfb\xf4\xf8\xcb,\xbe9?E|\xc1\xc7\xf...",Older,female,Retired,True,0.0,,multiple sclerosis
4,"b'\x86\x81L`\xda\xddm>\xe3@\xceI""J[\xa0?\x8f\x...",Older,female,Retired,True,0.0,,Alzheimer's disease


In [15]:
# Now we check if we are 2-anonymous with the new dataset
print(f'The new dataset is 2-anonymous: {is_k_anonymous(copy_dff[choice["subset"]], 2)}')

The new dataset is 2-anonymous: True


In [16]:
# The size of the new dataset
copy_dff.shape

(1574, 8)

# Check the l-diversity

In [17]:
sensitive_feat = "disease"
print(f'Is the dataset 2-diverse: {is_l_diverse(copy_dff, choice["subset"], sensitive_feat, 2)}')

Is the dataset 2-diverse: False


In [18]:
l_df = copy_dff.copy()

# delete classes that are not 2-diverse
k_anonymous_class_counts = pd.DataFrame(copy_dff.groupby(choice["subset"]).size().reset_index(name='counts'))
for i in range(k_anonymous_class_counts.shape[0]):
    quasi_feats = choice["subset"].copy()
    cls = pd.DataFrame(k_anonymous_class_counts.iloc[[i]])
    tmp = pd.merge(copy_dff, cls, how="inner", indicator=True, on=quasi_feats) # get data for one class
    l_diversity_counts = tmp.groupby([sensitive_feat]).size() # get sub_classes for each sensitive data
    if len(l_diversity_counts) < 2: # check if 2-diverse, else update the dataframe by deleting classes which are not 2-diverse
        l_df = pd.merge(l_df, cls, how="outer", indicator=True, on=choice["subset"])
        l_df = l_df.loc[l_df._merge == 'left_only'].drop(["_merge", "counts"], axis=1) 
    
print(f'Is the dataset 2-diverse: {is_l_diverse(l_df, choice["subset"], sensitive_feat, 2)}')
print(f'The new dataset is 2-anonymous: {is_k_anonymous(l_df[choice["subset"]], 2)}')

Is the dataset 2-diverse: True
The new dataset is 2-anonymous: True


In [19]:
#The sharpe of the new dataset
l_df.shape

(1564, 8)

# Save the anonymized dataset

In [20]:
l_df.to_csv("use_case2.csv")