In [1]:
import pandas as pd
import numpy as np
import itertools
import math
from datetime import date
# for the zipcode generalization
!pip install -U uszipcode 
from uszipcode import SearchEngine

df = pd.read_csv("dataset_HW3.csv")
dff = df.copy()

#Check if k-anonymous
def is_k_anonymous(df, k):
    tmp = df.groupby(df.columns.to_list()).size()
    return all([y >= 2 for y in tmp])

#This Calculate the entropy of the dataset
def entropy(df):
    cols = df.columns.to_list()
    N = len(df)
    tmp = list(df.groupby(cols).size())
    return -sum([c/N * math.log(c/N, 2) for c in tmp])

# this function checks if the datset is l-diverse
def is_l_diverse(df, quasi_feats, sensitive, l):
    k_anonymous_class_counts = pd.DataFrame(df.groupby(quasi_feats).size())
    for i in range(k_anonymous_class_counts.shape[0]):
        cls = pd.DataFrame(k_anonymous_class_counts.iloc[[i]])
        tmp = pd.merge(df, cls, how="inner", indicator=True, on=quasi_feats) # get data for one class
        l_diversity_counts = tmp.groupby([sensitive]).size()
        if len(l_diversity_counts) < l:
            return False
    return True



# Data pseudonymization

In [2]:
import hashlib
import os

salt = os.urandom(256) # Remember this

dff.id = dff.apply(lambda x: hashlib.pbkdf2_hmac(
                'sha256', # The hash digest algorithm for HMAC
                x.id.encode('utf-8') + ':'.encode('utf-8') + x.dob.encode('utf-8'), # add to the id the dob attributed in order to ensure the unicity
                salt, # Provide the salt
                1#00000 # It is recommended to use at least 100,000 iterations of SHA-256 but for the seek of computation we only run once
            ), 1)

print(f'the ids are unique: { len(dff["id"]) == dff["id"].nunique()}')# check if the ids are unique

the ids are unique: True


In [3]:
dff.head()

Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,ancestry,number_vehicles,commute_time,accommodation,disease
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,12/1950,26904,High School,Retired,2,married,Africa,2,0.0,Own house,Alzheimer's disease
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,4/1940,78088,PhD/md,Retired,1,married,Africa,1,0.0,Rent flat,multiple sclerosis
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,12/1966,51502,Bachelor,Employed,1,married,Asia,0,0.1,Rent flat,heart disease
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,4/1943,54080,Bachelor,Retired,1,married,Africa,0,0.0,Rent room,endometriosis
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,10/1947,68785,High School,Retired,1,married,Europe,0,0.0,Rent flat,gastritis


# Generalization in order to anonymized the dataset

## Delete the unecessary attributes

In [4]:
dff.drop(['ancestry'], axis=1, inplace=True)
dff.head()

Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,12/1950,26904,High School,Retired,2,married,2,0.0,Own house,Alzheimer's disease
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,4/1940,78088,PhD/md,Retired,1,married,1,0.0,Rent flat,multiple sclerosis
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,12/1966,51502,Bachelor,Employed,1,married,0,0.1,Rent flat,heart disease
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,4/1943,54080,Bachelor,Retired,1,married,0,0.0,Rent room,endometriosis
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,10/1947,68785,High School,Retired,1,married,0,0.0,Rent flat,gastritis


## Generalize the children attribute

In [5]:
dff['children'] = dff['children'].apply(lambda x: True if x > 0 else False)
dff.head()

Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,12/1950,26904,High School,Retired,True,married,2,0.0,Own house,Alzheimer's disease
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,4/1940,78088,PhD/md,Retired,True,married,1,0.0,Rent flat,multiple sclerosis
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,12/1966,51502,Bachelor,Employed,True,married,0,0.1,Rent flat,heart disease
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,4/1943,54080,Bachelor,Retired,True,married,0,0.0,Rent room,endometriosis
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,10/1947,68785,High School,Retired,True,married,0,0.0,Rent flat,gastritis


## Generalize the dob attribute to use group of age instead

In [6]:
# Function to calculate the age
def get_age(m):
    today = date.today()
    return today.year - m.year - (today.month < m.month)

In [7]:
dff['dob'] = pd.to_datetime(df['dob'])
dff["age"] = dff["dob"].apply(get_age)
dff["age_group"] = list(pd.cut(dff["age"], bins=[0, 13, 19, 61, 100], labels=["Child", "Teen", "Adult", "Older"], include_lowest=False))
dff.drop(['age', 'dob'], axis=1, inplace=True)

# distribution of the year attribute over the dataset
print(f'The age_group attribute is 2 anonymized: {is_k_anonymous(dff[["age_group"]], 2)}')
dff.head()

The age_group attribute is 2 anonymized: True


Unnamed: 0,id,gender,zipcode,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease,age_group
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,26904,High School,Retired,True,married,2,0.0,Own house,Alzheimer's disease,Older
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,78088,PhD/md,Retired,True,married,1,0.0,Rent flat,multiple sclerosis,Older
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,51502,Bachelor,Employed,True,married,0,0.1,Rent flat,heart disease,Adult
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,54080,Bachelor,Retired,True,married,0,0.0,Rent room,endometriosis,Older
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,68785,High School,Retired,True,married,0,0.0,Rent flat,gastritis,Older


## Generalize zipcode 

In [8]:
is_k_anonymous(dff[['zipcode']], 2)# check the 2-anonymosity for this attribute

False

In [9]:
search = SearchEngine(simple_zipcode=True)

state = [search.by_zipcode(zipcode).to_dict()["state"] for zipcode in dff['zipcode']]
dff.drop('zipcode', axis=1, inplace=True)
dff['state'] = state
dff['state'].value_counts() # check the 2-anonymosity for this attribute

TX    63
CA    57
PA    52
NY    52
IL    37
FL    32
OH    31
IA    28
MN    27
NC    26
MI    26
AL    24
VA    24
IN    23
MO    23
WV    22
AR    20
KS    20
OK    19
CO    17
GA    17
KY    16
MS    15
WI    15
AZ    15
MT    14
WA    14
LA    14
NE    14
OR    12
TN    11
SC    11
NM     9
AK     9
UT     8
MD     7
HI     7
SD     7
DC     6
NV     5
ND     5
ID     5
WY     3
DE     1
Name: state, dtype: int64

In [10]:
dff.head()

Unnamed: 0,id,gender,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease,age_group,state
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,High School,Retired,True,married,2,0.0,Own house,Alzheimer's disease,Older,
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,PhD/md,Retired,True,married,1,0.0,Rent flat,multiple sclerosis,Older,
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,Bachelor,Employed,True,married,0,0.1,Rent flat,heart disease,Adult,IA
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,Bachelor,Retired,True,married,0,0.0,Rent room,endometriosis,Older,
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,High School,Retired,True,married,0,0.0,Rent flat,gastritis,Older,NE


## Generalize the education attribute

In [11]:
dff.education.value_counts()#get classes for education feature

High School              818
Bachelor                 603
Masters                  293
Less than High School    212
PhD/md                    74
Name: education, dtype: int64

In [12]:
dff.education = dff.education.replace({"Less than High School": "Less than\in High School", 
                       "Masters": "With\more than a master", 
                       "PhD/md": "With\more than a master",
                       "High School": "Less than\in High School"
                      })
print(dff.education.value_counts())
dff.head()

Less than\in High School    1030
Bachelor                     603
With\more than a master      367
Name: education, dtype: int64


Unnamed: 0,id,gender,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease,age_group,state
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,Less than\in High School,Retired,True,married,2,0.0,Own house,Alzheimer's disease,Older,
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,With\more than a master,Retired,True,married,1,0.0,Rent flat,multiple sclerosis,Older,
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,Bachelor,Employed,True,married,0,0.1,Rent flat,heart disease,Adult,IA
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,Bachelor,Retired,True,married,0,0.0,Rent room,endometriosis,Older,
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,Less than\in High School,Retired,True,married,0,0.0,Rent flat,gastritis,Older,NE


## Generalize the commute_times feature

In [13]:
#Get the count per each class
dff.commute_time.value_counts(ascending = True) # get unique values in order to see the distribution to the hours

3.43      39
3.10      49
2.77      67
2.10      86
0.77      98
0.10     101
1.77     101
0.43     102
1.43     102
2.43     102
1.10     110
0.00    1043
Name: commute_time, dtype: int64

In [14]:
def compute_commute_time(x):
    if x < 1:
        return 'less than 1h'
    if x < 2:
        return 'less than 2h'
    if x < 3:
        return 'less than 3h'
    if x >= 3:
        return 'more or equal than 3h'
dff.commute_time = dff.commute_time.apply(compute_commute_time)
print(dff.commute_time.value_counts())
dff.head()

less than 1h             1344
less than 2h              313
less than 3h              255
more or equal than 3h      88
Name: commute_time, dtype: int64


Unnamed: 0,id,gender,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease,age_group,state
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,Less than\in High School,Retired,True,married,2,less than 1h,Own house,Alzheimer's disease,Older,
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,With\more than a master,Retired,True,married,1,less than 1h,Rent flat,multiple sclerosis,Older,
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,Bachelor,Employed,True,married,0,less than 1h,Rent flat,heart disease,Adult,IA
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,Bachelor,Retired,True,married,0,less than 1h,Rent room,endometriosis,Older,
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,Less than\in High School,Retired,True,married,0,less than 1h,Rent flat,gastritis,Older,NE


## Generalize number_vehicles

In [15]:
#The classes counts for the number_vehicules attributes before generalization
print(dff.number_vehicles.value_counts())

dff.number_vehicles = dff.number_vehicles.replace({0: "0", 1: "1", 2: "More than 1", 3: "More than 1"})
dff.number_vehicles.value_counts()

1    786
0    730
2    451
3     33
Name: number_vehicles, dtype: int64


1              786
0              730
More than 1    484
Name: number_vehicles, dtype: int64

In [16]:
dff.head()

Unnamed: 0,id,gender,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease,age_group,state
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,female,Less than\in High School,Retired,True,married,More than 1,less than 1h,Own house,Alzheimer's disease,Older,
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,female,With\more than a master,Retired,True,married,1,less than 1h,Rent flat,multiple sclerosis,Older,
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,female,Bachelor,Employed,True,married,0,less than 1h,Rent flat,heart disease,Adult,IA
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,female,Bachelor,Retired,True,married,0,less than 1h,Rent room,endometriosis,Older,
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,female,Less than\in High School,Retired,True,married,0,less than 1h,Rent flat,gastritis,Older,NE


## Generalize accommodation

In [17]:
# classes count before generalization
print(dff.accommodation.value_counts())
dff.accommodation = dff.accommodation.replace({"Own house": "Own", "Own flat": "Own", "Rent flat": "Rent", "Rent house": "Rent", "Rent room": "Rent"})

# class count after generalization
dff.accommodation.value_counts()

Own house     707
Own flat      561
Rent flat     357
Rent house    268
Rent room     107
Name: accommodation, dtype: int64


Own     1268
Rent     732
Name: accommodation, dtype: int64

# Choose the set of features with the best entropy 

In [18]:
#the quasi-identifier used for the calculation of the entropy
feats = ["age_group", "state", "dob", "zipcode", "gender", "education", "employment", "children", "marital_status", "number_vehicles", "commute_time", "accommodation"]
init_entropy = entropy(df[feats[2:]])
print(f'The initial entropy: {init_entropy}')

The initial entropy: 10.965784284661922


In [19]:
feats.pop(3) # remove the 'zipcode' attributed not used for the transformed dataset
feats.pop(2) # remove the 'dob' attributed not used for the transformed dataset

# we are trying to find the best entropy
choice = {}
choice["entropy"] = 0

for L in range(7, len(feats)+1):
    for subset in itertools.combinations(feats, L):
        if not subset: continue
        e = entropy(dff[list(subset)])
        if e >= choice["entropy"]:
            min_entropy = e
            choice["subset"] = list(subset)
            choice["nb_feats"] = len(subset)
            choice["entropy"] = e
            print(f'The subset are {subset}')
            print(f'The entropy for these features: {choice["nb_feats"]},\nThe number of features: {e}')

#The best set of features
print(f'\n\n******The best subset is {subset}')
print(f'******The best entropy for these features: {choice["entropy"]},\n******The number of features: {choice["nb_feats"]}')




******The actual entropy with the transformed dataset: 4.697981031329513,
******The number of features: 10


The subset are ('age_group', 'state', 'gender', 'education', 'employment', 'children', 'marital_status')
The entropy for these features: 7,
The number of features: 4.4908916458832815
The subset are ('age_group', 'state', 'gender', 'education', 'employment', 'children', 'number_vehicles')
The entropy for these features: 7,
The number of features: 4.555922381303546
The subset are ('age_group', 'state', 'gender', 'education', 'employment', 'marital_status', 'number_vehicles')
The entropy for these features: 7,
The number of features: 4.6312065938186935
The subset are ('age_group', 'state', 'gender', 'education', 'marital_status', 'number_vehicles', 'commute_time')
The entropy for these features: 7,
The number of features: 4.6577163688230225
The subset are ('age_group', 'gender', 'education', 'employment', 'children', 'marital_status', 'number_vehicles')
The entropy for these feat

In [20]:
# the set of dataset with the best entropy
copy_feats = choice["subset"].copy()
copy_feats.append("disease") # add sensitive data
copy_feats.insert(0, "id") # add the identifier
copy_dff = dff[copy_feats] # copy th set of data interesting for the use case
copy_dff.head()

Unnamed: 0,id,age_group,gender,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,Older,female,Less than\in High School,Retired,True,married,More than 1,less than 1h,Own,Alzheimer's disease
1,b'\xcce%}~\x07\xbfXq\xdf\xf1\xa78\xa7\xe2]\xa9...,Older,female,With\more than a master,Retired,True,married,1,less than 1h,Rent,multiple sclerosis
2,b't\xff\xa4\xa0\x0b\xa6\t\xeaC\x0c\x0f\x95=y\x...,Adult,female,Bachelor,Employed,True,married,0,less than 1h,Rent,heart disease
3,b'7\xd3\xa8^\xf9\x0b\x08\x85/;|/\x9c\xee\x86\x...,Older,female,Bachelor,Retired,True,married,0,less than 1h,Rent,endometriosis
4,b'\xee\xa4\x83\xfcK\xd5c\xd3\xf9\xd6\x13\x83\x...,Older,female,Less than\in High School,Retired,True,married,0,less than 1h,Rent,gastritis


# check the k-diversity

In [21]:
# check if the dataset is 2-anonymous
print(f'The dataset is 2-anonymous: {is_k_anonymous(copy_dff[choice["subset"]], 2)}')

The dataset is 2-anonymous: False


In [22]:
# select classes which are not 2-anonymous and delete them
tmp = pd.DataFrame(dff.groupby(choice["subset"]).size().reset_index(name='counts'))
# The data to delete in order to have a 2-anonymous dataset
to_exclude = tmp.drop(tmp[tmp.counts >= 2].index)
to_exclude

Unnamed: 0,age_group,gender,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,counts
0,Adult,female,Bachelor,Employed,False,divorced,0,more or equal than 3h,Own,1
1,Adult,female,Bachelor,Employed,False,single,0,less than 1h,Rent,1
2,Adult,female,Bachelor,Employed,False,single,0,less than 2h,Rent,1
4,Adult,female,Bachelor,Employed,False,single,1,less than 3h,Own,1
5,Adult,female,Bachelor,Employed,False,single,More than 1,less than 3h,Own,1
...,...,...,...,...,...,...,...,...,...,...
659,Teen,female,Less than\in High School,Student,False,single,1,less than 2h,Rent,1
660,Teen,female,Less than\in High School,Unemployed,False,single,1,less than 1h,Own,1
662,Teen,male,Less than\in High School,Student,False,single,0,less than 3h,Rent,1
664,Teen,male,Less than\in High School,Student,False,single,1,less than 3h,Own,1


In [23]:
copy_dff = pd.merge(copy_dff, to_exclude, how="outer", indicator=True, on=choice["subset"])
copy_dff = copy_dff.loc[copy_dff._merge == 'left_only'].drop(["_merge", "counts"], axis=1)
copy_dff.head()

Unnamed: 0,id,age_group,gender,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,b'\xaf\xb9%\x8c.\xcc^|\xcf\xa38\xd5u\xa0\x80\x...,Older,female,Less than\in High School,Retired,True,married,More than 1,less than 1h,Own,Alzheimer's disease
1,b'%c\x90\xd5\xc7\xbd|.P\x19$\xdb\x9e\xd1\x93\x...,Older,female,Less than\in High School,Retired,True,married,More than 1,less than 1h,Own,Alzheimer's disease
2,b'\xdd\xc2\x9c\xc6\xdb\xac~s\xdfa\xce\xdb\xfe5...,Older,female,Less than\in High School,Retired,True,married,More than 1,less than 1h,Own,kidney disease
3,b'\x15\xe9\xdao]\xea\x19EbvH\x01\xc9\x1a\xdf\x...,Older,female,Less than\in High School,Retired,True,married,More than 1,less than 1h,Own,endometriosis
4,b'\xc6\xbb\x1d\x14\xban\xc5\x92&\xee\xae\xe8\x...,Older,female,Less than\in High School,Retired,True,married,More than 1,less than 1h,Own,skin cancer


In [24]:
# Now we check if we are 2-anonymous with the new dataset
print(f'The new dataset is 2-anonymous: {is_k_anonymous(copy_dff[choice["subset"]], 2)}')

The new dataset is 2-anonymous: True


In [25]:
# The size of the new dataset
copy_dff.shape

(1624, 11)

# Check the l-diversity

In [26]:
sensitive_feat = "disease"
print(f'Is the dataset 2-diverse: {is_l_diverse(copy_dff, choice["subset"], sensitive_feat, 2)}')

Is the dataset 2-diverse: False


In [27]:
l_df = copy_dff.copy()

# delete classes that are not 2-diverse
k_anonymous_class_counts = pd.DataFrame(copy_dff.groupby(choice["subset"]).size().reset_index(name='counts'))
for i in range(k_anonymous_class_counts.shape[0]):
    quasi_feats = choice["subset"].copy()
    cls = pd.DataFrame(k_anonymous_class_counts.iloc[[i]])
    tmp = pd.merge(copy_dff, cls, how="inner", indicator=True, on=quasi_feats) # get data for one class
    l_diversity_counts = tmp.groupby([sensitive_feat]).size() # get sub_classes for each sensitive data
    if len(l_diversity_counts) < 2: # check if 2-diverse, else update the dataframe by deleting classes which are not 2-diverse
        l_df = pd.merge(l_df, cls, how="outer", indicator=True, on=choice["subset"])
        l_df = l_df.loc[l_df._merge == 'left_only'].drop(["_merge", "counts"], axis=1) 
    
print(f'Is the dataset 2-diverse: {is_l_diverse(l_df, choice["subset"], sensitive_feat, 2)}')
print(f'The new dataset is 2-anonymous: {is_k_anonymous(l_df[choice["subset"]], 2)}')

Is the dataset 2-diverse: True
The new dataset is 2-anonymous: True


In [28]:
#The sharpe of the new dataset
l_df.shape

(1591, 11)

# Save the anonymized dataset

In [29]:
l_df.to_csv("use_case1.csv")