In [18]:
import pandas as pd
from mondrian import anonymize_df
import numpy as np

In [4]:
# Load the dataset
col_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
categorical = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'gender', 'native-country', 'race', 'income',
]
# columns used in the paper
to_keep = [
    'age', 'workclass', 'education', 'marital-status', 'occupation', 'race',
    'gender', 'native-country', 'income',
]
# Categorical columns that survive the column selection, in `categorical` order.
categorical_to_keep = [name for name in categorical if name in to_keep]
df = pd.read_csv(
    "adult.txt",
    sep=",",
    header=None,
    names=col_names,
    index_col=False,
    engine='python',
)

In [5]:
# Map each positional column index to its name in `to_keep`.
# The index range is sized from `to_keep` itself (instead of a hard-coded 9)
# so the mapping is never silently truncated if the column list changes.
zip_iterator = zip(range(len(to_keep)), to_keep)
col_dict = dict(zip_iterator)

In [6]:
# Restrict the index -> name mapping to the categorical columns that were kept,
# and keep their positional indices as a plain list.
cat_dict = {
    idx: name
    for idx, name in col_dict.items()
    if name in categorical_to_keep
}
cat_indices = list(cat_dict)

In [7]:
# Remove missing values: keep only the selected columns, blank out every cell
# equal to the string '-1', drop the rows that now contain a gap, and renumber
# the remaining rows from 0.
df = df.loc[:, to_keep]
df = df[df != '-1'].dropna().reset_index(drop=True)
df

Unnamed: 0,age,workclass,education,marital-status,occupation,race,gender,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,White,Male,United-States,<=50k
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,United-States,<=50k
2,38,Private,HS-grad,Divorced,Handlers-cleaners,White,Male,United-States,<=50k
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Black,Male,United-States,<=50k
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Black,Female,Cuba,<=50k
...,...,...,...,...,...,...,...,...,...
45217,33,Private,Bachelors,Never-married,Prof-specialty,White,Male,United-States,<=50k
45218,39,Private,Bachelors,Divorced,Prof-specialty,White,Female,United-States,<=50k
45219,38,Private,Bachelors,Married-civ-spouse,Prof-specialty,White,Male,United-States,<=50k
45220,44,Private,Bachelors,Divorced,Adm-clerical,Asian-Pac-Islander,Male,United-States,<=50k


In [9]:
# Size of the largest group of rows that are identical across every column.
m = df.groupby(list(df.columns), as_index=False).size()['size'].max()
# From this point on `df` is a NumPy array rather than a DataFrame.
df = np.array(df)

In [11]:
# Measure how widely each column varies inside a partition, for categorical
# and numeric columns alike.
def colSpans(df, dimensions, cat_dict, partition):
  """Return the span of every column in ``dimensions`` over ``partition``.

  Args:
    df: 2-D NumPy array indexed as ``df[rows, column]``.
    dimensions: Iterable of column indices to measure.
    cat_dict: Dict whose keys are the indices of the categorical columns.
    partition: Row indices that make up the partition.

  Returns:
    Dict mapping each column index to its span: the number of distinct values
    for a categorical column, ``max - min`` for any other column.
  """

  def width(col):
    values = df[partition, col]
    if col in cat_dict:
      # categorical: count the distinct values present
      return len(np.unique(values))
    # numeric: range of the values
    return np.max(values) - np.min(values)

  return {col: width(col) for col in dimensions}

In [12]:
# Split a partition in two along one dimension: by halves of the sorted unique
# values for categorical columns, by the median for numeric columns.
def splitVal(df, dim, part, cat_dict, mode):
  """Split the rows listed in ``part`` into two groups along column ``dim``.

  Categorical columns (``dim`` is a key of ``cat_dict``):
    * ``'strict'``: rows whose value lies in the first half of the sorted
      unique values go left, all other rows go right.
    * ``'relaxed'``: same initial split, then ``|len(lhs) - len(rhs)| // 2``
      rows are moved from the larger side to the smaller one.

  Numeric columns:
    * ``'strict'``: values below the median go left, values greater than or
      equal to the median go right.
    * ``'relaxed'``: values strictly below/above the median go left/right and
      the rows equal to the median are distributed at random so that the two
      sides end up as balanced as possible (uses the global NumPy RNG).

  Any other ``mode`` value is treated as ``'relaxed'``.

  Args:
    df: 2-D NumPy array indexed as ``df[rows, column]``.
    dim: Index of the column to split on.
    part: List of row indices forming the partition.
    cat_dict: Dict whose keys are the indices of the categorical columns.
    mode: ``'strict'`` or ``'relaxed'``.

  Returns:
    Tuple ``(lhs, rhs)`` of lists of Python ints holding the row indices of
    the two halves.
  """
  dfp = df[part, dim]
  # Unknown modes fall back to relaxed. (The previous recursive fallback
  # dropped the cat_dict argument and raised TypeError.)
  relaxed = mode != 'strict'

  if dim in cat_dict:  # categorical variables
    unique = list(np.unique(dfp))
    half = len(unique) // 2
    lhs = [part[i] for i in np.where(np.isin(dfp, unique[:half]))[0]]
    rhs = [part[i] for i in np.where(np.isin(dfp, unique[half:]))[0]]

    if relaxed:
      shift = abs(len(lhs) - len(rhs)) // 2
      # Guard against shift == 0: slicing with -0 would select the whole list
      # and collapse the split onto a single side.
      if shift > 0:
        if len(lhs) < len(rhs):
          lhs, rhs = lhs + rhs[:shift], rhs[shift:]
        else:
          lhs, rhs = lhs[:-shift], rhs + lhs[-shift:]

  else:  # numeric variables are split around the median
    median = np.median(dfp)
    below = [part[i] for i in np.where(dfp < median)[0]]

    if not relaxed:
      lhs = below
      rhs = [part[i] for i in np.where(dfp >= median)[0]]
    else:
      above = [part[i] for i in np.where(dfp > median)[0]]
      # Shuffle the rows equal to the median once, then deal them out.
      ties = np.random.permutation([part[i] for i in np.where(dfp == median)[0]])
      gap = len(below) - len(above)
      # The first |gap| ties level the smaller side; the remainder is split
      # in half (the right side receives the extra row when it is odd).
      fill, rest = ties[:abs(gap)], ties[abs(gap):]
      cut = len(rest) // 2
      if gap < 0:
        lhs = np.concatenate((below, fill, rest[:cut]))
        rhs = np.concatenate((above, rest[cut:]))
      else:
        lhs = np.concatenate((below, rest[:cut]))
        rhs = np.concatenate((above, fill, rest[cut:]))

  return [int(x) for x in lhs], [int(x) for x in rhs]

In [13]:
# Create the partitions by repeatedly splitting the full row set.
def partitioning(df, dimensions, k, cat_dict, mode):
  """Partition the rows of ``df`` into groups that cannot be split further.

  Partitions are processed first-in, first-out. A partition with fewer than
  ``2*k`` rows is final. Otherwise the columns in ``dimensions`` are tried from
  the widest span to the narrowest, and the first split whose two halves both
  hold at least ``k`` rows is accepted; if no column yields such a split the
  partition is final.

  Args:
    df: 2-D NumPy array of records.
    dimensions: Column indices that may be used for splitting.
    k: Minimum number of rows allowed in each half of a split.
    cat_dict: Dict whose keys are the indices of the categorical columns.
    mode: Split mode forwarded to ``splitVal``.

  Returns:
    List of partitions, each a list of row indices.
  """
  done = []
  pending = [list(range(len(df)))]

  while pending:
    current = pending.pop(0)

    # Too small to produce two halves of at least k rows each.
    if len(current) < 2*k:
      done.append(current)
      continue

    spans = colSpans(df, dimensions, cat_dict, current)
    widest_first = sorted(spans, key=spans.get, reverse=True)

    for dim in widest_first:
      left, right = splitVal(df, dim, current, cat_dict, mode)
      if len(left) >= k and len(right) >= k:
        pending.extend((left, right))
        break
    else:
      # No column produced an admissible split.
      done.append(current)

  return done

In [15]:
# Build the partitions
k = 5

quasi_identifiers_cols = [
    'age', 'workclass', 'education', 'marital-status',
    'occupation', 'race', 'gender', 'native-country',
]
# index -> name for the quasi-identifier columns, plus the bare indices
quasi_identifiers = {
    idx: name for idx, name in col_dict.items() if name in quasi_identifiers_cols
}
quasi_identifiers_ix = list(quasi_identifiers)

sensitive_data_cols = ['income']
# index -> name for the sensitive columns, plus the bare indices
sensitive_data = {
    idx: name for idx, name in col_dict.items() if name in sensitive_data_cols
}
sensitive_data_ix = list(sensitive_data)

equivalence_classes = partitioning(
    df, quasi_identifiers_ix, k, cat_dict, 'relaxed'
)

In [20]:
# Run the imported `anonymize_df` helper on the data array with the computed
# equivalence classes, the quasi-identifier / sensitive column indices and the
# categorical-column mapping.
# NOTE(review): presumably generalizes the quasi-identifier values within each
# class — confirm against mondrian.anonymize_df.
dfn = anonymize_df(df, equivalence_classes, quasi_identifiers_ix, sensitive_data_ix, cat_dict)

In [22]:
# Preview the first five rows of the original data array.
df[:5]

array([[39, 'State-gov', 'Bachelors', 'Never-married', 'Adm-clerical',
        'White', 'Male', 'United-States', '<=50k'],
       [50, 'Self-emp-not-inc', 'Bachelors', 'Married-civ-spouse',
        'Exec-managerial', 'White', 'Male', 'United-States', '<=50k'],
       [38, 'Private', 'HS-grad', 'Divorced', 'Handlers-cleaners',
        'White', 'Male', 'United-States', '<=50k'],
       [53, 'Private', '11th', 'Married-civ-spouse', 'Handlers-cleaners',
        'Black', 'Male', 'United-States', '<=50k'],
       [28, 'Private', 'Bachelors', 'Married-civ-spouse',
        'Prof-specialty', 'Black', 'Female', 'Cuba', '<=50k']],
      dtype=object)

In [23]:
# Preview the first five rows of the anonymized result.
dfn[:5]

array([['37-40', 'Private,State-gov',
        'Assoc-acdm,Bachelors,Some-college',
        'Divorced,Married-civ-spouse,Never-married',
        'Adm-clerical,Craft-repair,Exec-managerial',
        'Asian-Pac-Islander,White', 'Male', 'Iran,United-States',
        '<=50k'],
       ['47-50', 'Private,Self-emp-not-inc',
        'Bachelors,HS-grad,Some-college',
        'Married-civ-spouse,Never-married,Separated',
        'Adm-clerical,Craft-repair,Exec-managerial',
        'Asian-Pac-Islander,White', 'Female,Male',
        'India,Iran,Ireland,United-States', '<=50k'],
       ['38-40', 'Private,Self-emp-inc,Self-emp-not-inc',
        'HS-grad,Masters,Some-college',
        'Divorced,Married-civ-spouse,Never-married',
        'Craft-repair,Exec-managerial,Handlers-cleaners', 'White',
        'Female,Male', 'Iran,Ireland,United-States', '<=50k'],
       ['51-55', 'Private,Self-emp-inc', '11th,1st-4th,7th-8th,9th',
        'Married-civ-spouse,Married-spouse-absent,Never-married,Widowed',
    