# Dataset Preprocessing

## Adult Data

In [2]:
import numpy as np
import torch
import pandas as pd
import os

In [3]:
def process_csv(filename, label_name, favorable_class, sensitive_attributes, privileged_classes, categorical_attributes, continuous_attributes, features_to_keep, na_values = None, header = 'infer', columns = None):
    """
    Load a CSV file and return a scaled, one-hot encoded DataFrame.

    Adapted from https://github.com/yzeng58/Improving-Fairness-via-Federated-Learning/blob/main/FedFB/DP_load_dataset.py
    (written for the Adult file; the upstream note says only binary sensitive
    attributes are supported).

    Processing steps, in order:
      1. read the file (the first row is skipped when `filename` ends with 'test'),
      2. keep only `features_to_keep`,
      3. one-hot encode `categorical_attributes`,
      4. min-max scale each of `continuous_attributes` to [0, 1],
      5. binarize `label_name` (1 where it equals `favorable_class`, else 0),
      6. encode the 'sex' column as ' Male' -> 0, ' Female' -> 1.

    Parameters
    ----------
    filename : str
        Path of the comma-delimited file to read.
    label_name : str
        Name of the label column (must be in `features_to_keep`).
    favorable_class : object
        Raw label value that is encoded as 1; every other value becomes 0.
    sensitive_attributes, privileged_classes : list
        Accepted for interface compatibility; not used by this function.
    categorical_attributes : list of str
        Columns to one-hot encode.
    continuous_attributes : list of str
        Numeric columns to min-max scale.
    features_to_keep : list of str
        Columns retained from the raw file (a 'sex' column is required).
    na_values : list, optional
        Extra strings to treat as missing; forwarded to `pd.read_csv`.
    header : int, list of int, 'infer' or None
        Forwarded to `pd.read_csv`. When None, `columns` supplies the names.
    columns : list of str, optional
        Column names assigned when `header` is None.

    Returns
    -------
    pandas.DataFrame
        The processed frame. Values of 'sex' other than ' Male'/' Female'
        become NaN because they are absent from the mapping.
    """
    skiprows = 1 if filename.endswith('test') else 0
    df = pd.read_csv(filename, delimiter = ',', header = header, na_values = na_values, skiprows = skiprows)
    # Identity check: `== None` is unidiomatic and misbehaves for array-like headers.
    if header is None: df.columns = columns
    df = df[features_to_keep]

    # apply one-hot encoding to convert the categorical attributes into vectors
    df = pd.get_dummies(df, columns = categorical_attributes)

    # normalize numerical attributes to the range within [0, 1]
    def scale(vec):
        # Series.min()/max() are vectorized and skip NaN, unlike the built-ins.
        minimum = vec.min()
        maximum = vec.max()
        span = maximum - minimum
        if span == 0:
            # Constant column: return zeros instead of 0/0 = NaN.
            return (vec - minimum).astype(float)
        return (vec - minimum) / span

    df[continuous_attributes] = df[continuous_attributes].apply(scale, axis = 0)

    # Binarize the label by direct comparison. Going through category codes
    # re-numbers the classes, so a column holding a single class (e.g. all
    # favorable) would be encoded as all zeros.
    df[label_name] = (df[label_name] == favorable_class).astype('int8')

    # Raw values keep the leading space left by the ', ' separator in the file.
    df['sex'] = df['sex'].map({' Male':0, ' Female':1}).astype('category')
    return df

In [4]:
# Adult
# Column roles passed to process_csv. The raw files are read with header=None,
# so `features_to_keep` doubles as the list of column names.
sensitive_attributes = ['sex']
categorical_attributes = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
continuous_attributes = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
features_to_keep = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
            'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss','hours-per-week', 
            'native-country', 'salary']
label_name = 'salary'

# The favorable label is passed as ' >50K' for the training file and as
# ' >50K.' (trailing period) for the test file.
adult = process_csv('adult.data', label_name, ' >50K', sensitive_attributes, [' Female'], categorical_attributes, continuous_attributes, features_to_keep, na_values = [], header = None, columns = features_to_keep)
test = process_csv('adult.test', label_name, ' >50K.', sensitive_attributes, [' Female'], categorical_attributes, continuous_attributes, features_to_keep, na_values = [], header = None, columns = features_to_keep) # the distribution is very different from training distribution

# Align the test frame to the training schema: same columns, same order.
# Any one-hot column that never occurs in the test split (such as
# 'native-country_ Holand-Netherlands') is created and filled with 0, rather
# than patching in a single hard-coded column name.
test = test.reindex(columns = adult.columns, fill_value = 0)

# Every column except the 'salary' label.
adult_num_features = len(adult.columns)-1



In [5]:
# Write both processed splits to disk without the row index.
# `index` is a boolean flag; pass False explicitly instead of relying on
# None being falsy.
adult.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [5]:
# Reload the processed training split from disk.
df = pd.read_csv('train.csv')
# Feature-only view of the frame: every column except the 'salary' label.
df1 = df.drop(columns='salary')

In [6]:
# Convert the full frame (the 'salary' column included) to a NumPy array.
a = df.to_numpy()

In [7]:
# (rows, columns) of the array built from `df`.
a.shape

(32561, 108)

In [8]:
# Inspect the label column loaded from train.csv.
df['salary']

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: salary, Length: 32561, dtype: int64

In [9]:
# List the one-hot columns generated for the 'race' attribute.
feature_names = df.drop(columns='salary').columns
race_columns = [name for name in feature_names if name.startswith('race')]
for name in race_columns:
    print(name)

race_ Amer-Indian-Eskimo
race_ Asian-Pac-Islander
race_ Black
race_ Other
race_ White


In [10]:
# Display the loaded frame (pandas abbreviates the rendering of large frames).
df

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary,workclass_ ?,workclass_ Federal-gov,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.301370,0.044302,0.800000,0,0.021740,0.0,0.397959,0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,0.452055,0.048238,0.800000,0,0.000000,0.0,0.122449,0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0.287671,0.138113,0.533333,0,0.000000,0.0,0.397959,0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,0.493151,0.151068,0.400000,0,0.000000,0.0,0.397959,0,False,False,...,False,False,False,False,False,False,False,True,False,False
4,0.150685,0.221488,0.800000,1,0.000000,0.0,0.397959,0,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.166404,0.733333,1,0.000000,0.0,0.377551,0,False,False,...,False,False,False,False,False,False,False,True,False,False
32557,0.315068,0.096500,0.533333,0,0.000000,0.0,0.397959,1,False,False,...,False,False,False,False,False,False,False,True,False,False
32558,0.561644,0.094827,0.533333,1,0.000000,0.0,0.397959,0,False,False,...,False,False,False,False,False,False,False,True,False,False
32559,0.068493,0.128499,0.533333,0,0.000000,0.0,0.193878,0,False,False,...,False,False,False,False,False,False,False,True,False,False


In [23]:
# Count rows whose combined code `race_ Black + 2 * sex` equals 2. With both
# columns restricted to 0/1 this selects sex == 1 and race_ Black == 0
# (process_csv maps ' Male' -> 0 and ' Female' -> 1).
# Series.sum() is vectorized, whereas the built-in sum() walks the Series one
# element at a time; int() keeps the displayed result a plain Python int.
int(((df['race_ Black'] + df['sex'] * 2) == 2).sum())

9216