In [1]:
import os

import numpy as np
import pandas as pd
from IPython.display import Image

import ipynb.fs.full.provenance as pr
import ipynb.fs.full.provenance_lib as pr_lib

# Output location: later cells pass savepath to pr.Provenance and prepend it
# to the name of the exported CSV file.
savepath = 'results/GermanCredit_prov/'

In [2]:
# The raw dataset is fetched from the fairCorrect GitHub repository
url = 'https://raw.githubusercontent.com/vladoxNCL/fairCorrect/master/Datasets/'
df = pd.read_csv(f'{url}german.csv', header=None)

# Report the dimensions of the input frame
m, n = df.shape
print("IN - Rows: ", m, " Features: ", n)

# The CSV is read with header=None, so the column names are assigned here
column_names = [
    'checking', 'duration', 'credit_history', 'purpose', 'credit_amount',
    'savings', 'employment', 'inst_rate', 'personal_status', 'other_debtors',
    'residence_time', 'property', 'age', 'other_inst', 'housing', 'num_credits',
    'job', 'dependants', 'phone', 'foreigner', 'label',
]
df.columns = column_names

# Optional: shrink the frame for quick experiments before provenance tracking
#df = df[:3]
#df = df[['checking','personal_status']]

# Create a new provenance document and input entities
# (the provenance_lib module exposes the same constructor as an alternative)
p = pr.Provenance(df, savepath)
#p = pr_lib.Provenance(df, savepath)

IN - Rows:  1000  Features:  21


In [3]:
# Turn cryptic category codes into interpretable values.
# Outer keys are column names; inner dicts map each original value to its
# replacement. Values that are not listed are left unchanged.
value_map = {
    'checking': {
        'A11': 'check_low', 'A12': 'check_mid', 'A13': 'check_high',
        'A14': 'check_none',
    },
    'credit_history': {
        'A30': 'debt_none', 'A31': 'debt_noneBank',
        'A32': 'debt_onSchedule', 'A33': 'debt_delay',
        'A34': 'debt_critical',
    },
    'purpose': {
        'A40': 'pur_newCar', 'A41': 'pur_usedCar',
        'A42': 'pur_furniture', 'A43': 'pur_tv',
        'A44': 'pur_appliance', 'A45': 'pur_repairs',
        'A46': 'pur_education', 'A47': 'pur_vacation',
        'A48': 'pur_retraining', 'A49': 'pur_business',
        'A410': 'pur_other',
    },
    'savings': {
        'A61': 'sav_small', 'A62': 'sav_medium', 'A63': 'sav_large',
        'A64': 'sav_xlarge', 'A65': 'sav_none',
    },
    'employment': {
        'A71': 'emp_unemployed', 'A72': 'emp_lessOne',
        'A73': 'emp_lessFour', 'A74': 'emp_lessSeven',
        'A75': 'emp_moreSeven',
    },
    'other_debtors': {
        'A101': 'debtor_none', 'A102': 'debtor_coApp',
        'A103': 'debtor_guarantor',
    },
    'property': {
        'A121': 'prop_realEstate', 'A122': 'prop_agreement',
        'A123': 'prop_car', 'A124': 'prop_none',
    },
    'other_inst': {'A141': 'oi_bank', 'A142': 'oi_stores', 'A143': 'oi_none'},
    'housing': {'A151': 'hous_rent', 'A152': 'hous_own', 'A153': 'hous_free'},
    'job': {
        'A171': 'job_unskilledNR', 'A172': 'job_unskilledR',
        'A173': 'job_skilled', 'A174': 'job_highSkill',
    },
    # Binary columns are recoded to 0/1
    'phone': {'A191': 0, 'A192': 1},
    'foreigner': {'A201': 1, 'A202': 0},
    # Label value 2 becomes 0; other label values are kept as they are
    'label': {2: 0},
}
df = df.replace(value_map)

# The recoded columns are exactly the keys of the mapping, in insertion order
col = list(value_map)

# Hand the recoded columns to the provenance tracker
d = p.get_prov_feature_transformation(df, col)

get_prov_feature_transformation function took 6120.035 ms


In [4]:
# Derive a readable marital status from the personal_status code:
# A91/A92 -> 'divorced', A93/A95 -> 'single', anything else -> 'married'
personal = df['personal_status']
is_divorced = personal.isin(['A91', 'A92'])
is_single = personal.isin(['A93', 'A95'])
df['status'] = np.where(is_divorced, 'divorced',
                        np.where(is_single, 'single', 'married'))

# Derive a binary gender flag: codes A92 and A95 -> 0, every other code -> 1
# (presumably 0 = female per the German Credit codebook - verify)
df['gender'] = np.where(personal.isin(['A92', 'A95']), 0, 1)

# Register the columns derived from personal_status with the provenance tracker
d = p.get_prov_space_transformation(df, ['personal_status'])

get_prov_space_transformation function took 1072.683 ms


In [5]:
# Remove the personal_status column from the frame
df = df.drop(columns=['personal_status'])

# Report the reduced frame to the provenance tracker
d = p.get_prov_dim_reduction(df)

get_prov_dim_reduction function took 240.753 ms


In [6]:
# One-hot encode categorical columns
col = ['checking', 'credit_history', 'purpose', 'savings', 'employment', 'other_debtors', 'property',
       'other_inst', 'housing', 'job', 'status']

# Maps each encoded column to the list of distinct values it contained
# (in order of first appearance), i.e. the names of the indicator columns
# generated for it.
onehot_col_map = {}
for c in col:
    onehot_col_map[c] = df[c].unique().tolist()

    # Pin the indicator dtype: pandas >= 2.0 defaults to bool, which would put
    # True/False instead of 0/1 into the exported CSV. uint8 is the default of
    # older pandas versions.
    df_dummies = pd.get_dummies(df[c], dtype=np.uint8)

    # Append the indicator columns, then remove the original categorical column
    df = pd.concat((df, df_dummies), axis=1)
    df = df.drop(columns=[c])

# Pass the encoded frame and the column/value map to the provenance tracker
d = p.get_prov_onehot_encode(df, col, onehot_col_map)

get_prov_onehot_encode function took 22255.401 ms


In [7]:
# Save the cleaned, one-hot encoded dataset as CSV.
out_file = savepath + 'german_onehot.csv'

# Create the target directory if it is missing so to_csv does not fail
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(out_file, index=False)

# Uncomment the lines below to also save the provenance graph and display it
#namefile = savepath + 'GermanCredit_prov'
#p.save_all_graph(namefile)
#Image(namefile + '.png')