#### This notebook prepares the Zindi dataset and performs the 5-fold split

After creating the 5 folds, we save each train/test split to its own CSV file.

In [1]:
%matplotlib inline
# Load all necessary packages
import os
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm
from warnings import warn
import pandas as pd
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric, BinaryLabelDatasetMetric
from aif360.algorithms.postprocessing.reject_option_classification\
        import RejectOptionClassification
from common_utils import compute_metrics

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'


## Huangrui's financial-inclusion-in-africa

In [2]:
class Zindi(StandardDataset):
    """Financial-inclusion-in-Africa (Zindi) dataset as a ``StandardDataset``.

    Reads ``Train.csv`` and ``Test.csv`` from ``data_dir``, stacks them into a
    single frame, renames and binary-encodes a few columns, drops the
    ``uniqueid`` column and every row containing a missing value, and hands the
    result to ``StandardDataset``.
    """

    def __init__(self, label_name='Y', favorable_classes=[1],
                 protected_attribute_names=['sex'],
                 privileged_classes=[[1]],
                 instance_weights_name=None,
                 categorical_features=["location_type","country","relationship_with_head","marital_status","education_level","job_type"],
                 features_to_drop=[],
                 features_to_keep=[],
                 na_values=[], custom_preprocessing=None,
                 metadata=None, data_dir='./Huangrui/zindi'):
        """Load and preprocess the CSV files, then build the dataset.

        See :obj:`StandardDataset` for a description of the arguments that are
        forwarded unchanged (everything except ``data_dir``).

        Args:
            data_dir: Directory containing ``Train.csv`` and ``Test.csv``.
                Defaults to the location the notebook originally hard-coded.
        """
        yes_no = {'Yes': 1, 'No': 0}

        train = pd.read_csv(os.path.join(data_dir, 'Train.csv'))
        test = pd.read_csv(os.path.join(data_dir, 'Test.csv'))

        df = (
            pd.concat([train, test], axis=0, ignore_index=True)
            .rename(columns={'gender_of_respondent': 'sex',
                             'age_of_respondent': 'age',
                             'bank_account': 'Y'})
            # Values outside each mapping become NaN and are removed by the
            # dropna() below.
            .assign(
                Y=lambda d: d['Y'].map(yes_no),
                sex=lambda d: d['sex'].map({"Male": 1, "Female": 0}),
                cellphone_access=lambda d: d['cellphone_access'].map(yes_no),
            )
            .drop(columns=['uniqueid'])
            # NOTE(review): presumably this also discards every Test.csv row if
            # that file carries no 'bank_account' label -- confirm.
            .dropna()
        )

        super(Zindi, self).__init__(
            df=df, label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop, na_values=na_values,
            custom_preprocessing=custom_preprocessing, metadata=metadata)



#### Load dataset and specify options

In [3]:
# Build the dataset with its default options ('sex' is the protected attribute).
dataset_orig = Zindi()

# Group definitions keyed on the protected attribute: 1 = privileged, 0 = unprivileged.
privileged_groups = [dict(sex=1)]
unprivileged_groups = [dict(sex=0)]

# Fairness metric to use (should be one of allowed_metrics).
metric_name = "Equal opportunity difference"

# Seed NumPy's global random number generator so runs are repeatable.
random_seed = 12345679
np.random.seed(random_seed)

# Lower and upper bound on the fairness metric used.
metric_lb, metric_ub = -0.05, 0.05

In [4]:
# Display the dataset's feature names; the output below shows categorical
# columns expanded into 'column=value' entries.
dataset_orig.feature_names

['year',
 'cellphone_access',
 'household_size',
 'age',
 'sex',
 'country=Kenya',
 'country=Rwanda',
 'country=Tanzania',
 'country=Uganda',
 'location_type=Rural',
 'location_type=Urban',
 'relationship_with_head=Child',
 'relationship_with_head=Head of Household',
 'relationship_with_head=Other non-relatives',
 'relationship_with_head=Other relative',
 'relationship_with_head=Parent',
 'relationship_with_head=Spouse',
 'marital_status=Divorced/Seperated',
 'marital_status=Dont know',
 'marital_status=Married/Living together',
 'marital_status=Single/Never Married',
 'marital_status=Widowed',
 'education_level=No formal education',
 'education_level=Other/Dont know/RTA',
 'education_level=Primary education',
 'education_level=Secondary education',
 'education_level=Tertiary education',
 'education_level=Vocational/Specialised training',
 'job_type=Dont Know/Refuse to answer',
 'job_type=Farming and Fishing',
 'job_type=Formally employed Government',
 'job_type=Formally employed Priva

In [5]:
# 5 fold cross validation
# 5-fold cross validation: split once, then rotate which fold is held out.
N_FOLDS = 5

# Per-row attributes that must be concatenated when folds are merged.
_ROW_ATTRIBUTES = (
    "features",
    "scores",
    "labels",
    "protected_attributes",
    "instance_weights",
    "instance_names",
)


def _merge_folds(folds):
    """Return one dataset holding the rows of ``folds``, in the given order.

    A copy of the first fold is used as the template; each attribute named in
    ``_ROW_ATTRIBUTES`` is replaced by the concatenation (axis 0) of that
    attribute across all folds, and ``metadata`` is set to a copy of the first
    fold's metadata.
    """
    merged = folds[0].copy()
    for attr in _ROW_ATTRIBUTES:
        stacked = np.concatenate([getattr(fold, attr) for fold in folds], axis=0)
        setattr(merged, attr, stacked)
    merged.metadata = folds[0].metadata.copy()
    return merged


Z = dataset_orig.split(N_FOLDS, shuffle=True, seed=random_seed)

train_sets, test_sets = [], []
for start in range(N_FOLDS):
    # Rotation starting at `start`: the first four folds train, the last tests.
    # start=0 -> train on Z[0..3], test on Z[4]; start=1 -> train on Z[1..4],
    # test on Z[0]; and so on.
    rotation = [Z[(start + offset) % N_FOLDS] for offset in range(N_FOLDS)]
    train_sets.append(_merge_folds(rotation[:-1]))
    test_sets.append(rotation[-1].copy())

# Keep the per-fold names that the rest of the notebook refers to.
(dataset_train1, dataset_train2, dataset_train3,
 dataset_train4, dataset_train5) = train_sets
(dataset_test1, dataset_test2, dataset_test3,
 dataset_test4, dataset_test5) = test_sets

#### Save the split dataset

In [6]:
# Write every train/test split to Huangrui/zindi/<protected_name>/ as CSV.
protected_name = "sex"
output_dir = "Huangrui/zindi/{}".format(protected_name)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

fold_pairs = [
    (dataset_train1, dataset_test1),
    (dataset_train2, dataset_test2),
    (dataset_train3, dataset_test3),
    (dataset_train4, dataset_test4),
    (dataset_train5, dataset_test5),
]

for fold_no, (train_ds, test_ds) in enumerate(fold_pairs, start=1):
    for split_name, split_ds in (("train", train_ds), ("test", test_ds)):
        csv_path = "{}/zindi_{}{}.csv".format(output_dir, split_name, fold_no)
        # convert_to_dataframe() returns a tuple; element 0 is the DataFrame.
        split_ds.convert_to_dataframe()[0].to_csv(csv_path, index=False)
