## This notebook demonstrates how to remove the bias during "Pre-processing stage" using AI 360 fairness toolkit by adjusting the weights

### Pre-processing algorithm
A bias mitigation algorithm that is applied to training data.

### Insert your credentials as credentials in the below cell
Click on dropdown from Pipeline_LabelEncoder-0.1.zip under Data tab and select 'Credentials'

In [1]:
# @hidden_cell
# The following code contains the credentials for a file in your IBM Cloud Object Storage.
# You might want to remove those credentials before you share your notebook.
credentials = {

}


In [2]:
from ibm_botocore.client import Config
import ibm_boto3

cos = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=credentials['IBM_API_KEY_ID'],
    ibm_service_instance_id=credentials['IAM_SERVICE_ID'],
    ibm_auth_endpoint=credentials['IBM_AUTH_ENDPOINT'],
    config=Config(signature_version='oauth'),
    endpoint_url=credentials['ENDPOINT'])

In [3]:
import os
os.getcwd()

'/home/wsuser/work'

In [4]:
cos.download_file(Bucket=credentials['BUCKET'],Key='Pipeline_LabelEncoder-0.1.zip',Filename='/home/wsuser/work/Pipeline_LabelEncoder-0.1.zip')

In [5]:
!ls

Pipeline_LabelEncoder-0.1.zip


In [6]:
!pip install Pipeline_LabelEncoder-0.1.zip
!pip install aif360

Processing ./Pipeline_LabelEncoder-0.1.zip
Building wheels for collected packages: Pipeline-LabelEncoder
  Building wheel for Pipeline-LabelEncoder (setup.py) ... [?25ldone
[?25h  Created wheel for Pipeline-LabelEncoder: filename=Pipeline_LabelEncoder-0.1-py3-none-any.whl size=2062 sha256=eb9fb24f88e94829b3981c18a5729e6e1ace2c094aa0fb6b85551ccbf8052fef
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/a1/1a/b1/66d8f1917ec5b09eb70adf911c60dec54820888fd7cb9941ad
Successfully built Pipeline-LabelEncoder
Installing collected packages: Pipeline-LabelEncoder
  Attempting uninstall: Pipeline-LabelEncoder
    Found existing installation: Pipeline-LabelEncoder 0.1
    Uninstalling Pipeline-LabelEncoder-0.1:
      Successfully uninstalled Pipeline-LabelEncoder-0.1
Successfully installed Pipeline-LabelEncoder-0.1


In [7]:
import tensorflow as tf
tf.__version__

'1.15.4'

In [8]:
!pip install 'tensorflow>=1.13.1,< 2'



In [9]:
import pandas as pd
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset
from IPython.display import Markdown, display
from aif360.algorithms.preprocessing.reweighing import Reweighing
import warnings
warnings.filterwarnings("ignore")

pip install 'aif360[LFR]'


### Insert the data as Pandas Dataframe and change the name from df_data_ to df

In [10]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_943f0d0348cb4b5fb50f9c338e8d8cc1 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='mecmxiDlkkNBAx_xwFoWofXDFgfjwrYliwcdLryqmvu7',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_943f0d0348cb4b5fb50f9c338e8d8cc1.get_object(Bucket='ai360series-donotdelete-pr-1zay1c1uizeatc',Key='fraud_data.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df = pd.read_csv(body)
df.head()


Unnamed: 0,Gender,Married,Education,Fraud_risk
0,Male,No,1,Risk
1,Male,Yes,1,Safe
2,Male,Yes,1,Safe
3,Male,Yes,0,Safe
4,Male,No,1,Risk


In [11]:
df.describe(include = 'all')

Unnamed: 0,Gender,Married,Education,Fraud_risk
count,921,921,921.0,921
unique,2,2,,2
top,Male,No,,Safe
freq,703,501,,562
mean,,,0.730727,
std,,,0.443823,
min,,,0.0,
25%,,,0.0,
50%,,,1.0,
75%,,,1.0,


In [12]:
privileged_groups = [{'Gender': 1}]
unprivileged_groups = [{'Gender': 0}]
favorable_label = 1 
unfavorable_label = 0

In [13]:
from sklearn import preprocessing
categorical_column = ['Gender', 'Married', 'Fraud_risk']

data_encoded = df.copy(deep=True)
#Use Scikit-learn label encoding to encode character data
lab_enc = preprocessing.LabelEncoder()
for col in categorical_column:
        data_encoded[col] = lab_enc.fit_transform(df[col])
        le_name_mapping = dict(zip(lab_enc.classes_, lab_enc.transform(lab_enc.classes_)))
        print('Feature', col)
        print('mapping', le_name_mapping)
        

data_encoded.head()

Feature Gender
mapping {'Female': 0, 'Male': 1}
Feature Married
mapping {'No': 0, 'Yes': 1}
Feature Fraud_risk
mapping {'Risk': 0, 'Safe': 1}


Unnamed: 0,Gender,Married,Education,Fraud_risk
0,1,0,1,0
1,1,1,1,1
2,1,1,1,1
3,1,1,0,1
4,1,0,1,0


In [14]:
from Pipeline_LabelEncoder.sklearn_label_encoder import PipelineLabelEncoder
preprocessed_data = PipelineLabelEncoder(columns = ['Gender','Married', 'Fraud_risk']).fit_transform(data_encoded)
print('-------------------------')
#print('validation data encoding')
#validation_enc_data = PipelineLabelEncoder(columns = ['Gender','Married', 'Fraud_risk']).transform(validation_input_data)

Inside fit transform
Feature Gender
mapping {0: 0, 1: 1}
Feature Married
mapping {0: 0, 1: 1}
Feature Fraud_risk
mapping {0: 0, 1: 1}
-------------------------


In [15]:
#Create binary label dataset that can be used by bias mitigation algorithms
fraud_dataset = BinaryLabelDataset(favorable_label=favorable_label,
                                unfavorable_label=unfavorable_label,
                                df=preprocessed_data,
                                label_names=['Fraud_risk'],
                                protected_attribute_names=['Gender', 'Married'],
                                unprivileged_protected_attributes=unprivileged_groups)

In [16]:
display(Markdown("#### Training Data Details"))
print("shape of the training dataset", fraud_dataset.features.shape)
print("Training data favorable label", fraud_dataset.favorable_label)
print("Training data unfavorable label", fraud_dataset.unfavorable_label)
print("Training data protected attribute", fraud_dataset.protected_attribute_names)
print("Training data privileged protected attribute (1:Male and 0:Female)", 
      fraud_dataset.privileged_protected_attributes)
print("Training data unprivileged protected attribute (1:Male and 0:Female)",
      fraud_dataset.unprivileged_protected_attributes)

#### Training Data Details

shape of the training dataset (921, 3)
Training data favorable label 1.0
Training data unfavorable label 0.0
Training data protected attribute ['Gender', 'Married']
Training data privileged protected attribute (1:Male and 0:Female) [array([1.]), array([1.])]
Training data unprivileged protected attribute (1:Male and 0:Female) [array([0.]), array([0.])]


In [17]:
metric_orig_train = BinaryLabelDatasetMetric(fraud_dataset, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % 
      metric_orig_train.mean_difference())

Difference in mean outcomes between unprivileged and privileged groups = -0.372747


In [18]:
RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)
RW.fit(fraud_dataset)
train_tf_dataset = RW.transform(fraud_dataset)

In [19]:
train_tf_dataset.labels

array([[0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

In [20]:
metric_orig_train = BinaryLabelDatasetMetric(train_tf_dataset, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)

print("Difference in mean outcomes between unprivileged and privileged groups = %f"
      % metric_orig_train.mean_difference())

Difference in mean outcomes between unprivileged and privileged groups = 0.000000


## There's no unfair advantage between unprivileged and privileged groups