In [1]:
import tensorflow_model_analysis as tfma
import tensorflow_data_validation as tfdv

from google.protobuf import text_format

from ucimlrepo import fetch_ucirepo
import os
import numpy as np
import pandas as pd

In [2]:
raw_df=fetch_ucirepo(id=2).data.original

In [3]:
df=raw_df.copy()
df["income"]=df["income"].replace(['<=50K.','>50K.'],['<=50K','>50K'])
df=df.replace(['?'],[np.nan])

In [4]:
stats = tfdv.generate_statistics_from_dataframe(df)
tfdv.visualize_statistics(stats)

From tensorflow data validation, we can see the more than 90% of feature **capital-gain** and **capital-loss** are 0. It can be safely assumed that they are redundant features and can be ignored during training process 

In [5]:
df=df.drop(["capital-gain","capital-loss"],axis=1)

In [6]:
numeric_feautre_names=df.select_dtypes(include=["number"]).columns
categorical_feature_names=df.select_dtypes(include=["object_"]).columns

In [7]:
pre_df=df.copy()

def min_max_noramlize(column):
    return (column-column.min())/(column.max()-column.min())

for nc in numeric_feautre_names:
    pre_df[nc]=min_max_noramlize(pre_df[nc])    

In [8]:
for cc in categorical_feature_names:
    pre_df[cc]=pd.factorize(pre_df[cc])[0]

In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(pre_df, test_size=0.2)

In [10]:
from sklearn.ensemble import RandomForestClassifier

classifer=RandomForestClassifier()
classifer.fit(train.loc[:,train.columns!="income"],train["income"])

RandomForestClassifier()

In [11]:
pred=pd.Series(classifer.predict(test.loc[:,test.columns!="income"]),index=test.index,name="pred")

In [12]:
result=pd.concat([df.loc[test.index,["sex"]],test["income"],pred],axis=1)
result.columns=["gender","label","prediction"]

In [13]:
result

Unnamed: 0,gender,label,prediction
18056,Male,0,0
35318,Male,1,1
26925,Male,0,0
40178,Female,0,0
30326,Male,0,1
...,...,...,...
36348,Male,0,0
17083,Female,0,0
26175,Male,0,0
16425,Female,0,0


In [14]:
output=os.path.abspath("bin")

output

'c:\\Users\\15783\\source\\repos\\Ethical-Framework\\Fairness Indicator\\bin'

In [15]:
file=open("eval.config","r")
config=file.read()
file.close()

In [16]:
import shutil


# Specify Fairness Indicators in eval_config.
eval_config = text_format.Parse(config, tfma.EvalConfig())

# Run TensorFlow Model Analysis.
try:
  shutil.rmtree(output)
except OSError:
  pass
eval_result = tfma.analyze_raw_data(
  data=result,
  eval_config=eval_config,
  output_path=output)






Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [17]:
tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)

FairnessIndicatorViewer(slicingMetrics=[{'sliceValue': 'Male', 'slice': 'gender:Male', 'metrics': {'accuracy':…

In [18]:
column=[]
rows=[]
index=[]
for sn in eval_result.get_slice_names():
    if "Male" in str(sn):
        index.append("male")
    else:
        index.append("female")
    row=[]
    for key,item in eval_result.get_metrics_for_slice(sn).items():
        if key=="example_count":
            continue
        key=key.replace("fairness_indicators_metrics/","")
        key=key.replace("@0.5","")
        if key not in column:
            column.append(key)
        row.append(item["doubleValue"])
    rows.append(row)

FI_result=pd.DataFrame(rows,columns=column,index=index)

In [19]:
FI_result

Unnamed: 0,accuracy,false_positive_rate,false_negative_rate,true_positive_rate,true_negative_rate,positive_rate,negative_rate,false_discovery_rate,false_omission_rate,precision,recall
male,0.785281,0.129501,0.413131,0.586869,0.870499,0.26692,0.73308,0.339397,0.169323,0.660603,0.586869
female,0.910349,0.028612,0.535248,0.464752,0.971388,0.081158,0.918842,0.310078,0.070181,0.689922,0.464752


In [20]:
FI_difference=FI_result.loc["male"]-FI_result.loc["female"]

final_result=pd.DataFrame([FI_difference],columns=column,index=["Random Forest"])
final_result.columns.name="Adult"

In [21]:
final_result

Adult,accuracy,false_positive_rate,false_negative_rate,true_positive_rate,true_negative_rate,positive_rate,negative_rate,false_discovery_rate,false_omission_rate,precision,recall
Random Forest,-0.125068,0.100889,-0.122117,0.122117,-0.100889,0.185762,-0.185762,0.02932,0.099142,-0.02932,0.122117
