# Adding a new dataset

In [1]:
# Silence every warning for the rest of this session: both altair and sklearn
# emit lots of them, and they are distracting in a demo setting.
# The filter is installed before altair is imported.

import warnings
warnings.filterwarnings("ignore")

import altair as alt
# Ask Altair to produce output that works on Jupyter Notebook
# (the value returned by enable() is echoed as the cell output).
alt.renderers.enable('notebook')

RendererRegistry.enable('notebook')

In [11]:
import fairness
import fairness.benchmark
import fairness.preprocess
import pandas as pd

In [3]:
from fairness.data.objects.Data import Data

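# Reference example, kept commented out. Note that it calls passing_grade,
# which is not defined in the cells shown here.
#class Ricci(Data):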
#
#    def __init__(self):
#        Data.__init__(self)
#        self.dataset_name = 'ricci'
#        # Class attribute will not be created until data_specific_processing is run.
#        self.class_attr = 'Class'
#        self.positive_class_val = 1
#        self.sensitive_attrs = ['Race'] 
#        self.privileged_class_names = ['W']
#        self.categorical_features = [ 'Position' ]
#        self.features_to_keep = [ 'Position', 'Oral', 'Written', 'Race', 'Combine' ]
#        self.missing_val_indicators = []
#
#    def data_specific_processing(self, dataframe):
#        dataframe['Class'] = dataframe.apply(passing_grade, axis=1)
#        return dataframe
#
#    def handle_missing_data(self, dataframe):
#        return dataframe

# Data source: UCI Machine Learning Repository, Heart Disease data set.
# See https://archive.ics.uci.edu/ml/datasets/heart+Disease

class HeartDisease(Data):
    """Dataset description for the processed heart-disease CSV.

    Fills in the configuration attributes of ``Data``: ``num`` is the class
    label (positive value ``1``) and ``sex`` is the sensitive attribute,
    with ``1`` as the privileged value.
    """

    def __init__(self):
        """Initialise the base class, then set this dataset's configuration."""
        super().__init__()

        # Name under which the dataset is referred to elsewhere in the notebook.
        self.dataset_name = "heart-disease"

        # Prediction target and the value treated as the positive outcome.
        self.class_attr = "num"
        self.positive_class_val = 1

        # Protected attribute and the value regarded as privileged.
        self.sensitive_attrs = ["sex"]
        self.privileged_class_names = [1]

        # Columns to be treated as categorical.
        self.categorical_features = [
            "cp",
            "fbs",
            "restecg",
            "exang",
            "slope",
            "thal",
        ]

        # Columns retained from the raw file (includes the class attribute).
        self.features_to_keep = [
            "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
            "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num",
        ]

        # No placeholder values are declared as missing-data markers.
        self.missing_val_indicators = []

    def get_raw_filename(self):
        """Return the raw CSV path, relative to the current working directory."""
        return "./heart-disease-processed.csv"

# Register an instance with the fairness package; later cells refer to the
# dataset by its name, "heart-disease".
fairness.add_dataset(HeartDisease())

In [4]:
# Preprocess the dataset registered above. In the recorded run this printed
# class / sensitive-attribute balance statistics and wrote four CSV variants
# (original, numerical, numerical-binsensitive, categorical-binsensitive).
fairness.preprocess.prepare_data(["heart-disease"])

--- Processing dataset: heart-disease ---

-------------------
Balance statistics:

Class:
num
0    160
1    137
dtype: int64

Sensitive Attribute:
sex
0     96
1    201
dtype: int64




Writing data to: data\preprocessed\heart-disease_original.csv
Writing data to: data\preprocessed\heart-disease_numerical.csv
Writing data to: data\preprocessed\heart-disease_numerical-binsensitive.csv
Writing data to: data\preprocessed\heart-disease_categorical-binsensitive.csv


In [17]:
# Benchmark five algorithms on the heart-disease dataset.
# NOTE: in the recorded run ZafarFairness failed with FileNotFoundError raised
# from subprocess.run inside ZafarAlgorithm.py, i.e. the external command it
# launches was not found on this machine; the remaining algorithms still ran
# and results were written.
fairness.benchmark.run(algorithm=["SVM", "DecisionTree", "ZafarFairness", "Feldman-DecisionTree", "Feldman-SVM"], 
                       dataset=["heart-disease"])

Datasets: '['heart-disease']'

Evaluating dataset:heart-disease
Sensitive attribute:sex
    Algorithm: SVM
       supported types: {'numerical', 'numerical-binsensitive'}
    Algorithm: DecisionTree
       supported types: {'numerical', 'numerical-binsensitive'}
    Algorithm: ZafarFairness
       supported types: {'numerical-binsensitive'}


Traceback (most recent call last):
  File "C:\Anaconda36\lib\site-packages\fairness\benchmark.py", line 67, in run
    all_sensitive_attributes, sensitive, supported_tag)
  File "C:\Anaconda36\lib\site-packages\fairness\benchmark.py", line 108, in run_eval_alg
    privileged_vals, positive_val)
  File "C:\Anaconda36\lib\site-packages\fairness\benchmark.py", line 146, in run_alg
    single_sensitive, privileged_vals, params)
  File "C:\Anaconda36\lib\site-packages\fairness\algorithms\zafar\ZafarAlgorithm.py", line 50, in run
    result = subprocess.run(cmd,
  File "C:\Anaconda36\lib\subprocess.py", line 403, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Anaconda36\lib\subprocess.py", line 709, in __init__
    restore_signals, start_new_session)
  File "C:\Anaconda36\lib\subprocess.py", line 997, in _execute_child
    startupinfo)
FileNotFoundError: [WinError 2] The system cannot find the file specified
Failed: [WinError 2] The system cannot find the file specified
T

    Algorithm: Feldman-SVM
       supported types: {'numerical', 'numerical-binsensitive'}
    Algorithm: Feldman-DecisionTree
       supported types: {'numerical', 'numerical-binsensitive'}
Results written to:
    C:\Users\xgitiaux\.fairness\results\heart-disease_sex_numerical.csv
    C:\Users\xgitiaux\.fairness\results\heart-disease_sex_numerical-binsensitive.csv


In [12]:
# Load the benchmark results CSV straight from disk and preview the first rows.
# The path is written as a raw string because it contains "\.", which is not a
# valid escape sequence in an ordinary string literal (Python warns about it
# and plans to reject it). The resulting path is character-for-character the
# same as before.
# NOTE: this absolute path is specific to the machine that produced the run.
d = pd.read_csv(r'C:\Users\xgitiaux\.fairness\results\heart-disease_sex_numerical-binsensitive.csv', encoding='latin-1')
d.head(10)

Unnamed: 0,algorithm,params,run-id,0-FNR,0-FNR_over_1-FNR,0-FPR,0-FPR_over_1-FPR,0-TNR,0-TNR_over_1-TNR,0-TPR,...,sex-TPRRatio,sex-accuracy,sex-accuracyDiff,sex-accuracyRatio,sex-calibration+,sex-calibration+Diff,sex-calibration+Ratio,sex-calibration-,sex-calibration-Diff,sex-calibration-Ratio
0,SVM,,0,1.0,1.066667,0.0625,0.583333,0.9375,1.05,0.0,...,1.0,0.609615,1.319231,1.709402,0.2,0.6,1.0,0.367322,0.643735,0.346847
1,SVM,,1,1.0,1.085714,0.0,1.0,1.0,1.0,0.0,...,1.0,0.613615,1.256641,1.528837,1.0,1.0,1.0,0.398263,0.719603,0.479263
2,SVM,,2,1.0,1.052632,0.095238,2.952381,0.904762,0.934921,0.0,...,1.0,0.564638,1.227867,1.50558,0.333333,0.333333,1.0,0.414027,0.710407,0.481781
3,SVM,,3,1.0,1.0,0.0,0.0,1.0,1.043478,0.0,...,1.0,0.507415,1.235169,1.603261,0.5,2.0,1.0,0.489224,0.771552,0.621429
4,SVM,,4,1.0,1.085714,0.04,0.0,0.96,0.96,0.0,...,1.0,0.58371,1.244344,1.529412,0.5,1.0,1.0,0.418622,0.708211,0.483117
5,SVM,,5,1.0,1.133333,0.074074,2.222222,0.925926,0.957854,0.0,...,1.0,0.614955,1.198661,1.385281,0.4,0.2,1.0,0.375449,0.73395,0.476768
6,SVM,,6,1.0,1.051282,0.0,1.0,1.0,1.0,0.0,...,1.0,0.600332,1.347723,1.81535,1.0,1.0,1.0,0.408358,0.634897,0.382134
7,SVM,,7,1.0,1.04878,0.058824,0.0,0.941176,0.941176,0.0,...,1.0,0.52687,1.177028,1.403846,0.5,1.0,1.0,0.468732,0.782535,0.623415
8,SVM,,8,1.0,1.083333,0.0,1.0,1.0,1.0,0.0,...,1.0,0.693665,1.495023,2.109533,1.0,1.0,1.0,0.319734,0.478178,0.101307
9,SVM,,9,1.0,1.023256,0.0,1.0,1.0,1.0,0.0,...,1.0,0.521739,1.289855,1.769231,1.0,1.0,1.0,0.482843,0.70098,0.527132


In [6]:
# Look the dataset up by name and ask it for the results frame of sensitive
# attribute "sex" in the "numerical-binsensitive" variant.
hd = fairness.get_dataset_by_name("heart-disease").get_results_data_frame("sex", "numerical-binsensitive")
# So many measures!
hd.head(10)

Unnamed: 0,algorithm,params,run-id,0-FNR,0-FNR_over_1-FNR,0-FPR,0-FPR_over_1-FPR,0-TNR,0-TNR_over_1-TNR,0-TPR,...,sex-TPRRatio,sex-accuracy,sex-accuracyDiff,sex-accuracyRatio,sex-calibration+,sex-calibration+Diff,sex-calibration+Ratio,sex-calibration-,sex-calibration-Diff,sex-calibration-Ratio
0,SVM,,0,1.0,1.027027,0.0625,0.0,0.9375,0.9375,0.0,...,1.0,0.600649,1.162338,1.3125,0.5,1.0,1.0,0.386278,0.798872,0.586873
1,SVM,,1,1.0,1.058824,0.04,0.0,0.96,0.96,0.0,...,1.0,0.653623,1.292754,1.577143,0.5,1.0,1.0,0.339938,0.664951,0.339757
2,SVM,,2,1.0,1.151515,0.043478,0.0,0.956522,0.956522,0.0,...,1.0,0.627536,1.211594,1.405556,0.5,1.0,1.0,0.378502,0.725754,0.46813
3,SVM,,3,1.0,1.034483,0.037037,1.148148,0.962963,0.995062,0.0,...,1.0,0.596204,1.176014,1.34635,0.25,0.5,1.0,0.394411,0.805772,0.604846
4,SVM,,4,1.0,1.111111,0.074074,1.851852,0.925926,0.964506,0.0,...,1.0,0.583032,1.304525,1.706933,0.4,0.2,1.0,0.409375,0.61875,0.364583
5,SVM,,5,1.0,1.051282,0.052632,1.473684,0.947368,0.982456,0.0,...,1.0,0.510145,1.17971,1.427586,0.333333,0.333333,1.0,0.48511,0.788401,0.64191
6,SVM,,6,1.0,1.02381,0.0,0.0,1.0,1.038462,0.0,...,1.0,0.589409,1.407389,2.056194,0.75,1.5,2.0,0.412272,0.589249,0.334975
7,SVM,,7,1.0,1.105263,0.0,1.0,1.0,1.0,0.0,...,1.0,0.58,1.173333,1.351351,1.0,1.0,1.0,0.434272,0.798122,0.622807
8,SVM,,8,0.0,0.0,0.962963,0.990476,0.037037,1.333333,1.0,...,1.035714,0.340724,0.78914,0.527383,0.328283,0.767677,0.477273,0.25,0.5,1.0
9,SVM,,9,0.0,0.0,0.96,0.985946,0.04,1.52,1.0,...,1.032258,0.314778,0.715271,0.377155,0.29937,0.686975,0.313364,0.25,0.5,1.0


In [None]:
# Copy the "sex-accuracy" measure into a column whose name uses an underscore
# instead of a hyphen; the chart below reads that copy. This adds a column to
# hd in place.
hd["sex_accuracy"] = hd["sex-accuracy"]

# Point chart: 'CV' on the x axis, accuracy on the y axis, coloured by algorithm.
# NOTE(review): 'CV' is not among the columns visible in the truncated result
# tables above — confirm that hd actually has a column with that name.
alt.Chart(hd).mark_point().encode(
    x='CV',
    y='sex_accuracy',
    color='algorithm'
)