In [125]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import pandas as pd
import sqlite3
import pickle
import os
import sys

import warnings
import shap
warnings.filterwarnings("ignore")

import traceback
import sklearn

# SHAP

As we are not supposed to know what model is running in the backend, we can only use KernelSHAP in this example

The API server takes in a data file.

Our user provides us the data file. SHAP requires the background data (2nd parameter in KernelExplainer) to be an array format.

- KernelExplainer will convert the 2nd parameter into numpy.ndarray
- This means that if the user uploads a dataframe storing pandas df, then we have to do these
    - extract the feature names
    - save the given data, and reload the data again, transform it to pandas dataframe and use the feature names as the column
    
* means that `get_predictions` need to be customized based on the model API server...
    - or our tester need to build according to our API specs for the testing

## Via File

In [227]:
import requests
import json
import numpy 

data = pickle.load(open("../data/pickle_scikit_lr_compas_xtest_pkl.sav", "rb")).sample(10)
background = pickle.load(open("../data/pickle_scikit_lr_compas_xtrain_pkl.sav", "rb")).sample(10)
background.columns

api_url = "http://localhost:5000/predict"
column_name = background.columns # this is a hack also

def get_predictions(background):
    """SHAP requires the second parameter to be already in a data form"""
    print(background)
    # background is converted into numpy by SHAP
    location = "../data/pickle_scikit_lr_compas_xtrain_pkl.sav.tmp"
    f = open(location, "wb+")
    
    # convert to Pandas, cause that's what the API server can read
    
    if type(background) == numpy.ndarray:
        df = pd.DataFrame(background, columns=column_name)
        pickle.dump(df, f)
        f.close()
    else:
        # otherwise we can just use it directly
        pickle.dump(background, f)
        f.close()
    
    files = {'file': open(location, "rb")}
    
    r = requests.post(api_url, files=files)
    
    print(r)
    predictions = json.loads(r.content)
    
    
    return numpy.array(predictions["response"])

predictions = get_predictions(background)
print(predictions)

         id              name     first       last compas_screening_date  \
7153  10920   bradley ecklund   bradley    ecklund            2013-01-11   
4341   6621  andres penalopez    andres  penalopez            2013-09-22   
1363   2093        deanna dye    deanna        dye            2013-07-29   
4526   6916      carl fleming      carl    fleming            2013-02-23   
5592   8520    alain williams     alain   williams            2014-04-24   
262     384      calvin ellis    calvin      ellis            2014-04-01   
2433   3709      tesa edwards      tesa    edwards            2013-04-14   
1035   1590       mark harden      mark     harden            2014-03-27   
3247   4964     lemy williams      lemy   williams            2014-04-01   
4130   6301    epifania roman  epifania      roman            2013-08-24   

         sex         dob  age       age_cat              race  ...  \
7153    Male  1981-06-23   34       25 - 45         Caucasian  ...   
4341    Male  1988-06-1

In [228]:
explainer = shap.KernelExplainer(get_predictions, background)
print(explainer.shap_values(data))

[[10920 'bradley ecklund' 'bradley' 'ecklund' '2013-01-11' 'Male'
  '1981-06-23' 34 '25 - 45' 'Caucasian' 0 5 0 0 4 0.0
  '2013-01-11 01:39:37' '2013-01-12 03:08:33' '13000503CF10A'
  '2013-01-10' nan 1.0 'F' 'Pos Cannabis W/Intent Sel/Del' 0 nan nan nan
  nan nan nan nan nan 0 nan nan nan nan 'Risk of Recidivism' 5 'Medium'
  '2013-01-11' 'Risk of Violence' 5 'Medium' '2013-01-11' '2015-07-23'
  '2015-07-23' 4 1 923 0]
 [6621 'andres penalopez' 'andres' 'penalopez' '2013-09-22' 'Male'
  '1988-06-15' 27 '25 - 45' 'Hispanic' 0 2 0 0 0 0.0
  '2013-09-22 03:53:21' '2013-09-22 08:00:54' '13013359CF10A'
  '2013-09-21' nan 1.0 'F' 'Grand Theft in the 3rd Degree' 1
  '14014299CF10A' '(F3)' 0.0 '2014-10-23'
  'Use of Anti-Shoplifting Device' '2014-10-23' '2014-10-24' nan 0 nan
  nan nan nan 'Risk of Recidivism' 2 'Low' '2013-09-22'
  'Risk of Violence' 3 'Low' '2013-09-22' '2014-10-23' '2014-10-24' 0 0
  396 1]
 [2093 'deanna dye' 'deanna' 'dye' '2013-07-29' 'Female' '1983-01-02' 33
  '25 - 45

  0%|                                         | 0/10 [00:00<?, ?it/s]

[[7015 'daniel rosario' 'daniel' 'rosario' '2013-03-09' 'Male'
  '1992-12-08' 23 'Less than 25' 'Caucasian' 0 2 0 0 0 0.0
  '2013-03-09 05:57:11' '2013-03-10 07:02:54' '13003495CF10A'
  '2013-03-09' nan 0.0 'F' 'Poss Unlaw Issue Id' 0 nan nan nan nan nan
  nan nan nan 0 nan nan nan nan 'Risk of Recidivism' 2 'Low' '2013-03-09'
  'Risk of Violence' 5 'Medium' '2013-03-09' '2013-03-09' '2013-03-10' 0
  1 1119 0]]
<Response [200]>
[[7015 'bradley ecklund' 'bradley' ... 1 923 0]
 [7015 'andres penalopez' 'andres' ... 0 396 1]
 [7015 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [1590 'mark harden' 'mark' ... 0 256 0]
 [4964 'lemy williams' 'lemy' ... 0 9 0]
 [6301 'epifania roman' 'epifania' ... 0 500 0]]


 10%|███▎                             | 1/10 [00:00<00:03,  2.87it/s]

<Response [200]>
[[3538 'marvon jemmott' 'marvon' 'jemmott' '2013-12-14' 'Male'
  '1995-01-17' 21 'Less than 25' 'African-American' 0 2 0 0 1 -1.0
  '2013-12-13 07:02:10' '2013-12-14 01:30:41' '13017261CF10A'
  '2013-12-13' nan 1.0 'F' 'Grand Theft in the 3rd Degree' 0 nan nan nan
  nan nan nan nan nan 0 nan nan nan nan 'Risk of Recidivism' 2 'Low'
  '2013-12-14' 'Risk of Violence' 5 'Medium' '2013-12-14' '2014-11-12'
  '2014-12-24' 1 0 333 0]]
<Response [200]>
[[3538 'bradley ecklund' 'bradley' ... 1 923 0]
 [3538 'andres penalopez' 'andres' ... 0 396 1]
 [3538 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [3538 'marvon jemmott' 'marvon' ... 0 333 0]
 [3538 'marvon jemmott' 'marvon' ... 0 333 0]
 [3538 'marvon jemmott' 'marvon' ... 0 333 0]]
<Response [200]>


 20%|██████▌                          | 2/10 [00:00<00:02,  2.75it/s]

[[8634 'brian bradley' 'brian' 'bradley' '2013-09-30' 'Male' '1971-05-11'
  44 '25 - 45' 'African-American' 0 1 0 0 0 -1.0 '2013-09-29 07:10:37'
  '2013-09-30 08:52:46' '13013664CF10A' '2013-09-29' nan 1.0 'F'
  'Felony Driving While Lic Suspd' 0 nan nan nan nan nan nan nan nan 0
  nan nan nan nan 'Risk of Recidivism' 1 'Low' '2013-09-30'
  'Risk of Violence' 1 'Low' '2013-09-30' '2013-09-29' '2013-09-30' 0 0
  914 0]]
<Response [200]>
[[8634 'bradley ecklund' 'bradley' ... 1 923 0]
 [8634 'andres penalopez' 'andres' ... 0 396 1]
 [8634 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [1590 'mark harden' 'mark' ... 0 914 1]
 [4964 'lemy williams' 'lemy' ... 0 914 1]
 [6301 'epifania roman' 'epifania' ... 0 914 1]]


 30%|█████████▉                       | 3/10 [00:01<00:02,  2.66it/s]

<Response [200]>
[[3697 'adolfo casco' 'adolfo' 'casco' '2013-01-27' 'Male' '1962-01-06'
  54 'Greater than 45' 'Caucasian' 0 1 0 0 0 -1.0 '2013-01-26 07:42:36'
  '2013-01-27 01:56:31' '13001884MM10A' '2013-01-26' nan 1.0 'M'
  'Battery' 0 nan nan nan nan nan nan nan nan 0 nan nan nan nan
  'Risk of Recidivism' 1 'Low' '2013-01-27' 'Risk of Violence' 2 'Low'
  '2013-01-27' '2013-01-26' '2013-01-27' 0 0 1160 0]]
<Response [200]>
[[3697 'bradley ecklund' 'bradley' ... 1 923 0]
 [3697 'andres penalopez' 'andres' ... 0 396 1]
 [3697 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [3697 'adolfo casco' 'mark' ... 0 256 0]
 [3697 'adolfo casco' 'lemy' ... 0 9 0]
 [3697 'adolfo casco' 'epifania' ... 0 500 0]]
<Response [200]>


 40%|█████████████▏                   | 4/10 [00:01<00:02,  2.64it/s]

[[6383 'william walker' 'william' 'walker' '2013-12-23' 'Male'
  '1960-05-10' 55 'Greater than 45' 'Caucasian' 0 1 0 0 0 -1.0
  '2013-12-22 12:42:17' '2013-12-22 01:00:00' '13023553MM10A'
  '2013-12-21' nan 2.0 'M' 'Battery' 1 '14009604CF10A' '(F3)' 0.0
  '2014-07-14' 'Felony Battery (Dom Strang)' '2014-07-14' '2014-07-17'
  nan 1 '14009604CF10A' '(F3)' '2014-07-14' 'Felony Battery (Dom Strang)'
  'Risk of Recidivism' 1 'Low' '2013-12-23' 'Risk of Violence' 1 'Low'
  '2013-12-23' '2014-07-14' '2014-07-17' 0 0 203 1]]
<Response [200]>
[[6383 'bradley ecklund' 'bradley' ... 1 923 0]
 [6383 'andres penalopez' 'andres' ... 0 396 1]
 [6383 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [1590 'mark harden' 'mark' ... 0 256 1]
 [4964 'lemy williams' 'lemy' ... 0 9 1]
 [6301 'epifania roman' 'epifania' ... 0 500 1]]


 50%|████████████████▌                | 5/10 [00:01<00:01,  2.74it/s]

<Response [200]>
[[1150 'wanda daniels' 'wanda' 'daniels' '2013-10-09' 'Female'
  '1968-12-18' 47 'Greater than 45' 'African-American' 0 10 0 0 19 -1.0
  '2013-10-08 12:12:30' '2013-11-08 08:59:42' '13014107CF10A'
  '2013-10-08' nan 1.0 'F' 'Possession of Cocaine' 1 '15012442CF10A'
  '(F3)' 0.0 '2015-09-25' 'Possession of Cocaine' '2015-09-25'
  '2015-10-30' nan 0 nan nan nan nan 'Risk of Recidivism' 10 'High'
  '2013-10-09' 'Risk of Violence' 2 'Low' '2013-10-09' '2014-01-28'
  '2014-07-02' 19 30 111 0]]
<Response [200]>
[[1150 'bradley ecklund' 'bradley' ... 1 923 0]
 [1150 'andres penalopez' 'andres' ... 0 396 1]
 [1150 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [1590 'wanda daniels' 'wanda' ... 0 111 1]
 [4964 'wanda daniels' 'wanda' ... 0 111 1]
 [6301 'wanda daniels' 'wanda' ... 0 111 1]]
<Response [200]>


 60%|███████████████████▊             | 6/10 [00:02<00:01,  2.68it/s]

[[6359 'terry bradley' 'terry' 'bradley' '2013-12-18' 'Male' '1961-08-18'
  54 'Greater than 45' 'African-American' 0 6 0 0 3 -1.0
  '2013-12-17 02:05:51' '2013-12-19 09:10:18' '13016923CF10A' nan
  '2013-12-17' 1.0 'F' 'arrest case no charge' 1 '14006976TC10A' '(M2)'
  1.0 '2014-02-23' 'Susp Drivers Lic 1st Offense' '2014-02-24'
  '2014-02-28' nan 0 nan nan nan nan 'Risk of Recidivism' 6 'Medium'
  '2013-12-18' 'Risk of Violence' 3 'Low' '2013-12-18' '2013-12-17'
  '2013-12-19' 3 1 67 1]]
<Response [200]>
[[6359 'bradley ecklund' 'bradley' ... 1 923 0]
 [6359 'andres penalopez' 'andres' ... 0 396 1]
 [6359 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [6359 'terry bradley' 'terry' ... 1 67 1]
 [6359 'terry bradley' 'terry' ... 1 67 1]
 [6359 'terry bradley' 'terry' ... 1 67 1]]


 70%|███████████████████████          | 7/10 [00:02<00:01,  2.70it/s]

<Response [200]>
[[2464 'ronald shelton' 'ronald' 'shelton' '2013-05-29' 'Male'
  '1958-06-02' 57 'Greater than 45' 'African-American' 0 9 0 0 12 -1.0
  '2013-05-28 08:29:39' '2013-07-18 03:49:07' '13007619CF10A'
  '2013-05-28' nan 1.0 'F' 'Possession of Cocaine' 0 nan nan nan nan nan
  nan nan nan 0 nan nan nan nan 'Risk of Recidivism' 9 'High'
  '2013-05-29' 'Risk of Violence' 6 'Medium' '2013-05-29' '2013-05-28'
  '2013-07-18' 12 50 1038 0]]
<Response [200]>
[[2464 'bradley ecklund' 'bradley' ... 1 923 0]
 [2464 'andres penalopez' 'andres' ... 0 396 1]
 [2464 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [1590 'ronald shelton' 'mark' ... 0 256 1]
 [4964 'ronald shelton' 'lemy' ... 0 9 1]
 [6301 'ronald shelton' 'epifania' ... 0 500 1]]
<Response [200]>


 80%|██████████████████████████▍      | 8/10 [00:02<00:00,  2.67it/s]

[[8309 'andrae cray' 'andrae' 'cray' '2013-10-23' 'Male' '1990-03-08' 26
  '25 - 45' 'African-American' 0 5 1 0 6 -8.0 '2013-10-15 07:14:52'
  '2013-10-17 05:10:33' '12015557TC10A' nan '2013-10-15' 8.0 'M'
  'arrest case no charge' 0 nan nan nan nan nan nan nan nan 0 nan nan nan
  nan 'Risk of Recidivism' 5 'Medium' '2013-10-23' 'Risk of Violence' 4
  'Low' '2013-10-23' '2015-07-17' '2015-08-01' 6 0 632 0]]
<Response [200]>
[[8309 'bradley ecklund' 'bradley' ... 1 923 0]
 [8309 'andres penalopez' 'andres' ... 0 396 1]
 [8309 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [8309 'andrae cray' 'andrae' ... 0 256 0]
 [8309 'andrae cray' 'andrae' ... 0 9 0]
 [8309 'andrae cray' 'andrae' ... 0 500 0]]


 90%|█████████████████████████████▋   | 9/10 [00:03<00:00,  2.70it/s]

<Response [200]>
[[9784 'sheldon jones' 'sheldon' 'jones' '2013-04-22' 'Male' '1976-05-18'
  39 '25 - 45' 'African-American' 0 2 0 0 2 -1.0 '2013-04-21 04:21:02'
  '2013-04-22 06:59:21' '13007676MM10A' '2013-04-21' nan 1.0 'M'
  'Driving Under The Influence' 0 nan nan nan nan nan nan nan nan 0 nan
  nan nan nan 'Risk of Recidivism' 2 'Low' '2013-04-22'
  'Risk of Violence' 1 'Low' '2013-04-22' '2013-04-21' '2013-04-22' 2 0
  1075 0]]
<Response [200]>
[[9784 'bradley ecklund' 'bradley' ... 1 923 0]
 [9784 'andres penalopez' 'andres' ... 0 396 1]
 [9784 'deanna dye' 'deanna' ... 0 47 0]
 ...
 [1590 'mark harden' 'mark' ... 0 1075 0]
 [4964 'lemy williams' 'lemy' ... 0 1075 0]
 [6301 'epifania roman' 'epifania' ... 0 1075 0]]
<Response [200]>


100%|████████████████████████████████| 10/10 [00:03<00:00,  2.69it/s]

[[ 1.27356538e-02 -7.43594984e-03  0.00000000e+00  1.54058109e-02
  -5.35473679e-03  1.13654280e-01  0.00000000e+00 -1.19344646e-02
   5.86539007e-01 -2.47427403e-01  0.00000000e+00  7.92084593e-03
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -9.05862607e-03  4.70622472e-03  0.00000000e+00
  -4.19535758e-03  9.47130490e-03  1.59155460e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -3.76068746e-03  0.00000000e+00
   8.78681025e-03  0.00000000e+00 -6.18149507e-03 -1.25023695e-02
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -3.94267913e-03  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  2.47791001e-03 -1.00691002e-02
   7.04014246e-03 -6.03058055e-03  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  7.52670700e-02  0.00000000e+00  0.00000000e+00
   2.7799




## Via processed data using single datapoint

In [270]:
import requests
import json
import numpy 
from json import JSONEncoder

api_url = "http://localhost:5000/predict_all"

class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

def get_one_prediction(data):
    """SHAP requires the second parameter to be already in a data form"""
    # convert the data to payload that can be understood by the API
    if type(data) != numpy.ndarray:
        data = data.to_numpy()
    
    data = json.dumps(data, cls=NumpyArrayEncoder)
    r = requests.post(api_url, json={"input": data})
    predictions = json.loads(r.content)
    print(predictions)
    return numpy.array(predictions["response"])

# assuming the test upload both data and background to us
background = pickle.load(open("../data/pickle_pandas_tabular_compas_training.sav", "rb")).drop("two_year_recid", axis=1).sample(1)
data = pickle.load(open("../data/pickle_pandas_tabular_compas_testing.sav", "rb")).drop("two_year_recid", axis=1).sample(10)

explainer = shap.KernelExplainer(get_one_prediction, background)
explainer.shap_values(data)

{'response': [0]}


100%|███████████████████████████████| 10/10 [00:00<00:00, 163.91it/s]

{'response': [1]}
{'response': [0, 0, 0, 0, 0, 0]}
{'response': [0]}
{'response': [0, 0]}
{'response': [0]}
{'response': [0]}
{'response': [0, 0]}
{'response': [1]}
{'response': [0, 1, 1, 0, 0, 1]}
{'response': [1]}
{'response': [0, 0, 0, 0, 0, 0]}
{'response': [0]}
{'response': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'response': [0]}
{'response': [0]}
{'response': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'response': [0]}
{'response': [0, 0, 0, 0, 0, 0]}





array([[ 0.33333333,  0.        ,  0.        ,  0.33333333,  0.33333333],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.33333333,  0.        ,  0.        ,  0.33333333,  0.33333333],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.08333333,  0.        , -0.25      ,  0.08333333,  0.08333333],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

## Via processed data using multiple datapoints

In [184]:
model = pickle.load(open("../model/pickle_scikit_lr_compas.sav", "rb"))
predictions = model.predict(background)
print(predictions)
e = shap.KernelExplainer(model.predict_proba, background)
e.shap_values(data)

In [274]:
import requests
import json
import numpy 
from json import JSONEncoder

api_url = "http://localhost:5000/predict_all"

class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

def get_multiple_prediction(data):
    """SHAP requires the second parameter to be already in a data form"""
    # convert the data to payload that can be understood by the API
    if type(data) != numpy.ndarray:
        data = data.to_numpy()
    
    data = json.dumps(data, cls=NumpyArrayEncoder)
    #print({"input": [[1, 1, 1, 1, 1]]})
    r = requests.post(api_url, json={"input": data})
    predictions = json.loads(r.content)
    #print(predictions)
    return numpy.array(predictions["response"])

# assuming the test upload both data and background to us
background = pickle.load(open("../data/pickle_pandas_tabular_compas_training.sav", "rb")).drop("two_year_recid", axis=1).sample(10)
data = pickle.load(open("../data/pickle_pandas_tabular_compas_testing.sav", "rb")).drop("two_year_recid", axis=1).sample(5)

explainer = shap.KernelExplainer(get_multiple_prediction, background)
explainer.shap_values(data)

100%|█████████████████████████████████| 5/5 [00:00<00:00, 126.79it/s]


array([[ 0.18333333,  0.        ,  0.01666667, -0.21666667,  0.01666667],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.16666667,  0.        ,  0.06666667,  0.03333333, -0.26666667],
       [-0.03333333, -0.03333333,  0.        ,  0.06666667,  0.        ],
       [-0.025     ,  0.05833333,  0.125     ,  0.73333333,  0.10833333]])

# AIF360

This is straightforward, we can the prediction, we have the ground truth.. so we can compare

In [105]:
from aif360.datasets import CompasDataset, BinaryLabelDataset
from aif360.datasets.multiclass_label_dataset import MulticlassLabelDataset
from aif360.metrics import DatasetMetric
from aif360.algorithms.preprocessing import Reweighing

# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric

In [104]:
y_test = pickle.load(open("../data/pickle_scikit_lr_compas_ytest_pkl.sav", "rb"))
x_test = pickle.load(open("../data/pickle_scikit_lr_compas_xtest_pkl.sav", "rb"))
y_train = pickle.load(open("../data/pickle_scikit_lr_compas_ytrain_pkl.sav", "rb"))
x_train = pickle.load(open("../data/pickle_scikit_lr_compas_xtrain_pkl.sav", "rb"))

def get_prediction_fairness(location):
    files = {'file': open(location, "rb")}
    r = requests.post(api_url, files=files)
    predictions = json.loads(r.content)
    return numpy.array(predictions["response"])

location = "../data/pickle_scikit_lr_compas_xtest_pkl.sav"
predictions = get_prediction_fairness(location)

## Issue using AIF360 without Preprocessing Pipeline


*AIF360 requires the data to be processed before it can be placed in the binarylabeldataset.

*we are not expecting to process the data if we are moving to call the model via API, hence, we might not not be able to use AIF360 directly

In [279]:
ground_truth = "two_year_recid"

x_train[ground_truth] = y_train[ground_truth]
sensitive_features = ["sex"]
print(x_train.head(1))
original_data = BinaryLabelDataset(df=x_train,
                                    label_names=[ground_truth],  # ground truth
                                    protected_attribute_names=sensitive_features)

        id               name      first     last compas_screening_date  \
5396  8223  christina deperna  christina  deperna            2013-08-04   

         sex         dob  age  age_cat       race  ...  v_decile_score  \
5396  Female  1984-10-07   31  25 - 45  Caucasian  ...               5   

      v_score_text  v_screening_date  in_custody  out_custody  priors_count.1  \
5396        Medium        2013-08-04  2013-08-04   2013-08-05              14   

     start  end event two_year_recid  
5396     1  971     0              0  

[1 rows x 53 columns]


ValueError: Input DataFrames cannot contain NA values.

# ART

In [280]:
from art.attacks.evasion import ZooAttack
from art.estimators.classification import SklearnClassifier, XGBoostClassifier, BlackBoxClassifier
from art.utils import to_categorical
from sklearn.metrics import accuracy_score

In [286]:
import requests
import json
import numpy 
from json import JSONEncoder

api_url = "http://localhost:5000/predict_all"

class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

def get_multiple_prediction(data):
    """SHAP requires the second parameter to be already in a data form"""
    # convert the data to payload that can be understood by the API
    if type(data) != numpy.ndarray:
        data = data.to_numpy()
    
    data = json.dumps(data, cls=NumpyArrayEncoder)
    #print({"input": [[1, 1, 1, 1, 1]]})
    r = requests.post(api_url, json={"input": data})
    predictions = json.loads(r.content)
    
    return to_categorical(numpy.array(predictions["response"]), nb_classes=2)

# assuming the test upload both data and background to us
background = pickle.load(open("../data/pickle_pandas_tabular_compas_training.sav", "rb")).drop("two_year_recid", axis=1).sample(10)
data = pickle.load(open("../data/pickle_pandas_tabular_compas_testing.sav", "rb")).drop("two_year_recid", axis=1).sample(5)

classifier = BlackBoxClassifier(predict_fn=get_multiple_prediction, nb_classes=2,
                                                input_shape=data.to_numpy()[0].shape)

zoo = ZooAttack(classifier=classifier, confidence=0.0, targeted=False, learning_rate=1e-1, max_iter=20,
                            binary_search_steps=10, initial_const=1e-3, abort_early=True, use_resize=False,
                            use_importance=False, nb_parallel=1, batch_size=1, variable_h=0.2)

adv_samples = zoo.generate(data.to_numpy()) # this requires numpy data
adv_samples

ZOO: 100%|█████████████████████████████| 5/5 [00:00<00:00,  5.36it/s]


array([[0, 1, 2, 1, 1],
       [0, 0, 2, 2, 1],
       [0, 0, 0, 4, 1],
       [0, 0, 2, 0, 0],
       [2, 1, 0, 1, 1]])