# XDGBoost and Adult Dataset

In [1]:
from math import floor

import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.utils import shuffle

# Build Pandas Dataframe
df = pd.read_csv("data/adult.csv").dropna()
df = shuffle(df)

# encode categorical variables with ints
cont_vars = ["age", "fnlwgt", "capital-loss", "capital-gain", "hours-per-week", "education-num"]
cat_vars = ["education", "workclass", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
df[cat_vars + ["salary"]] = df[cat_vars + ["salary"]].apply(LabelEncoder().fit_transform)
# display the dataframe
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
8599,62,7,159699,9,13,2,1,0,4,1,0,0,38,39,0
18408,47,5,181130,14,15,2,10,0,4,1,99999,0,50,39,1
23412,41,4,308550,11,9,0,3,4,4,0,0,0,60,39,0
2259,38,2,123983,9,13,4,4,4,1,1,0,1741,40,40,0
23747,19,4,35865,15,10,4,8,1,4,0,0,0,30,39,0


In [2]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
8599,62,7,159699,9,13,2,1,0,4,1,0,0,38,39,0
18408,47,5,181130,14,15,2,10,0,4,1,99999,0,50,39,1
23412,41,4,308550,11,9,0,3,4,4,0,0,0,60,39,0
2259,38,2,123983,9,13,4,4,4,1,1,0,1741,40,40,0
23747,19,4,35865,15,10,4,8,1,4,0,0,0,30,39,0


## Build numpy Arrays from Dataset

In [3]:
feature_names = list(df.columns.values)[:-1]
categorical_features = []
for c in cat_vars:
    categorical_features.append(feature_names.index(c))

# Build numpy array
feature_names = list(df.columns.values)[:-1]
xcols = df[feature_names].to_numpy(dtype="double")
ycols = df["salary"].to_numpy(dtype="double")

split_ind = floor(len(xcols) * 0.85)
xtrain, xtest = xcols[:split_ind], xcols[split_ind:]
ytrain, ytest = ycols[:split_ind], ycols[split_ind:]

## Train XDGBoost

In [4]:
# train xdgboost + print accuracy
model = XGBClassifier()
model.fit(xtrain, ytrain)

def predict_proba(arr):
    """
    predict_proba will return an array probabilities for each class
    NOTE: For kiwi (and unlike LIME) you must use 64-bit floats.
    """
    predictions = model.predict_proba(arr)
    return predictions.astype("float64")

metrics.accuracy_score(ytest, model.predict(xtest))





0.8679631525076765

## Compare Kiwi and Lime

In this cell we compare kiwi and lime on a random record so we can visually inspect the output. For some peace of mind we're getting the same answers.

In [5]:
def compare_exps(exp_li, exp_kw, label, max=-1):
    """
    compare_exps will compare a kiwi explanation
    and a lime explanation.
    """
    loop_count = 0
    for (e_li, e_kw) in zip(exp_li.as_list(label), exp_kw.as_list(label)):
        print(f"kiwi = {e_kw}")
        print(f"lime = {e_li}")
        print(flush=True)
        
        if max != -1 and loop_count == max:
            break
        
        loop_count += 1

In [6]:
from time import monotonic
from random import randint

import lime.lime_tabular
import kiwi

ind = randint(0, len(xtest))

explainer_kw = kiwi.KiwiTabularExplainer(xtrain,
                                           categorical_features=categorical_features,
                                           feature_names=feature_names, num_samples=20000)

explainer_li = lime.lime_tabular.LimeTabularExplainer(xtrain, 
                                                   feature_names=feature_names,
                                                   categorical_features=categorical_features,
                                                   discretize_continuous=True)
num_features = len(cat_vars) + len(cont_vars)

start = monotonic()
exp_kw = explainer_kw.explain_instance(xtest[ind],
                                       predict_proba)

kiwi_done = monotonic() - start
exp_li = explainer_li.explain_instance(xtest[ind],
                                predict_proba,
                                num_features=num_features,
                                num_samples=20000,
                                top_labels=2)
lime_done = monotonic() - (kiwi_done + start)

# Times
print(f"kiwi latency: {kiwi_done}")
print(f"lime latency: {lime_done}")
print(flush=True)

# Print comparison
compare_exps(exp_li, exp_kw, int(ytest[ind]))

kiwi latency: 0.034027103999505925
lime latency: 4.9670322370002395

kiwi = ('capital-gain <= 0', 0.7088681206795285)
lime = ('capital-gain <= 0.00', 0.692566375347684)

kiwi = ('education-num <= 9', 0.0957105377679561)
lime = ('hours-per-week <= 40.00', 0.10012166323969529)

kiwi = ('hours-per-week <= 40', 0.09231616095601604)
lime = ('education-num <= 9.00', 0.09432272049236153)

kiwi = ('capital-loss <= 0', 0.05787873283837122)
lime = ('marital-status=2', -0.077977344087703)

kiwi = ('relationship=0', -0.05212581737687029)
lime = ('capital-loss <= 0.00', 0.06559575129501964)

kiwi = ('race=4', 0.04793954251913317)
lime = ('relationship=0', -0.05567998359979513)

kiwi = ('marital-status=2', 0.037501274610936906)
lime = ('occupation=0', 0.03597163859453822)

kiwi = ('sex=1', 0.02906539823952209)
lime = ('sex=1', -0.02809905296019911)

kiwi = ('occupation=0', 0.028299790474182036)
lime = ('race=4', -0.018422082751426528)

kiwi = ('education=11', 0.01958247520760871)
lime = ('native-cou

## Performance Benchmarking

### explain_instance_many method

The `explain_instance_many` is a good solution up to a few thousand records.

In [7]:
start = monotonic()
# In explain_instance_many we can batch in a few hundred records at a time
batch_size = 100

all_test_exps = []
for n in range(0, len(xtest), batch_size):
    # This call will return batch_size explanation objects
    es = explainer_kw.explain_instance_many(xtest[n:n+batch_size], predict_proba)
    all_test_exps.extend(es)

print(f"num records = {len(xtest)}")
print(f"records per second = {len(xtest) / (monotonic() - start)}")

num records = 4885
records per second = 40.59772427161543


In [8]:
# Compare with a random record from xtest
# computed with the lime explainer for peace of mind
ind = randint(0, len(xtest))
exp_li = explainer_li.explain_instance(xtest[ind],
                                       predict_proba,
                                       num_features=num_features,
                                       num_samples=20000,
                                       top_labels=2)

compare_exps(exp_li, all_test_exps[ind], int(ytest[ind]), max=5)

kiwi = ('capital-gain <= 0', -0.7025800507544331)
lime = ('capital-gain <= 0.00', -0.6922388590920431)

kiwi = ('education-num <= 9', -0.09941981209077803)
lime = ('hours-per-week > 45.00', 0.09816447171145021)

kiwi = ('hours-per-week > 45', 0.09599012522326464)
lime = ('education-num <= 9.00', -0.09663861578441421)

kiwi = ('capital-loss <= 0', -0.08371257177605297)
lime = ('marital-status=2', 0.07346074074858153)

kiwi = ('relationship=0', 0.056972352842339744)
lime = ('capital-loss <= 0.00', -0.059451892301810895)

kiwi = ('37 < age <= 48', 0.05051763010960833)
lime = ('relationship=0', 0.058227956663710584)



### explain_instance_iter method

The `explain_instance_iter` method is heavy to start and requires lots of memory to run.
This should only be used if you have tens of thousands of records to compute. If your requirement
is only hundreds of records the author recomends using the `explain_instance_many` method above.

In [21]:
explainer_kw = kiwi.KiwiTabularExplainer(xtrain,
                                           categorical_features=categorical_features,
                                           feature_names=feature_names,
                                           sample_background_thread=False)

In [19]:
# Compute exp_li for a random sample so we can compare it to the output below.

from random import randint
ind = randint(0, len(xcols))
exp_li = explainer_li.explain_instance(xcols[ind],
                                 predict_proba,
                                 num_features=num_features,
                                 num_samples=5000,
                                 top_labels=2)


In [22]:
exp_kw = None

start = monotonic()

# We should use explain_instance_iter as a lazy iterator over the dataset
# Inside the loop, put whatever logic you wish to perform per-record
for n, e in enumerate(explainer_kw.explain_instance_iter(xcols, predict_proba, num_threads=2)):
    # Keep a rolling output of our performance
    if n != 0 and n % 2000 == 0:
        print(f"records per second for {n / 1000}K records = {n / (monotonic() - start)}", flush=True)

    # save one record for a comparison
    if n == ind:
        exp_kw = e

total_time = monotonic() - start
print(f"num records {len(xcols)} in {total_time}")
print(f"records per second = {len(xcols) / total_time}")
print()

# Compare the single exp
compare_exps(exp_li, exp_kw, int(ycols[ind]), max=7)

records per second for 2000 records = 143.05656991813999
records per second for 4000 records = 170.11685761300768
records per second for 6000 records = 179.52791260212658
records per second for 8000 records = 183.35332234025805
records per second for 10000 records = 187.5692983998974
records per second for 12000 records = 191.0940649016607
records per second for 14000 records = 193.4777989178561
records per second for 16000 records = 194.8970818162883
records per second for 18000 records = 195.42045955153594
records per second for 20000 records = 194.8323183383171
records per second for 22000 records = 195.50455642812597
records per second for 24000 records = 196.46647909543066
records per second for 26000 records = 197.21566950271904
records per second for 28000 records = 197.82566871080405
records per second for 30000 records = 198.04935152779004
records per second for 32000 records = 199.10202683035618
num records 32561 in 161.21302632600054
records per second = 201.97499384544795

