Load libraries

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import requests
import time
from tqdm import tqdm_notebook as tqdm

In [2]:
from sklearn.preprocessing import LabelEncoder
import sklearn.model_selection

Get adult data

In [3]:
# Helper function to process the data
def map_array_values(series, value_map):
    """Return a copy of ``series`` with values replaced according to ``value_map``.

    Object-dtype series are whitespace-stripped before matching. Replacements
    are applied one mapping entry at a time, in the dictionary's iteration
    order; values that match no key are left untouched. The input series is
    not modified.

    Args:
        series: pandas Series to remap.
        value_map: dict of ``{source value: target value}``.

    Returns:
        A new pandas Series with the same index as ``series``.
    """
    is_text = series.dtype == 'object'
    remapped = series.str.strip().copy() if is_text else series.copy()
    for old_value, new_value in value_map.items():
        hits = remapped == old_value
        remapped[hits] = new_value
    return remapped


# Readable feature names, in the column order of the raw adult.data file
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    'Income',
]
# Positions (in the raw file) of the columns kept as model features
features_to_use = [0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13]
categorical_features = [1, 3, 5, 6, 7, 8, 9, 10, 11, 13]

# Raw education level -> coarser education group
education_map = {
    '10th': 'Dropout',
    '11th': 'Dropout',
    '12th': 'Dropout',
    '1st-4th': 'Dropout',
    '5th-6th': 'Dropout',
    '7th-8th': 'Dropout',
    '9th': 'Dropout',
    'Preschool': 'Dropout',
    'HS-grad': 'High School grad',
    'Some-college': 'High School grad',
    'Masters': 'Masters',
    'Prof-school': 'Prof-School',
    'Assoc-acdm': 'Associates',
    'Assoc-voc': 'Associates',
}

# Raw occupation -> coarser occupation group
occupation_map = {
    "Adm-clerical": "Admin",
    "Armed-Forces": "Military",
    "Craft-repair": "Blue-Collar",
    "Exec-managerial": "White-Collar",
    "Farming-fishing": "Blue-Collar",
    "Handlers-cleaners": "Blue-Collar",
    "Machine-op-inspct": "Blue-Collar",
    "Other-service": "Service",
    "Priv-house-serv": "Service",
    "Prof-specialty": "Professional",
    "Protective-serv": "Other",
    "Sales": "Sales",
    "Tech-support": "Other",
    "Transport-moving": "Blue-Collar",
}

# Raw country value -> region group
country_map = {
    'Cambodia': 'SE-Asia',
    'Canada': 'British-Commonwealth',
    'China': 'China',
    'Columbia': 'South-America',
    'Cuba': 'Other',
    'Dominican-Republic': 'Latin-America',
    'Ecuador': 'South-America',
    'El-Salvador': 'South-America',
    'England': 'British-Commonwealth',
    'France': 'Euro_1',
    'Germany': 'Euro_1',
    'Greece': 'Euro_2',
    'Guatemala': 'Latin-America',
    'Haiti': 'Latin-America',
    'Holand-Netherlands': 'Euro_1',
    'Honduras': 'Latin-America',
    'Hong': 'China',
    'Hungary': 'Euro_2',
    'India': 'British-Commonwealth',
    'Iran': 'Other',
    'Ireland': 'British-Commonwealth',
    'Italy': 'Euro_1',
    'Jamaica': 'Latin-America',
    'Japan': 'Other',
    'Laos': 'SE-Asia',
    'Mexico': 'Latin-America',
    'Nicaragua': 'Latin-America',
    'Outlying-US(Guam-USVI-etc)': 'Latin-America',
    'Peru': 'South-America',
    'Philippines': 'SE-Asia',
    'Poland': 'Euro_2',
    'Portugal': 'Euro_2',
    'Puerto-Rico': 'Latin-America',
    'Scotland': 'British-Commonwealth',
    'South': 'Euro_2',
    'Taiwan': 'China',
    'Thailand': 'SE-Asia',
    'Trinadad&Tobago': 'Latin-America',
    'United-States': 'United-States',
    'Vietnam': 'SE-Asia',
}

# Raw marital status -> coarser marital group
married_map = {
    'Never-married': 'Never-Married',
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Separated',
    'Separated': 'Separated',
    'Divorced': 'Separated',
    'Widowed': 'Widowed',
}


# Transform a continuous capital gain/loss column into a discrete variable
def cap_gains_fn(x):
    """Discretize a numeric column into the labels 'None', 'Low' and 'High'.

    Values <= 0 become 'None'; positive values up to and including the median
    of the positive values become 'Low'; larger values become 'High'.

    The result carries the same index as ``x`` (when ``x`` has one), so it can
    be assigned straight back into the originating DataFrame without rows
    being misaligned when that frame does not have a default RangeIndex.

    Args:
        x: pandas Series (or numpy array) of numeric values.

    Returns:
        pandas Series of string labels, one per input value.
    """
    x = x.astype(float)
    index = getattr(x, "index", None)  # plain numpy arrays have no index
    positive = x[x > 0]
    if len(positive) == 0:
        # No positive values: there is no median to split on, and a NaN bin
        # edge would not be a valid input for np.digitize.
        return pd.Series(["None"] * len(x), index=index)
    # right=True makes each bin closed on its upper edge: (0, median] -> 1, (median, inf] -> 2
    d = np.digitize(x, [0, np.median(positive), float('inf')], right=True)
    labels = np.full(len(d), "None", dtype=object)
    labels[d == 1] = 'Low'
    labels[d == 2] = 'High'
    return pd.Series(labels, index=index)

# Specify transformations for each column
transformations = {
    'Education': lambda x: map_array_values(x, education_map),
    'Marital Status': lambda x: map_array_values(x, married_map),
    'Occupation': lambda x: map_array_values(x, occupation_map),
    'Capital Gain': cap_gains_fn,
    'Capital Loss': cap_gains_fn,
    'Country': lambda x: map_array_values(x, country_map),
}

# Load df
df = pd.read_csv("../anchor2/anchor2/examples/data/adult/adult.data", header=None)
df.columns = feature_names
target_labels = df.iloc[:, -1]
# Keep only the model features and drop incomplete rows. Working on an explicit
# copy (instead of dropna(inplace=True) on an iloc slice) keeps the column
# assignments below from writing into a slice of the raw frame.
df = df.iloc[:, features_to_use].dropna().copy()
# Keep the labels aligned with the rows that survived dropna(); otherwise the
# features and labels passed to train_test_split could differ in length.
target_labels = target_labels.loc[df.index]

# Transform features
for feature, fun in transformations.items():
    # Assign positionally: the transformed values are in row order, and this
    # avoids index alignment against a result that carries a different index.
    df[feature] = np.asarray(fun(df[feature]))

# Store dictionary with {Category id -> category classes}
# These are positions within the reduced frame (after features_to_use is applied).
categorical_features_idx = [1, 2, 3, 4, 5, 6, 7, 8, 9, 11]
categorical_names = {}  # Dictionary with (Category id -> category classes)
for f_idx in categorical_features_idx:
    le = LabelEncoder()
    column = df.columns[f_idx]
    # Replace the whole column by name so it takes the integer dtype of the
    # encoded values; `df.iloc[:, i] = ...` may write in place and keep the
    # original object dtype on newer pandas versions.
    df[column] = le.fit_transform(df[column])
    categorical_names[f_idx] = le.classes_

# Encode target label
le = LabelEncoder()
target_labels = le.fit_transform(target_labels)
class_names = list(le.classes_)

# Split the dataset into train/val/test (50% / 25% / 25%), stratified by label
train_X, rest_X, train_y, rest_y = sklearn.model_selection.train_test_split(df, target_labels, stratify=target_labels,
                                                                            test_size=0.5, random_state=42)
val_X, test_X, val_y, test_y = sklearn.model_selection.train_test_split(rest_X, rest_y, stratify=rest_y,
                                                                        test_size=0.5, random_state=42)

Check random sample

In [4]:
# Pick one random row of the test set to explain. No random seed is set in the
# cells shown here, so a fresh run is likely to select a different row than
# the one in the saved output.
idx = np.random.choice(range(test_X.shape[0]))
x = test_X.iloc[idx]
print(x)

Age               38
Workclass          4
Education          4
Marital Status     1
Occupation         7
Relationship       3
Race               2
Sex                1
Capital Gain       2
Capital Loss       2
Hours per week    40
Country            9
Name: 17839, dtype: int64


Specify the hydrosphere inference endpoint

In [5]:
# URL the feature payloads are POSTed to in order to obtain model predictions.
service_link = "https://dev.k8s.hydrosphere.io/gateway/application/adult-salary-app"

In [6]:
# Function to store sample in a json with signature specified by Ilnur
def make_signatured_json(sample):
    """Convert a sample into the column-oriented JSON payload sent to the model.

    The payload maps every feature name to a list of plain Python ints, one
    entry per row.

    Args:
        sample: One of
            * a pandas Series holding a single row (its index supplies the
              feature names),
            * a pandas DataFrame with the twelve model feature columns,
            * an array-like of shape (12,) or (n, 12) whose columns are in
              the default feature order listed below.

    Returns:
        dict of ``{feature name: [int, ...]}``, serializable with ``json``.
    """
    default_feature_names = [
        "Age",
        "Workclass",
        "Education",
        "Marital Status",
        "Occupation",
        "Relationship",
        "Race",
        "Sex",
        "Capital Gain",
        "Capital Loss",
        "Hours per week",
        "Country"]

    if isinstance(sample, pd.Series):
        # A Series is one row: `.loc[:, name]` is not valid on a 1-D object,
        # so wrap each scalar in a single-element list instead.
        return {fname: [int(value)] for fname, value in sample.items()}

    if not isinstance(sample, pd.DataFrame):
        # Raw array-like input: attach the default column names.
        sample = pd.DataFrame(np.atleast_2d(sample), columns=default_feature_names)

    # int() turns numpy integers into plain ints so the payload is JSON-serializable.
    return {fname: [int(v) for v in sample.loc[:, fname]]
            for fname in default_feature_names}


Get response for an example

In [7]:
# Request a prediction for the sampled row. `iloc[[idx]]` keeps the row as a
# one-row DataFrame, which is the shape make_signatured_json indexes by column.
response = requests.post(url=service_link, json=make_signatured_json(test_X.iloc[[idx]]))
# Fail with the HTTP error itself rather than an opaque KeyError/JSON error below.
response.raise_for_status()
prediction = np.array(response.json()["Prediction"])
print(f"Predicted label is {class_names[prediction[0][0]]}")

Predicted label is  <=50K


In [7]:
# Request predictions for up to 500 rows starting at the sampled row; only the
# first prediction (the sampled row itself) is reported.
response = requests.post(url=service_link, json=make_signatured_json(test_X.iloc[idx:idx+500]))
# Fail with the HTTP error itself rather than an opaque KeyError/JSON error below.
response.raise_for_status()
prediction = np.array(response.json()["Prediction"])
print(f"Predicted label is {class_names[prediction[0][0]]}")

Predicted label is  <=50K


Specify the link to Anchor explanation service

In [14]:
# URL of the Anchor explanation service that explanation requests are POSTed to.
# NOTE(review): 0.0.0.0 is normally a bind address; as a client destination it
# may not resolve on every platform — confirm, or use localhost/127.0.0.1.
anchor_link = "http://0.0.0.0:5000/anchor" 

Prepare configs

In [9]:
# Configuration sent to the Anchor explanation service with every request.
# The keys of "label_decoders" are feature positions (as strings); each list
# holds the category names for that feature, indexed by encoded value.
adult_anchor_config = {
    "precision_threshold": 0.95,
    "verbose": False,
    "ordinal_features_idx": [0, 10],
    "oh_encoded_categories": {},
    "label_decoders": {
        "1": [" ?", " Federal-gov", " Local-gov", " Never-worked", " Private",
              " Self-emp-inc", " Self-emp-not-inc", " State-gov", " Without-pay"],
        "2": ["Associates", "Bachelors", "Doctorate", "Dropout",
              "High School grad", "Masters", "Prof-School"],
        "3": ["Married", "Never-Married", "Separated", "Widowed"],
        "4": ["?", "Admin", "Blue-Collar", "Military", "Other", "Professional",
              "Sales", "Service", "White-Collar"],
        "5": [" Husband", " Not-in-family", " Other-relative", " Own-child",
              " Unmarried", " Wife"],
        "6": [" Amer-Indian-Eskimo", " Asian-Pac-Islander", " Black", " Other",
              " White"],
        "7": [" Female", " Male"],
        "8": ["High", "Low", "None"],
        "9": ["High", "Low", "None"],
        "11": ["?", "British-Commonwealth", "China", "Euro_1", "Euro_2",
               "Latin-America", "Other", "SE-Asia", "South-America",
               "United-States", "Yugoslavia"],
    },
    "strategy": "kl-lucb",
    "feature_names": ["Age", "Workclass", "Education", "Marital Status",
                      "Occupation", "Relationship", "Race", "Sex",
                      "Capital Gain", "Capital Loss", "Hours per week",
                      "Country"],
}


Populate reqstore with data

In [10]:
# Send the test set through the inference endpoint in batches of 500 rows.
service_link = "https://dev.k8s.hydrosphere.io/gateway/application/adult-salary-app"
# The loop variable is deliberately not called `idx`: that name holds the
# position of the row being explained, and overwriting it here would leave it
# out of sync with `x` for any cell re-run afterwards.
# NOTE(review): the "- 1" leaves the last full batch (and any remainder rows)
# unsent — confirm this is intended.
for batch_idx in tqdm(range(test_X.shape[0]//500 - 1)):
    sample = test_X.iloc[batch_idx*500:(batch_idx + 1)*500]
    response = requests.post(url=service_link, json=make_signatured_json(sample))
    print(response.status_code, end=".. ")

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 


Send a request for explanation with our sample

### Example 1

In [25]:
# Ask the Anchor service to explain the sampled row `x`. The payload carries the
# encoded feature values, the application name and the Anchor configuration.
response = requests.post(url=anchor_link, json={"explained_instance": x.tolist(),
                                                "application_name" : "adult-salary-app",
                                                "config" : adult_anchor_config})

In [26]:
# Show the raw response body returned by the Anchor service.
print(response.text)

{
  "coverage": 0.328, 
  "explanation": "Marital Status == Never-Married", 
  "precision": 0.967
}



In [27]:
# Report the coverage and precision fields of the explanation response.
print("Coverage of this explanation is ", response.json()['coverage'])
print("Precision of this explanation is ", response.json()['precision'])

Coverage of this explanation is  0.328
Precision of this explanation is  0.967


In [28]:
# Split the explanation string on " AND " into a list of its individual parts.
response.json()['explanation'].split(" AND ")

['Marital Status == Never-Married']

### Example 2

In [30]:
# Pick another random test row for the second example (unseeded, so the row
# will likely differ between runs).
idx = np.random.choice(range(test_X.shape[0]))
x = test_X.iloc[idx]
print(x)

Age               56
Workclass          2
Education          1
Marital Status     0
Occupation         5
Relationship       0
Race               4
Sex                1
Capital Gain       0
Capital Loss       2
Hours per week    40
Country            9
Name: 26327, dtype: int64


In [31]:
# Request predictions for up to 500 rows starting at the sampled row; only the
# first prediction (the sampled row itself) is reported.
response = requests.post(url=service_link, json=make_signatured_json(test_X.iloc[idx:idx+500]))
# Fail with the HTTP error itself rather than an opaque KeyError/JSON error below.
response.raise_for_status()
prediction = np.array(response.json()["Prediction"])
print(f"Predicted label is {class_names[prediction[0][0]]}")

Predicted label is  >50K


In [32]:
# Ask the Anchor service to explain the second sampled row and show the raw reply.
response = requests.post(url=anchor_link, json={"explained_instance": x.tolist(),
                                                "application_name" : "adult-salary-app",
                                                "config" : adult_anchor_config})
print(response.text)

{
  "coverage": 0.038, 
  "explanation": "Capital Gain == High", 
  "precision": 0.996
}



### Example 3

In [39]:
# Pick another random test row for the third example (unseeded, so the row
# will likely differ between runs).
idx = np.random.choice(range(test_X.shape[0]))
x = test_X.iloc[idx]
print(x)

Age               37
Workclass          4
Education          1
Marital Status     0
Occupation         4
Relationship       0
Race               4
Sex                1
Capital Gain       2
Capital Loss       2
Hours per week    50
Country            9
Name: 9204, dtype: int64


In [49]:
# Display the request payload that would be sent to the Anchor service
# (inspection only; nothing is sent from this cell).
{"explained_instance": x.tolist(),
                                                "application_name" : "adult-salary-app",
                                                "config" : adult_anchor_config}

{'explained_instance': [37, 4, 1, 0, 4, 0, 4, 1, 2, 2, 50, 9],
 'application_name': 'adult-salary-app',
 'config': {'precision_threshold': 0.95,
  'verbose': True,
  'ordinal_features_idx': [0, 10],
  'oh_encoded_categories': {},
  'label_decoders': {'1': [' ?',
    ' Federal-gov',
    ' Local-gov',
    ' Never-worked',
    ' Private',
    ' Self-emp-inc',
    ' Self-emp-not-inc',
    ' State-gov',
    ' Without-pay'],
   '2': ['Associates',
    'Bachelors',
    'Doctorate',
    'Dropout',
    'High School grad',
    'Masters',
    'Prof-School'],
   '3': ['Married', 'Never-Married', 'Separated', 'Widowed'],
   '4': ['?',
    'Admin',
    'Blue-Collar',
    'Military',
    'Other',
    'Professional',
    'Sales',
    'Service',
    'White-Collar'],
   '5': [' Husband',
    ' Not-in-family',
    ' Other-relative',
    ' Own-child',
    ' Unmarried',
    ' Wife'],
   '6': [' Amer-Indian-Eskimo',
    ' Asian-Pac-Islander',
    ' Black',
    ' Other',
    ' White'],
   '7': [' Female', ' 

In [40]:
# Request predictions for up to 500 rows starting at the sampled row; only the
# first prediction (the sampled row itself) is reported.
response = requests.post(url=service_link, json=make_signatured_json(test_X.iloc[idx:idx+500]))
# Fail with the HTTP error itself rather than an opaque KeyError/JSON error below.
response.raise_for_status()
prediction = np.array(response.json()["Prediction"])
print(f"Predicted label is {class_names[prediction[0][0]]}")

Predicted label is  >50K


In [None]:
# Turn on the "verbose" option for subsequent explanation requests. This
# mutates the shared config dict in place, so it also affects any earlier
# request cell that is re-run afterwards.
adult_anchor_config.update({"verbose": True})

In [48]:
# Ask the Anchor service to explain the third sampled row and show the raw reply.
# NOTE(review): no `timeout` is passed, so this call waits for as long as the
# server takes to answer; the saved output for this cell is a KeyboardInterrupt.
# Consider adding a timeout once the expected explanation time is known.
response = requests.post(url=anchor_link, json={"explained_instance": x.tolist(),
                                                "application_name" : "adult-salary-app",
                                                "config" : adult_anchor_config})
print(response.text)

KeyboardInterrupt: 