Load libraries

In [209]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import requests
import time
from tqdm import tqdm_notebook as tqdm

In [210]:
from sklearn.preprocessing import LabelEncoder
import sklearn.model_selection

Get adult data

In [3]:
# Helepr function to process the data
def map_array_values(series, value_map):
    if series.dtype == 'object':
        ret = series.str.strip().copy()
    else:
        ret = series.copy()
    for src, target in value_map.items():
        ret[ret == src] = target
    return ret


# Readble feature names
feature_names = ["Age", "Workclass", "fnlwgt", "Education",
                 "Education-Num", "Marital Status", "Occupation",
                 "Relationship", "Race", "Sex", "Capital Gain",
                 "Capital Loss", "Hours per week", "Country", 'Income']
features_to_use = [0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13]
categorical_features = [1, 3, 5, 6, 7, 8, 9, 10, 11, 13]
education_map = {
    '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout', '1st-4th':
        'Dropout', '5th-6th': 'Dropout', '7th-8th': 'Dropout', '9th':
        'Dropout', 'Preschool': 'Dropout', 'HS-grad': 'High School grad',
    'Some-college': 'High School grad', 'Masters': 'Masters',
    'Prof-school': 'Prof-School', 'Assoc-acdm': 'Associates',
    'Assoc-voc': 'Associates',
}
occupation_map = {
    "Adm-clerical": "Admin", "Armed-Forces": "Military",
    "Craft-repair": "Blue-Collar", "Exec-managerial": "White-Collar",
    "Farming-fishing": "Blue-Collar", "Handlers-cleaners":
        "Blue-Collar", "Machine-op-inspct": "Blue-Collar", "Other-service":
        "Service", "Priv-house-serv": "Service", "Prof-specialty":
        "Professional", "Protective-serv": "Other", "Sales":
        "Sales", "Tech-support": "Other", "Transport-moving":
        "Blue-Collar",
}
country_map = {
    'Cambodia': 'SE-Asia', 'Canada': 'British-Commonwealth', 'China':
        'China', 'Columbia': 'South-America', 'Cuba': 'Other',
    'Dominican-Republic': 'Latin-America', 'Ecuador': 'South-America',
    'El-Salvador': 'South-America', 'England': 'British-Commonwealth',
    'France': 'Euro_1', 'Germany': 'Euro_1', 'Greece': 'Euro_2',
    'Guatemala': 'Latin-America', 'Haiti': 'Latin-America',
    'Holand-Netherlands': 'Euro_1', 'Honduras': 'Latin-America',
    'Hong': 'China', 'Hungary': 'Euro_2', 'India':
        'British-Commonwealth', 'Iran': 'Other', 'Ireland':
        'British-Commonwealth', 'Italy': 'Euro_1', 'Jamaica':
        'Latin-America', 'Japan': 'Other', 'Laos': 'SE-Asia', 'Mexico':
        'Latin-America', 'Nicaragua': 'Latin-America',
    'Outlying-US(Guam-USVI-etc)': 'Latin-America', 'Peru':
        'South-America', 'Philippines': 'SE-Asia', 'Poland': 'Euro_2',
    'Portugal': 'Euro_2', 'Puerto-Rico': 'Latin-America', 'Scotland':
        'British-Commonwealth', 'South': 'Euro_2', 'Taiwan': 'China',
    'Thailand': 'SE-Asia', 'Trinadad&Tobago': 'Latin-America',
    'United-States': 'United-States', 'Vietnam': 'SE-Asia'
}
married_map = {
    'Never-married': 'Never-Married', 'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married', 'Married-spouse-absent':
        'Separated', 'Separated': 'Separated', 'Divorced':
        'Separated', 'Widowed': 'Widowed'
}


# Transform Continiouse Cap Gains into the discrete variable
def cap_gains_fn(x):
    x = x.astype(float)
    d = np.digitize(x, [0, np.median(x[x > 0]), float('inf')], right=True)
    new_series = pd.Series(["None"] * len(d))
    new_series[d == 0] = 'None'
    new_series[d == 1] = 'Low'
    new_series[d == 2] = 'High'
    return new_series

# Specify transofrmations for each column
transformations = {
    'Education': lambda x: map_array_values(x, education_map),
    'Marital Status': lambda x: map_array_values(x, married_map),
    'Occupation': lambda x: map_array_values(x, occupation_map),
    'Capital Gain': cap_gains_fn,
    'Capital Loss': cap_gains_fn,
    'Country': lambda x: map_array_values(x, country_map),
}

# Load df
df = pd.read_csv("../anchor2/anchor2/examples/data/adult/adult.data", header=None)
df.columns = feature_names
target_labels = pd.Series(df.iloc[:, -1], index=df.index)
df = df.iloc[:, features_to_use]
df.dropna(inplace=True)

# Transform features
for feature, fun in transformations.items():
    df[feature] = fun(df[feature])

# Store dictionary with {Category id -> category classes}
categorical_features_idx = [1, 2, 3, 4, 5, 6, 7, 8, 9, 11]
categorical_names = {}  # Dictionary with (Category id -> category classes)
for f_idx in categorical_features_idx:
    le = LabelEncoder()
    df.iloc[:, f_idx] = le.fit_transform(df.iloc[:, f_idx])
    categorical_names[f_idx] = le.classes_

# Encode target label
le = LabelEncoder()
target_labels = le.fit_transform(target_labels)
class_names = list(le.classes_)

# Split the dataset into train\val\test
train_X, rest_X, train_y, rest_y = sklearn.model_selection.train_test_split(df, target_labels, stratify=target_labels,
                                                                            test_size=0.5, random_state=42)
val_X, test_X, val_y, test_y = sklearn.model_selection.train_test_split(rest_X, rest_y, stratify=rest_y,
                                                                        test_size=0.5, random_state=42)

Check random sample

In [4]:
idx = np.random.choice(range(test_X.shape[0]))
x = test_X.iloc[idx]
print(x)

Age               36
Workclass          4
Education          4
Marital Status     0
Occupation         7
Relationship       0
Race               4
Sex                1
Capital Gain       2
Capital Loss       2
Hours per week    50
Country            9
Name: 14971, dtype: int64


Specify the hydrosphere inference endpoint

In [5]:
service_link = "https://dev.k8s.hydrosphere.io/gateway/application/adult-salary-app"

In [6]:
# Function to store sample in a json with signature specified by Ilnur
def make_signatured_json(sample):
    output_json = {}
    if type(sample) == pd.Series:
        feature_names = sample.index
        values = sample
    else:
        feature_names = [
            "Age",
            "Workclass",
            "Education",
            "Marital Status",
            "Occupation",
            "Relationship",
            "Race",
            "Sex",
            "Capital Gain",
            "Capital Loss",
            "Hours per week",
            "Country"]
        values = sample
        
    for feature_idx, fname in enumerate(feature_names):
        output_json[fname] = [int(v) for v in values.loc[:, fname]]
    return output_json


Get response for an example

In [7]:
response = requests.post(url=service_link, json=make_signatured_json(test_X.iloc[idx]))
prediction = np.array(response.json()["Prediction"])
print(f"Predicted label is {class_names[prediction[0][0]]}")

Predicted label is  <=50K


In [7]:
response = requests.post(url=service_link, json=make_signatured_json(test_X.iloc[idx:idx+500]))
prediction = np.array(response.json()["Prediction"])
print(f"Predicted label is {class_names[prediction[0][0]]}")

Predicted label is  <=50K


Specify the link to Anchor explanation service

In [8]:
anchor_link = "http://0.0.0.0:5000/anchor2_adult" 

Anchor service is already configured for this application, we just need to pass explained service and link to the reqstore

Populate reqstore with data

In [138]:
for idx in tqdm(range(test_X.shape[0])):
    response = requests.post(url=service_link, json=make_signatured_json(test_X.iloc[idx]))
#     time.sleep(0.05)

HBox(children=(IntProgress(value=0, max=8141), HTML(value='')))

In [212]:
service_link = "https://dev.k8s.hydrosphere.io/gateway/application/adult-salary-app"
for idx in tqdm(range(test_X.shape[0]//500 - 1)):
    sample = test_X.iloc[idx*500 :(idx +1)*500]    
    response = requests.post(url=service_link, json=make_signatured_json(sample))
    print(response.status_code, end=".. ")
#     time.sleep(0.05)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 200.. 

Send a request for explanation with our sample

In [10]:
response = requests.post(url=anchor_link, json={"sample": [x.tolist()]})  # Send image in json

In [39]:
# print(response.text)

In [None]:
print("Coverage of this explanation is ", response.json()['coverage'])
print("Precision of this explanation is ", response.json()['precision'])

In [None]:
response.json()['explanation'].split(" AND ")

## TEST REQSTORE - SOMEBODY STOLE MY DATA

In [151]:
import sys
sys.path.append("..")

application_name = "adult-salary-app"
service_link = "https://dev.k8s.hydrosphere.io/api/v2/application/"
reqstore_url = "https://dev.k8s.hydrosphere.io/reqstore/"

r = requests.get(service_link + application_name)
application_id = str(r.json()['executionGraph']['stages'][0]['modelVariants'][0]['modelVersion']["id"])
application_id

'54'

In [152]:
b_data = APIHelper.download(reqstore_url, application_id)
records = BinaryHelper.decode_records(b_data)


In [208]:
all_entries[-4].request



In [173]:
# all_entries = reduce(list.__add__, map(lambda r: r.entries, records))
reqstore_requests  = []
for entry in all_entries:
    try:
        r = entry.request
        reqstore_requests.append(r)
    except:
        print(-1)
        pass
        

-1
-1


In [204]:
reqstore_requests[100]



In [195]:
# TODO Fix processing of requests which has resulted in error
# Helper functions to translate requests into pd.Series
def request_to_df(r):
    columns = []
    values = []
    for key, value in r.inputs.items():
        columns.append(key)
        if len(value.int64_val) == 0:
            pass
#             values.append(np.NAN)  # FIXME why this field is missing?
        else:
            values.append(value.int64_val)
#     print(np.array(values).T)
    columns = list(filter(lambda x: x!='', columns))
    print(columns)

    return pd.DataFrame(columns=columns, data=np.array(values).T)

In [196]:
x = list(map(request_to_df, reqstore_requests))

['Marital Status']
[]


ValueError: Empty data passed with indices specified.

In [156]:
# x = pd.concat(x, sort=False)

# # Remove [1, 1, 1, ...] which results from UI test model calls
# x = x.loc[~np.all(x == np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), axis=1)]
# x.dropna(inplace=True)
# x.drop_duplicates(inplace=True)

In [154]:
from io import BytesIO
from itertools import chain
from urllib.parse import urljoin

import hydro_serving_grpc as hs
import requests


class APIHelper:

    @staticmethod
    def _urljoin(address, name, path):
        return urljoin(address, "/".join([name, path]))

    @staticmethod
    def download(address, name):
        url = APIHelper._urljoin(address, name, "get")
        r = requests.get(url, stream=True)
        return r.content


class BinaryHelper:

    @staticmethod
    def read_int(binary: BytesIO):
        return int.from_bytes(binary.read(4), byteorder='big')

    @staticmethod
    def read_long(binary: BytesIO):
        return int.from_bytes(binary.read(8), byteorder='big')

    @staticmethod
    def read_message(binary: BytesIO, grpc_msg):
        header = binary.read(1)
        data = b'' if header == 0 else binary.read()
        grpc_msg.ParseFromString(data)
        return grpc_msg

    @staticmethod
    def decode_records(data: bytes):
        bio = BytesIO(data)
        size = len(data)
        records = []
        while size > 0:
            length = BinaryHelper.read_int(bio)
            ts = BinaryHelper.read_long(bio)
            body = BytesIO(bio.read(length))
            entries = BinaryHelper.decode_entries(body)
            records.append(TsRecord(ts, entries))
            size = size - length - 4 - 8
        return records

    @staticmethod
    def decode_entries(binary: BytesIO):
        minUid = BinaryHelper.read_long(binary)
        count = BinaryHelper.read_int(binary)
        entries = []
        for i in range(count):
            length = BinaryHelper.read_int(binary)
            data = binary.read(length)
            entry = Entry(minUid + i, data)
            entries.append(entry)
        return entries

    @staticmethod
    def decode_request(data: bytes):
        return BinaryHelper.read_message(BytesIO(data), hs.PredictRequest())

    @staticmethod
    def decode_response(data: bytes):
        bio = BytesIO(data)
        offset = BinaryHelper.read_int(bio)
        print(offset)
        data = BytesIO(bio.read())

        if offset == 2:
            return BinaryHelper.read_message(bio, hs.monitoring.ExecutionError())

        elif offset == 3:
            return BinaryHelper.read_message(data, hs.PredictResponse())
        raise UnicodeDecodeError


class Entry:
    def __init__(self, uid, data):
        self.uid = uid
        self.binary = data
        self.__request = None
        self.__response = None

    @property
    def request(self):
        if not self.__request:
            self._read_binary()
        return self.__request

    @property
    def response(self):
        if not self.__response:
            self._read_binary()
        return self.__response

    def _read_binary(self):
        bio = BytesIO(self.binary)

        request_size = BinaryHelper.read_int(bio)
        response_size = BinaryHelper.read_int(bio)
        self.__request = BinaryHelper.decode_request(bio.read(request_size))
        self.__response = BinaryHelper.decode_response(bio.read(response_size))

    def __repr__(self):
        return f"Entry(uid={self.uid})"


class TsRecord:
    def __init__(self, ts, entries):
        self.ts = ts
        self.entries = entries

    def __repr__(self):
        return f"Record(ts={self.ts}, entries={self.entries})"


def join_entries(records: [TsRecord]):
    return list(chain(*[item.entries for item in records]))
