In [1]:
#imports

from itertools import product
import operator

import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import altair as alt
alt.renderers.enable("notebook")

RendererRegistry.enable('notebook')

In [37]:
#global vars
NUM_QUANTILES = 10

In [3]:
#get dataset
dataBunch = datasets.load_breast_cancer(return_X_y=False)
data_to_use = {"data": dataBunch.data[:,:10],
               "target": dataBunch.target,
               "feature_names": dataBunch.feature_names[:10],
               "feature_locs": {x:i for i,x in enumerate(dataBunch.feature_names[:10])}
              } #make it easy to change out data

In [154]:
dataCervicalCancer = pd.read_csv("data/cervical_cancer.csv")
target = dataCervicalCancer.Biopsy
dataCervicalCancer = dataCervicalCancer.drop(["Person", "Biopsy"],axis=1)
data_to_use = {"data": dataCervicalCancer.values,
               "target": target,
               "feature_names": dataCervicalCancer.columns,
               "feature_locs": {x:i for i,x in enumerate(dataCervicalCancer.columns)}   
              }

In [156]:
#build classifier to explain
classifier = RandomForestClassifier(n_estimators=100, max_depth=2)
classifier.fit(data_to_use["data"], data_to_use["target"])
predictions = classifier.predict(data_to_use["data"])
print accuracy_score(data_to_use["target"], predictions)

0.9358974358974359


In [282]:
#generate data structure for pairwise charts
feature_pairs = {key: {"map":None, "predicates":[]}\
                 for key in set([tuple(sorted(t)) for t in product(data_to_use["feature_names"], repeat=2)])}
feature_ranges = {}
for feature in data_to_use["feature_names"]:
    loc = data_to_use["feature_locs"][feature]
    if np.unique(data_to_use["data"][loc]).shape[0] < 10 or type(data_to_use["data"][loc][0]) is str:
        quantiles = np.unique(data_to_use["data"][loc])
    else:
        quantiles = np.around(np.unique(np.quantile(a=data_to_use["data"][loc], q=np.linspace(0, 1, NUM_QUANTILES))),4)
    feature_ranges[feature] = quantiles

def get_quantile_matrix(feat1, feat2):
    h = list(feature_ranges[feat1])*len(feature_ranges[feat2])
    v = [item for item in list(feature_ranges[feat1])\
         for i in range(len(feature_ranges[feat2]))]                        
    return h,v 
    
for key, value in feature_pairs.iteritems():
    h, v = get_quantile_matrix(key[0], key[1])
    value["map"]=np.array([{key[0]:x, key[1]:y}\
                           for x,y in zip(h,v)])\
    .reshape(len(feature_ranges[key[0]]), len(feature_ranges[key[1]]))

for model in classifier.estimators_:
    feature_ids = {i:{"number":x, "name":data_to_use["feature_names"][x]}\
                      for i,x in enumerate(list(model.tree_.feature)) if x>=0} #-2 means leaf node
    
    #need to refactor this part to condense code and prepare for handling higher-depth trees
    
    #for 1-layer trees
    if model.tree_.feature[1] <0:
        feature_pair_key = tuple(sorted([feature_ids[0]["name"], feature_ids[0]["name"]]))
        decision_func_dict = {
            "feature_name": feature_ids[0]["name"],
            "threshold": model.tree_.threshold[0],
            "operator": operator.le,
            "prob_le": model.tree_.value[1][0][1]/(model.tree_.value[1][0][1] + model.tree_.value[1][0][0]),
            "prob_gt": model.tree_.value[2][0][1]/(model.tree_.value[2][0][1] + model.tree_.value[2][0][0])
        }       
        #build the predictive function used in the decision tree
        def dt_predicate(data_case, decision_func_dict=decision_func_dict):
            if decision_func_dict["operator"](\
                                                data_case[decision_func_dict["feature_name"]],\
                                                decision_func_dict["threshold"]\
                                               ):
                return decision_func_dict["prob_le"]
            else:
                return decision_func_dict["prob_gt"]        
    else:
        for node_position in [1,4]: #positions for left and right nodes at layer 2
            if node_position in feature_ids:
                feature_pair_key = tuple(sorted([feature_ids[0]["name"], feature_ids[node_position]["name"]]))
                #get the decision rules
                decision_func_dict = {
                    "feature_name_1": feature_ids[0]["name"],
                    "threshold_1": model.tree_.threshold[0],

                    "operator_1": operator.le if node_position == 1 else operator.gt,

                    "feature_name_2": feature_ids[node_position]["name"],
                    "threshold_2": model.tree_.threshold[node_position],

                    "operator_2": operator.le,

                    "prob_le": model.tree_.value[node_position+1][0][1]/\
                    (model.tree_.value[node_position+1][0][1] + model.tree_.value[node_position+1][0][0]),

                    "prob_gt": model.tree_.value[node_position+2][0][1]/\
                    (model.tree_.value[node_position+2][0][1] + model.tree_.value[node_position+2][0][0])
                }
                #build the predictive function used in the decision tree
                def dt_predicate(data_case, decision_func_dict=decision_func_dict):
                    if decision_func_dict["operator_1"](\
                                                        data_case[decision_func_dict["feature_name_1"]],\
                                                        decision_func_dict["threshold_1"]\
                                                       ):
                        if decision_func_dict["operator_2"](\
                                                            data_case[decision_func_dict["feature_name_2"]],\
                                                            decision_func_dict["threshold_2"]\
                                                           ):
                            return decision_func_dict["prob_le"]
                        else:
                            return decision_func_dict["prob_gt"]
                    else:
                        return 0.
            
            else: #asymmetric tree, this is a leaf node
                feature_pair_key = tuple(sorted([feature_ids[0]["name"], feature_ids[0]["name"]]))
                decision_func_dict = {
                    "feature_name": feature_ids[0]["name"],
                    "threshold": model.tree_.threshold[0],
                    "operator": operator.le if node_position == 1 else operator.gt,
                    
                    "prob_le": 1. if node_position == 1 else 0.,
                    
                    "prob_gt": 0. if node_position == 1 else 1.,
                }       
                #build the predictive function used in the decision tree
                def dt_predicate(data_case, decision_func_dict=decision_func_dict):
                    if decision_func_dict["operator"](\
                                                        data_case[decision_func_dict["feature_name"]],\
                                                        decision_func_dict["threshold"]\
                                                       ):
                        return decision_func_dict["prob_le"]
                    else:
                        return decision_func_dict["prob_gt"]                  

        feature_pairs[feature_pair_key]["predicates"].append(dt_predicate)

In [283]:
#now calculate output array for each feature pair
for key, value in feature_pairs.iteritems():
    arrs = []
    for predicate in value["predicates"]:
        f = np.vectorize(predicate)
        arrs.append(f(value["map"]))
    if len(arrs) > 0:
        #details of vote aggreggation method for random forest
        #hpairttps://stats.stackexchange.com/questions/127077/random-forest-probabilistic-prediction-vs-majority-vote
        value["output"] = np.sum(np.stack(arrs, axis=-1), axis=-1)
    else:
        placeholder = np.empty(value["map"].shape)
        placeholder.fill(None)
        value["output"] = placeholder

In [284]:
def get_pair_column_format(feat1, feat2, modifier=None):
    return feat1 + "," + feat2 + ("(FffT_PROP_"+modifier+")" if modifier is not None else "")

In [302]:
chart_data = {}
for key1 in feature_ranges.iterkeys():
    for key2 in feature_ranges.iterkeys():
        h,v = get_quantile_matrix(key1, key2)
        chart_data[get_pair_column_format(key1, key2, "V")] = v
        chart_data[get_pair_column_format(key1, key2, "H")] = h
for key, value in feature_pairs.iteritems():
    chart_data[get_pair_column_format(key[0], key[1])] =\
    [x if not np.isnan(x) else "IGNORE" for x in list(value["output"].ravel())]

In [303]:
offset = 0
for key, value in chart_data.iteritems():
    if "(FffT_PROP_" not in key:       
        srs = pd.Series(value)
        if value[0] == "IGNORE":
            print "no predictors found: " + key
        elif srs.apply(lambda x: x==0.).all():
            print "all zeroes: " + key
            chart_data[key] = np.array([None for x in range(len(value))])
        elif srs.std()/srs.mean() < 0.1:
            print "low std: " + key
            offset += srs.mean()
            chart_data[key] = np.array([None for x in range(len(value))])
        elif srs.max() < 1.:
            print "low impact: " + key
            chart_data[key] = np.array([None for x in range(len(value))])
            offset += srs.mean()
chart_data["dummy_x"] = [0,0,1,1]
chart_data["dummy_y"] = [0,1,0,1]
chart_data["dummy_color"] = [0,0,0,0]
print offset

all zeroes: Age,Smokes (years)
low impact: Age,Num of pregnancies
no predictors found: First sexual intercourse,Smokes (packs/year)
low impact: Age,Number of sexual partners
no predictors found: First sexual intercourse,Smokes (years)
no predictors found: IUD (years),IUD (years)
low impact: Age,IUD (years)
no predictors found: Num of pregnancies,Num of pregnancies
no predictors found: First sexual intercourse,Number of sexual partners
low impact: Number of sexual partners,Smokes (years)
low impact: Number of sexual partners,STDs (number)
all zeroes: First sexual intercourse,First sexual intercourse
no predictors found: STDs (number),STDs (number)
no predictors found: Num of pregnancies,STDs (number)
low impact: STDs (number),Smokes (packs/year)
no predictors found: Num of pregnancies,Number of sexual partners
all zeroes: Age,Age
low impact: Age,First sexual intercourse
no predictors found: IUD (years),STDs (number)
no predictors found: IUD (years),Smokes (packs/year)
low impact: Age,ST

In [305]:
chart_data

{'Age,Age': array([None, None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None, None, None,
        None, None, None], dtype=object),
 'Age,Age(FffT_PROP_H)': [0.0,
  1.0,
  4.0,
  15.0,
  18.0,
  0.0,
  1.0,
  4.0,
  15.0,
  18.0,
  0.0,
  1.0,
  4.0,
  15.0,
  18.0,
  0.0,
  1.0,
  4.0,
  15.0,
  18.0,
  0.0,
  1.0,
  4.0,
  15.0,
  18.0],
 'Age,Age(FffT_PROP_V)': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  15.0,
  15.0,
  15.0,
  15.0,
  15.0,
  18.0,
  18.0,
  18.0,
  18.0,
  18.0],
 'Age,First sexual intercourse': array([None, None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None], dtype=object),
 'Age,First sexual intercourse(FffT_PROP_H)': [0.0,
  1.0,
  4.0,
  15.0,
  18.0,
  0.0,
  1.0,
  4.0,
  15.0,
  18.0,
  0.0,
  1.0,
  4.0,
  15.0,
  18.0,
  0.0,
  1.0,
  4.0,
  15.0,
  18.0],
 

In [280]:
base = alt.Chart().properties(
    width=60,
    height=60
)

data = alt.DictInlineDataset().from_dict(chart_data)
chart = alt.vconcat(data=data)
for irow, y_feature in enumerate(data_to_use["feature_names"]):
    row = alt.hconcat()
    for icol, x_feature in enumerate(data_to_use["feature_names"]):
        if chart_data[get_pair_column_format(temp_sort[0], temp_sort[1])] is not None\
        and irow<=icol:        
            temp_sort = sorted([x_feature, y_feature])
            x_encoding = alt.X(field=get_pair_column_format(temp_sort[0], temp_sort[1],"H"),\
                               type="ordinal", sort="ascending",\
                               axis=alt.Axis(title=x_feature))
            y_encoding = alt.Y(field=get_pair_column_format(temp_sort[0], temp_sort[1],"V"),\
                               type="ordinal", sort="descending",\
                               axis=alt.Axis(title=y_feature if icol==0 else ""))

            color_encoding = alt.Color(field= get_pair_column_format(temp_sort[0], temp_sort[1]),\
                                                                         type="quantitative",\
                                                                         scale=alt.Scale(scheme="greenblue"),\
                                                                         legend=alt.Legend(title="Votes"))
        else:
            x_encoding=alt.X(field="dummy_x", type="ordinal")
            y_encoding=alt.Y(field="dummy_y", type="ordinal")
            color_encoding=alt.Color(field="dummy_color", type="quantitative")
        row |= base.mark_rect().encode(x=x_encoding, y=y_encoding, color=color_encoding)
    chart &= row
chart

ValidationError: array([None, None, None, None, None, None, None, None, None, None, None,
       None, None, None, None, None, None, None, None, None, None, None,
       None, None, None, None, None, None, None, None, None, None, None,
       None, None], dtype=object) is not valid under any of the given schemas

Failed validating u'anyOf' in schema[u'additionalProperties']:
    {u'anyOf': [{u'items': {u'type': u'number'}, u'type': u'array'},
                {u'items': {u'type': u'string'}, u'type': u'array'},
                {u'items': {u'type': u'boolean'}, u'type': u'array'},
                {u'items': {u'type': u'object'}, u'type': u'array'},
                {u'type': u'string'},
                {u'type': u'object'}]}

On instance['Age,Num of pregnancies']:
    array([None, None, None, None, None, None, None, None, None, None, None,
           None, None, None, None, None, None, None, None, None, None, None,
           None, None, None, None, None, None, None, None, None, None, None,
           None, None], dtype=object)

In [None]:
chart_data = pd.DataFrame(vis_df_dict)
offset = 0
for col in [x for x in chart_data.columns if "(FffT_PROP_" not in x]:
    srs = chart_data.loc[:,col]
    if srs.apply(lambda x: x==0.).all():
        print "all zeroes: " + col
        chart_data[col] = None
    elif srs.std()/srs.mean() < 0.1:
        print "low std: " + col
        offset += srs.mean()
        chart_data[col] = None
    elif srs.max() < 1.:
        print "low impact: " + col
        chart_data[col] = None
        offset += srs.mean()
chart_data["dummy_null"] = None
print offset

In [196]:
base = alt.Chart().properties(
    width=60,
    height=60
)

chart = alt.vconcat(data=chart_data)
for irow, y_feature in enumerate(data_to_use["feature_names"]):
    row = alt.hconcat()
    for icol, x_feature in enumerate(data_to_use["feature_names"]):
        x_encoding = alt.X(field=x_feature+"_H", type="ordinal", sort="ascending",\
                           axis=alt.Axis(title=x_feature))
        y_encoding = alt.Y(field=y_feature+"_V", type="ordinal", sort="descending",\
                           axis=alt.Axis(title=y_feature if icol==0 else ""))
        temp_sort = sorted([x_feature, y_feature])
        if chart_data.loc[:,"pair: " + temp_sort[0] + "," + temp_sort[1]].iloc[0] is not None\
        and irow<=icol:
            color_encoding = alt.Color(field= "pair: " + temp_sort[0] + "," + temp_sort[1]\
            , type="quantitative", scale=alt.Scale(scheme="greenblue"), legend=alt.Legend(title="Votes"))
        else:
            color_encoding=alt.Color(field="dummy_null", type="quantitative")
        row |= base.mark_rect().encode(x=x_encoding, y=y_encoding, color=color_encoding)
    chart &= row
chart

AttributeError: 'dict' object has no attribute 'loc'