In [96]:
#imports

from itertools import product
import operator

import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [41]:
#global vars
NUM_QUANTILES = 10

In [43]:
#get dataset
dataBunch = datasets.load_breast_cancer(return_X_y=False)
data_to_use = {"data": dataBunch.data[:,:10],
               "target": dataBunch.target,
               "feature_names": dataBunch.feature_names[:10],
               "feature_locs": {x:i for i,x in enumerate(dataBunch.feature_names[:10])}
              } #make it easy to change out data

In [44]:
#build classifier to explain
classifier = RandomForestClassifier(n_estimators=200, max_depth=2)
classifier.fit(data_to_use["data"], data_to_use["target"])
predictions = classifier.predict(data_to_use["data"])
print accuracy_score(data_to_use["target"], predictions)

0.9420035149384886


In [107]:
#generate data structure for pairwise charts
feature_pairs = {key: {"map":None, "predicates":[]}\
                 for key in set([tuple(sorted(t)) for t in product(data_to_use["feature_names"], repeat=2)])}
for key in feature_pairs.iterkeys():
    x_loc = data_to_use["feature_locs"][key[0]]
    y_loc = data_to_use["feature_locs"][key[1]]
    x_quantiles = np.unique(np.quantile(a=data_to_use["data"][x_loc], q=np.linspace(0, 1, NUM_QUANTILES)))
    y_quantiles = np.unique(np.quantile(a=data_to_use["data"][y_loc], q=np.linspace(0, 1, NUM_QUANTILES)))
    #y_quantiles = data_to_use["data"][y_loc].quantile(np.linspace(0, 1, NUM_QUANTILES)).unique()
    def get_quantile_tuple(x,y):
        return {"x": x_quantiles[int(x)], "y": x_quantiles[int(y)]}
    feature_pairs[key]["map"] = np.fromfunction(np.vectorize(get_quantile_tuple),\
                                                shape=(NUM_QUANTILES, NUM_QUANTILES))
for model in classifier.estimators_:
    feature_ids = {i:{"number":x, "name":data_to_use["feature_names"][x]}\
                      for i,x in enumerate(list(model.tree_.feature)) if x>=0} #-2 means leaf node
    for node_position in [1,4]: #positions for left and right nodes at layer 2
        feature_pair_key = tuple(sorted([feature_ids[0]["name"], feature_ids[node_position]["name"]]))
        #get the decision rules
        decision_func_dict = {
            "feature_name_1": feature_ids[0]["name"],
            "threshold_1": model.tree_.threshold[0],
            
            "operator_1": operator.le if node_position == 1 else operator.gt,
            
            "feature_name_2": feature_ids[node_position]["name"],
            "threshold_2": model.tree_.threshold[node_position],
            
            "operator_2": operator.le,
            
            "prob_le": model.tree_.value[node_position+1][0][1]/\
            (model.tree_.value[node_position+1][0][1] + model.tree_.value[node_position+1][0][0]),
            
            "prob_gt": model.tree_.value[node_position+2][0][1]/\
            (model.tree_.value[node_position+2][0][1] + model.tree_.value[node_position+2][0][0])
        }
        def dt_predicate(data_case, decision_func_dict=decision_func_dict):
            if decision_func_dict["operator_1"](\
                                                data_case[decision_func_dict["feature_name_1"]],\
                                                decision_func_dict["threshold_1"]\
                                               ):
                if decision_func_dict["operator_2"](\
                                                    data_case[decision_func_dict["feature_name_2"]],\
                                                    decision_func_dict["threshold_2"]\
                                                   ):
                    return decision_func_dict["prob_le"]
                else:
                    return decision_func_dict["prob_gt"]
            else:
                return 0.

        feature_pairs[feature_pair_key]["predicates"].append(dt_predicate)

In [108]:
feature_pairs

{('mean area',
  'mean area'): {'map': array([[{'y': 0.09744, 'x': 0.09744}, {'y': 0.1052, 'x': 0.09744},
          {'y': 0.1425, 'x': 0.09744}, {'y': 0.2414, 'x': 0.09744},
          {'y': 0.2597, 'x': 0.09744}, {'y': 0.2839, 'x': 0.09744},
          {'y': 11.42, 'x': 0.09744},
          {'y': 20.379999999999992, 'x': 0.09744},
          {'y': 77.58, 'x': 0.09744}, {'y': 386.1, 'x': 0.09744}],
         [{'y': 0.09744, 'x': 0.1052}, {'y': 0.1052, 'x': 0.1052},
          {'y': 0.1425, 'x': 0.1052}, {'y': 0.2414, 'x': 0.1052},
          {'y': 0.2597, 'x': 0.1052}, {'y': 0.2839, 'x': 0.1052},
          {'y': 11.42, 'x': 0.1052},
          {'y': 20.379999999999992, 'x': 0.1052},
          {'y': 77.58, 'x': 0.1052}, {'y': 386.1, 'x': 0.1052}],
         [{'y': 0.09744, 'x': 0.1425}, {'y': 0.1052, 'x': 0.1425},
          {'y': 0.1425, 'x': 0.1425}, {'y': 0.2414, 'x': 0.1425},
          {'y': 0.2597, 'x': 0.1425}, {'y': 0.2839, 'x': 0.1425},
          {'y': 11.42, 'x': 0.1425},
          {'y':

In [None]:
#https://stats.stackexchange.com/questions/127077/random-forest-probabilistic-prediction-vs-majority-vote