In [1]:
#imports
from __future__ import division
import time
import datetime
import copy
from itertools import product
import operator
from collections import OrderedDict

import numpy as np
import pandas as pd

from sklearn import datasets
# A multi-line ``from ... import`` list must be parenthesised; the bare
# trailing-comma form is a SyntaxError and stops this whole cell from running.
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingRegressor,
    RandomForestRegressor,
    GradientBoostingClassifier,
)
from sklearn.metrics import accuracy_score, r2_score

import altair as alt
alt.renderers.enable("notebook")

RendererRegistry.enable('notebook')

In [2]:
class ForestForTheTrees(object):
    """Holds a tabular dataset and a shallow tree ensemble fitted on it.

    Typical use is ``load_dataset`` followed by ``build_base_model`` and
    ``get_model_accuracy``.  ``target_type`` keeps the capitalised spelling
    returned by ``get_dataset`` ("Classification" / "Regression"); comparisons
    inside this class are case-insensitive.
    """

    def __init__(self):
        self.dataset = None
        self.x = None
        self.y = None
        self.feature_names = None
        self.feature_locs = None
        self.target_type = None
        self.model_type = None
        self.classifier_type = None
        self.classifier = None
        self.model = None
        self.pred_y = None
        self.feature_ranges = {}
        self.mean_prediction = None
        self.offset = None

    @staticmethod
    def get_dataset(dataset):
        """Load one of the known datasets.

        Args:
            dataset: "breast cancer", "cervical cancer" or "bike".

        Returns:
            dict with keys "x" (feature matrix), "y" (target), "feature_names",
            "feature_locs" (feature name -> column index in "x") and
            "target_type" ("Classification" or "Regression").

        Raises:
            ValueError: if ``dataset`` is not one of the known names.
        """
        if dataset == "breast cancer":
            data_bunch = datasets.load_breast_cancer(return_X_y=False)
            # only the first ten columns are used
            feature_names = data_bunch.feature_names[:10]
            return {"x": data_bunch.data[:, :10],
                    "y": data_bunch.target,
                    "feature_names": feature_names,
                    "feature_locs": {name: i for i, name in enumerate(feature_names)},
                    "target_type": "Classification"
                   }
        elif dataset == "cervical cancer":
            data_load = pd.read_csv("data/cervical_cancer.csv")
            target = data_load.Biopsy
            data_load = data_load.drop(["Person", "Biopsy"], axis=1)
            return {"x": data_load.values,
                    "y": target,
                    "feature_names": data_load.columns,
                    "feature_locs": {name: i for i, name in enumerate(data_load.columns)},
                    "target_type": "Classification"
                   }
        elif dataset == "bike":
            def _datestr_to_timestamp(s):
                return time.mktime(datetime.datetime.strptime(s, "%Y-%m-%d").timetuple())

            data_load = pd.read_csv('data/bike.csv')
            data_load['dteday'] = data_load['dteday'].apply(_datestr_to_timestamp)
            data_load = pd.get_dummies(data_load, prefix=["weathersit"], columns=["weathersit"], drop_first=False)

            #de-normalize data to produce human-readable features.
            #Original range info from http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
            data_load["hum"] = data_load["hum"].apply(lambda x: x*100.)
            data_load["windspeed"] = data_load["windspeed"].apply(lambda x: x*67.)
            #convert Celsius to Fahrenheit
            data_load["temp"] = data_load["temp"].apply(lambda x: (x*47. - 8)*9/5 +32)
            data_load["atemp"] = data_load["atemp"].apply(lambda x: (x*66. - 16)*9/5 + 32)

            #rename features to make them interpretable for novice users
            feature_names_dict = {
                "yr":"First or Second Year",
                "season":"Season",
                "hr":"Hour of Day",
                "workingday":"Work Day",
                "weathersit_2":"Misty Weather",
                "weathersit_3":"Light Precipitation",
                "weathersit_4":"Heavy Precipitation",
                "temp":"Temperature (F)",
                "atemp":"Feels Like (F)",
                "hum":"Humidity",
                "windspeed":"Wind Speed"
            }
            data_load = data_load.rename(mapper=feature_names_dict, axis=1)
            # a real list is required: a dict view cannot be used to select columns
            features = list(feature_names_dict.values())

            return {"x": data_load[features].values,
                    "y": data_load["cnt"],
                    "feature_names": features,
                    "feature_locs": {name: i for i, name in enumerate(features)},
                    "target_type": "Regression"
                   }
        raise ValueError("unknown dataset: %r" % (dataset,))

    def get_quantiles(self, feat, num_tiles=20, quantiles=False):
        """Return the grid of values used to tile feature ``feat``.

        Features with fewer than 30 distinct values, or holding strings, are
        treated as categorical/ordinal and every distinct value is returned.
        Otherwise ``num_tiles`` points are returned, rounded to one decimal:
        quantiles of the column when ``quantiles`` is true, else evenly spaced
        points between the column minimum and maximum.
        """
        column = self.x[:, self.feature_locs[feat]]
        distinct = np.unique(column)
        if distinct.shape[0] < 30 or isinstance(column[0], str):
            return distinct
        if quantiles:
            return np.around(
                np.unique(np.quantile(a=column, q=np.linspace(0, 1, num_tiles))),
                1)
        return np.around(np.linspace(np.min(column), np.max(column), num_tiles), 1)

    def load_dataset(self, dataset):
        """Load ``dataset`` into this instance and compute its feature grids."""
        self.dataset = dataset
        data = self.get_dataset(self.dataset)
        self.x = data["x"]
        self.y = data["y"]
        self.feature_names = data["feature_names"]
        self.feature_locs = data["feature_locs"]
        self.target_type = data["target_type"]

        # start from a clean slate so grids of a previously loaded dataset do not linger
        self.feature_ranges = {}
        for feature in self.feature_names:
            self.feature_ranges[feature] = self.get_quantiles(feature)

    def build_base_model(self, num_estimators, model_type, learning_rate):
        """Fit a depth-2 ensemble on the loaded data and store its predictions.

        Args:
            num_estimators: number of trees.
            model_type: "random forest" or "gradient boosting".
            learning_rate: forwarded to the gradient boosting estimators only;
                the random forest estimators have no such parameter.

        Raises:
            KeyError: if the (target type, model type) pair is not supported.
        """
        model_lookup_dict = {
            ("classification", "random forest") : RandomForestClassifier,
            ("classification", "gradient boosting") : GradientBoostingClassifier,
            ("regression", "random forest") : RandomForestRegressor,
            ("regression", "gradient boosting") : GradientBoostingRegressor
        }

        self.model_type = model_type
        # target_type is stored capitalised; the lookup table is lowercase
        self.classifier_type = model_lookup_dict[(str(self.target_type).lower(), self.model_type)]

        model_kwargs = {"n_estimators": num_estimators, "max_depth": 2}
        if self.model_type == "gradient boosting":
            model_kwargs["learning_rate"] = learning_rate
        self.model = self.classifier_type(**model_kwargs)
        self.model.fit(self.x, self.y)
        self.pred_y = self.model.predict(self.x)

    def get_model_accuracy(self):
        """Score the fitted model on its training data.

        Returns accuracy for classification targets and R^2 otherwise.
        """
        if str(self.target_type).lower() == "classification":
            return accuracy_score(self.y, self.pred_y)
        return r2_score(self.y, self.pred_y)

In [4]:
def calculate(self, num_tiles=20, collapse_1d = True, quantiles=False):
    """Decompose a fitted depth-2 tree ensemble into per-feature-pair vote grids.

    Every tree is turned into predicates over a (root feature, child feature)
    pair; each predicate is evaluated on the grid spanned by the two features'
    ranges and the votes of all predicates of a pair are summed.  Grids are
    indexed ``[row, column]`` with rows following ``key[1]`` and columns
    following ``key[0]``.

    Args:
        self: object exposing ``model`` (fitted ensemble with ``estimators_``),
            ``feature_names``, ``feature_ranges`` and ``target_type``; ``x`` and
            ``feature_locs`` are only read for features that have no entry in
            ``feature_ranges`` yet.
        num_tiles: number of grid points for continuous features that still
            need a range computed.
        collapse_1d: when true, grids that vary along one axis only are
            collapsed to one dimension and folded into the strongest remaining
            2-D grid that shares the feature.
        quantiles: use quantiles instead of evenly spaced points for ranges
            computed here.

    Returns:
        ``(feature_pairs, feature_ranges, output_details)``:
        ``feature_pairs`` is an ``OrderedDict`` of the kept grids sorted by the
        standard deviation of their output, largest first;
        ``feature_ranges`` maps feature name to its grid values;
        ``output_details`` holds the summed ``"offset"`` removed when centring
        the grids plus the lists ``"no_predictor_features"``, ``"1d_features"``
        and ``"dropped_features"``.

    Only the root split and its two children are inspected, so trees are
    expected to have depth <= 2.
    """

    def _get_coordinate_matrix(lst, length, direction):
        if direction == "h":
            return lst * length
        return [item for item in lst for _ in range(length)]

    def get_quantiles(feat):
        column = self.x[:, self.feature_locs[feat]]
        distinct = np.unique(column)
        if distinct.shape[0] < 30 or isinstance(column[0], str): #is categorical/ordinal?
            return distinct
        if quantiles:
            return np.around(
                np.unique(np.quantile(a=column, q=np.linspace(0, 1, num_tiles))),
                1)
        return np.around(np.linspace(np.min(column), np.max(column), num_tiles), 1)

    # Work on a copy so self.feature_ranges is not modified; ranges that were
    # already computed are reused, only missing ones are derived here.
    feature_ranges = dict(getattr(self, "feature_ranges", None) or {})
    for feature in self.feature_names:
        if feature not in feature_ranges:
            feature_ranges[feature] = get_quantiles(feature)

    def get_quantile_matrix(feat1, feat2):
        h = _get_coordinate_matrix(list(feature_ranges[feat1]), len(feature_ranges[feat2]), "h")
        v = _get_coordinate_matrix(list(feature_ranges[feat2]), len(feature_ranges[feat1]), "v")
        return h, v

    def get_leaf_value(model, node_position):
        node_value = model.tree_.value[node_position][0]
        # Trees that store one entry per class yield the share of class 1;
        # single-valued nodes yield that value.
        if str(self.target_type).lower() == "classification" and len(node_value) > 1:
            return float(node_value[1] / (node_value[1] + node_value[0]))
        return float(node_value[0])

    def get_feature_pair_key(feat1, feat2):
        size1 = len(feature_ranges[feat1])
        size2 = len(feature_ranges[feat2])
        if size1 == size2:
            #need stable order so keys with same number of quantiles appear in only one order
            return tuple(sorted([feat1, feat2]))
        return (feat1, feat2) if size1 > size2 else (feat2, feat1)

    def reduce_to_1d(arr, threshold, direction):
        if direction == "h":
            reduced_arr = arr - arr[:, 0].reshape(-1, 1)
        else:
            reduced_arr = arr - arr[0, :].reshape(1, -1)
        scale = np.max(np.abs(arr))
        if scale == 0:
            # an all-zero grid is constant along every axis; also avoids 0/0
            return True
        return (np.max(np.abs(reduced_arr)) / scale) <= threshold

    def _leaf_predicate(feature_name, threshold, compare, vote):
        def dt_predicate(data_case):
            return vote if compare(data_case[feature_name], threshold) else 0.
        return dt_predicate

    def _two_level_predicate(name_1, threshold_1, compare_1, name_2, threshold_2, vote_le, vote_gt):
        def dt_predicate(data_case):
            if not compare_1(data_case[name_1], threshold_1):
                return 0.
            return vote_le if data_case[name_2] <= threshold_2 else vote_gt
        return dt_predicate

    #generate data structure for pairwise charts
    feature_pairs = {}
    for feat1, feat2 in product(self.feature_names, repeat=2):
        feature_pairs.setdefault(get_feature_pair_key(feat1, feat2),
                                 {"map": None, "predicates": []})

    for key, value in feature_pairs.items():
        h, v = get_quantile_matrix(key[0], key[1])
        cells = np.empty(len(h), dtype=object)
        for idx, (x, y) in enumerate(zip(h, v)):
            cells[idx] = {key[0]: x, key[1]: y}
        value["map"] = cells.reshape(len(feature_ranges[key[1]]), len(feature_ranges[key[0]]))
        value["H_Indices"] = h
        value["V_Indices"] = v

    constant_votes = 0.
    for estimator in self.model.estimators_:
        # some ensembles keep an array of trees per stage; the first one is used
        model = estimator[0] if isinstance(estimator, np.ndarray) else estimator
        tree = model.tree_
        if tree.feature[0] < 0:
            # tree without any split: a constant vote, accounted for in the offset
            constant_votes += get_leaf_value(model, 0)
            continue
        root_name = self.feature_names[tree.feature[0]]
        root_threshold = tree.threshold[0]
        # Child ids are read from the tree: fixed positions are wrong as soon
        # as the left child is a leaf.
        branches = ((tree.children_left[0], operator.le),
                    (tree.children_right[0], operator.gt))
        for child, root_compare in branches:
            if tree.feature[child] >= 0:
                child_name = self.feature_names[tree.feature[child]]
                feature_pair_key = get_feature_pair_key(root_name, child_name)
                dt_predicate = _two_level_predicate(
                    root_name, root_threshold, root_compare,
                    child_name, tree.threshold[child],
                    get_leaf_value(model, tree.children_left[child]),
                    get_leaf_value(model, tree.children_right[child]))
            else: #this child is a leaf node
                feature_pair_key = get_feature_pair_key(root_name, root_name)
                dt_predicate = _leaf_predicate(
                    root_name, root_threshold, root_compare,
                    get_leaf_value(model, child))
            feature_pairs[feature_pair_key]["predicates"].append(dt_predicate)

    #now calculate output array for each feature pair
    for key, value in feature_pairs.items():
        votes = [np.vectorize(predicate, otypes=[float])(value["map"])
                 for predicate in value["predicates"]]
        #details of vote aggreggation method for random forest
        #https://stats.stackexchange.com/questions/127077/random-forest-probabilistic-prediction-vs-majority-vote
        value["output"] = np.sum(votes, axis=0) if votes else None

    output_details = {"offset": constant_votes,
                      "no_predictor_features": [],
                      "1d_features": [],
                      "dropped_features": []}
    for key, value in feature_pairs.items():
        if value["output"] is None:
            output_details["no_predictor_features"].append(key)
            value["removed"] = True
            continue
        if collapse_1d:
            collapsed_key = None
            if reduce_to_1d(value["output"], 0., "v"):
                # all rows identical: the grid only varies along columns (key[0])
                collapsed_key = key[0]
                collapsed_output = value["output"][0, :]
            elif reduce_to_1d(value["output"], 0., "h"):
                # all columns identical: the grid only varies along rows (key[1])
                collapsed_key = key[1]
                collapsed_output = value["output"][:, 0]
            if collapsed_key is not None:
                value["output"] = collapsed_output
                value["H_Indices"] = feature_ranges[collapsed_key]
                value["V_Indices"] = None
                value["1d_key"] = collapsed_key
                value["removed"] = True
                output_details["1d_features"].append(key)

        #subtract mean from votes to center at zero
        vote_mean = value["output"].mean()
        output_details["offset"] += vote_mean
        value["output"] = value["output"] - vote_mean

    #push 1d charts into the strongest 2d chart that contains the same feature
    if collapse_1d:
        for value in feature_pairs.values():
            if "1d_key" not in value:
                continue
            feature = value["1d_key"]
            candidates = [(np.std(v_inner["output"]), k_inner)
                          for k_inner, v_inner in feature_pairs.items()
                          if "removed" not in v_inner and feature in k_inner]
            if not candidates:
                # nothing to fold into: record it instead of losing it silently
                output_details["dropped_features"].append(feature)
                continue
            match_key = max(candidates, key=lambda candidate: candidate[0])[1]
            feature_pairs[match_key]["output"] = feature_pairs[match_key]["output"]\
                + value["output"].reshape(
                    -1 if feature == match_key[1] else 1,
                    -1 if feature == match_key[0] else 1
                )

    #one last loop to generate the horizontal and vertical components
    for value in feature_pairs.values():
        if "removed" in value:
            continue
        output = value["output"]
        value["output_H"] = np.mean(output, axis=1).reshape(-1, 1)
        value["output_V"] = np.mean(output, axis=0).reshape(1, -1)
        value["output_HReduced"] = np.mean(output - value["output_V"].reshape(1, -1), axis=1)\
            .reshape(1, -1)
        value["output_VReduced"] = np.mean(output - value["output_H"].reshape(-1, 1), axis=0)\
            .reshape(-1, 1)

    #remove deleted keys and order the rest by importance
    kept = [(key, val) for key, val in feature_pairs.items() if "removed" not in val]
    kept.sort(key=lambda item: np.std(item[1]["output"]), reverse=True)
    return OrderedDict(kept), feature_ranges, output_details

In [5]:
def generate_chart(data, raw_data, scheme="redyellowblue", plot_points=True, fields_subset=None):
    """Lay out one heat-map per entry of ``data`` in rows of four charts.

    Args:
        data: mapping from chart key to a dict providing ``"dimension"`` (1 or
            2) and ``"df"`` (the chart's data, with ``H_Indices``, ``Votes``
            and, for 2-D charts, ``V_Indices`` fields).  2-D keys are
            ``(x feature, y feature)`` pairs, 1-D keys are the feature name.
        raw_data: dict with ``"data"`` (2-D array of samples) and
            ``"feature_names"``; only read when ``plot_points`` is true.
        scheme: colour scheme name passed to the colour scale.
        plot_points: overlay a random sample of at most 300 raw data points on
            every 2-D chart.
        fields_subset: optional collection of keys; other charts are skipped.

    Returns:
        The vertical concatenation of the chart rows.
    """
    charts_per_row = 4
    max_points = 300

    charts = []
    for key, value in data.items():
        if fields_subset is not None and key not in fields_subset:
            continue
        is_2d = value["dimension"] == 2

        x_encoding = alt.X(field="H_Indices",
                           type="ordinal", sort="ascending",
                           axis=alt.Axis(title=key[0] if is_2d else key))
        color_encoding = alt.Color(field="Votes",
                                   type="quantitative",
                                   scale=alt.Scale(scheme=scheme),
                                   legend=alt.Legend(title="Votes"))
        chart = alt.Chart(data=value["df"]).mark_rect()
        if is_2d:
            y_encoding = alt.Y(field="V_Indices",
                               type="ordinal", sort="descending",
                               axis=alt.Axis(title=key[1]))
            chart = chart.encode(x=x_encoding, y=y_encoding, color=color_encoding)\
                .properties(width=150, height=150)
        else:
            chart = chart.encode(x=x_encoding, color=color_encoding)\
                .properties(width=150, height=20)

        # The scatter overlay needs two feature names, so it only applies to
        # 2-D charts (a 1-D key is a plain string).
        if plot_points and is_2d:
            n_available = raw_data["data"].shape[0]
            # sampling without replacement cannot draw more rows than exist
            sampled_rows = np.random.choice(n_available, min(max_points, n_available), replace=False)
            df = pd.DataFrame(raw_data["data"][sampled_rows, :],
                              columns=raw_data["feature_names"])
            points = alt.Chart(df).mark_circle(
                color='black',
                size=5
            ).encode(
                x=alt.X(field=key[0], type="quantitative", sort="ascending", axis=None),
                y=alt.Y(field=key[1], type="quantitative", sort="ascending", axis=None)
            ).properties(width=150, height=150)
            chart = chart + points

        charts.append(chart)

    # chunk the selected charts so no empty row is ever emitted
    rows = [alt.hconcat(*charts[start:start + charts_per_row])
            for start in range(0, len(charts), charts_per_row)]
    return alt.vconcat(*rows)

In [132]:
def evaluate(data,
             raw_data,
             feature_ranges,
             classifier,
             chart_details,
             sample_size,
             fidelity_threshold=1.,
             rollup=None,
             sample_offset=100
            ):
    """Greedily pick the charts that best reproduce the model's predictions.

    Starting from a constant prediction, the chart whose contribution raises
    the R^2 against ``classifier.predict`` the most is added in every round,
    until the score reaches ``fidelity_threshold`` or every chart is used.

    Args:
        data: mapping from chart key ``(key[0], key[1])`` to a dict with the
            arrays ``"output"``, ``"output_H"``, ``"output_V"``,
            ``"output_HReduced"`` and ``"output_VReduced"``; ``"output"`` is
            indexed ``[bin of key[1], bin of key[0]]``.
        raw_data: dict with ``"data"`` (2-D sample array), ``"feature_names"``,
            ``"target"`` and ``"target_type"``.
        feature_ranges: mapping from feature name to its bin edges.
        classifier: fitted model exposing ``predict``.
        chart_details: dict whose ``"offset"`` is added to the target mean to
            form the constant base prediction.
        sample_size: maximum number of rows to evaluate on.
        fidelity_threshold: score at which the search stops.
        rollup: ``None``, ``"simple"`` or ``"advanced"``.  With ``"advanced"``
            the charts not in the candidate explanation are folded into it
            before every candidate is scored; with ``"simple"`` this is done
            once per round, after the best chart was picked.
        sample_offset: index of the first row of ``raw_data["data"]`` to use.

    Returns:
        ``(explanation, evaluation_details)``: the chosen keys in order, and a
        dict keyed by round.  Round 0 holds the base ``"score"``; later rounds
        hold every candidate's score plus ``"best_key"`` and ``"score"``.

    Raises:
        NotImplementedError: if ``raw_data["target_type"]`` is not regression;
            no fidelity score is defined here for other target types.
    """
    if str(raw_data["target_type"]).lower() != "regression":
        raise NotImplementedError(
            "evaluate only supports target_type 'Regression', got %r" % (raw_data["target_type"],))

    sample = raw_data["data"][sample_offset:sample_offset+sample_size,:]
    # the slice may be shorter than sample_size; size everything from what was actually taken
    n_samples = sample.shape[0]
    model_predictions = classifier.predict(sample)
    base_predictions = np.full((n_samples, 1), chart_details["offset"] + np.mean(raw_data["target"]))

    def get_explanation_accuracy(explanation_predictions):
        return r2_score(model_predictions, explanation_predictions)

    def get_prediction_contributions(chart, data_positions):
        return np.take(chart, data_positions)

    def sum_arrays(arrMain, arrAdd, horizontal, keyMain, keyAdd):
        return arrMain + arrAdd.reshape(arrMain.shape[0] if\
                                        (keyMain[1]==keyAdd[1] or keyMain[1]==keyAdd[0]) else 1, -1)

    def bin_data():
        """Map every sampled row to its flat cell index in each chart's output."""
        frame = pd.DataFrame(sample, columns=raw_data["feature_names"])
        positions = {}
        for key in data:
            n_h = len(feature_ranges[key[0]])
            n_v = len(feature_ranges[key[1]])
            # clip so values outside the bin edges land in the nearest bin
            # instead of wrapping around into an unrelated cell
            h_bin = np.clip(np.digitize(frame.loc[:, key[0]].values, feature_ranges[key[0]]) - 1, 0, n_h - 1)
            v_bin = np.clip(np.digitize(frame.loc[:, key[1]].values, feature_ranges[key[1]]) - 1, 0, n_v - 1)
            positions[key] = v_bin * n_h + h_bin
        return positions

    def roll_up(rollup_keys, existing_keys):
        """Fold the marginals of ``rollup_keys`` into charts of ``existing_keys``."""
        # sum_arrays builds new arrays, so copying the inner dicts is enough
        temp_outputs = {key: dict(val) for key, val in chart_values.items()}
        for keyRollup in rollup_keys:
            hUsed = False
            vUsed = False
            for keyExisting in existing_keys:
                if (keyRollup[1] == keyExisting[0] or keyRollup[1] == keyExisting[1]) and not hUsed:
                    hUsed = True
                    component = "output_HReduced" if vUsed else "output_H"
                    temp_outputs[keyExisting]["output"] = sum_arrays(
                        temp_outputs[keyExisting]["output"],
                        temp_outputs[keyRollup][component],
                        True,
                        keyExisting,
                        keyRollup
                    )
                    if vUsed:
                        break
                elif (keyRollup[0] == keyExisting[0] or keyRollup[0] == keyExisting[1]) and not vUsed:
                    vUsed = True
                    component = "output_VReduced" if hUsed else "output_V"
                    temp_outputs[keyExisting]["output"] = sum_arrays(
                        temp_outputs[keyExisting]["output"],
                        temp_outputs[keyRollup][component],
                        False,
                        keyExisting,
                        keyRollup
                    )
                    if hUsed:
                        break
        return temp_outputs

    def score_explanation(outputs, keys):
        contributions = np.sum(np.array([
            get_prediction_contributions(outputs[expKey]["output"], prediction_contributions[expKey])
            for expKey in keys]), axis=0).reshape(-1, 1)
        return get_explanation_accuracy(base_predictions + contributions)

    prediction_contributions = bin_data()
    chart_values = {key: {
        "output" : val["output"],
        "output_VReduced" : val["output_VReduced"],
        "output_H" : val["output_H"],
        "output_HReduced" : val["output_HReduced"],
        "output_V" : val["output_V"]
    } for key, val in data.items()}

    explanation = []
    evaluation_details = {}
    evaluation_details[0] = {"score": get_explanation_accuracy(base_predictions)}
    fidelity = evaluation_details[0]["score"]

    i = 1
    while fidelity < fidelity_threshold and len(explanation) < len(data):
        candidate_scores = {}
        keys_to_evaluate = [key for key in data if key not in explanation]
        for key in keys_to_evaluate:
            keysInCurrentExplanation = explanation + [key]
            if rollup == "advanced":
                temp_outputs = roll_up([k for k in keys_to_evaluate if k != key],
                                       keysInCurrentExplanation)
            else:
                temp_outputs = chart_values
            candidate_scores[key] = score_explanation(temp_outputs, keysInCurrentExplanation)

        #get key with highest fidelity score
        best_key = max(candidate_scores, key=candidate_scores.get)
        explanation.append(best_key)
        evaluation_details[i] = dict(candidate_scores)
        evaluation_details[i]["best_key"] = best_key

        if rollup == "simple":
            temp_outputs = roll_up([k for k in data if k not in explanation], explanation)
            evaluation_details[i]["score"] = score_explanation(temp_outputs, explanation)
        else:
            evaluation_details[i]["score"] = candidate_scores[best_key]

        fidelity = evaluation_details[i]["score"]
        print("*******")
        print(i)
        print(best_key)
        print(evaluation_details[i]["score"])
        i += 1

    return explanation, evaluation_details

In [113]:
# Fit a 100-tree gradient boosting model on the bike data, then search for an
# explanation on 5000 rows with fidelity threshold .95 and "advanced" roll-up.
# NOTE(review): load_data and build_classifier are not defined in the visible
# part of this notebook, and calculate is called here as
# (data, classifier, 100, True) while the definition above takes
# (self, num_tiles, collapse_1d, quantiles) -- presumably this cell predates
# the current helpers; confirm it still runs after Restart & Run All.
data_to_use = load_data("bike")
classifier, accuracy = build_classifier(data_to_use, 100, "gradient boosting", 1.0)
chart_data, feature_ranges, output_details = calculate(data_to_use, classifier, 100, True)
#generate_chart(chart_data, data_to_use)
explanation, _ = evaluate(chart_data, data_to_use, feature_ranges, classifier, output_details, 5000, .95, "advanced")

*******
1
('Temperature (F)', 'Hour of Day')
0.7285758589470361
*******
2
('Hour of Day', 'Work Day')
0.8732662910184066
*******
3
('Feels Like (F)', 'Hour of Day')
0.9098027166722784
*******
4
('Hour of Day', 'Season')
0.9288196703317801
*******
5
('Feels Like (F)', 'Light Precipitation')
0.93330500421946
*******
6
('Feels Like (F)', 'Temperature (F)')
0.9386753598101099
*******
7
('Feels Like (F)', 'Work Day')
0.9415854467993299
*******
8
('Temperature (F)', 'Work Day')
0.9425377247731114
*******
9
('Season', 'Light Precipitation')
0.9440507978997703
*******
10
('Temperature (F)', 'Misty Weather')
0.9443240168286298
*******
11
('Feels Like (F)', 'Misty Weather')
0.9445870087732237
*******
12
('Feels Like (F)', 'Season')
0.9446712018066583
*******
13
('Season', 'Misty Weather')
0.9445262729411412
*******
14
('Hour of Day', 'Misty Weather')
0.9438960290581635
*******
15
('Hour of Day', 'Light Precipitation')
0.9278034987690513
*******
16
('Hour of Day', 'First or Second Year')
0.921547

In [114]:
# Same pipeline as the "advanced" run, but with rollup=None: charts outside the
# explanation are not folded into it before scoring.
# NOTE(review): relies on load_data / build_classifier, which are not defined
# in the visible part of this notebook, and on the older positional calculate
# call -- confirm against the current helper definitions.
data_to_use = load_data("bike")
classifier, accuracy = build_classifier(data_to_use, 100, "gradient boosting", 1.0)
chart_data, feature_ranges, output_details = calculate(data_to_use, classifier, 100, True)
#generate_chart(chart_data, data_to_use)
explanation, _ = evaluate(chart_data, data_to_use, feature_ranges, classifier, output_details, 5000, .95, None)

*******
1
('Temperature (F)', 'Hour of Day')
0.6388493180214152
*******
2
('Hour of Day', 'Work Day')
0.7470833050834247
*******
3
('Feels Like (F)', 'Hour of Day')
0.8359935497942745
*******
4
('Hour of Day', 'First or Second Year')
0.8756325672500528
*******
5
('Hour of Day', 'Light Precipitation')
0.9390950406634805
*******
6
('Humidity', 'Hour of Day')
0.9430382677976129
*******
7
('Hour of Day', 'Season')
0.9475764882968291
*******
8
('Feels Like (F)', 'Wind Speed')
0.955816726136186


In [133]:
# "simple" roll-up run.  The setup lines are commented out, so this cell reuses
# chart_data, data_to_use, feature_ranges, classifier and output_details left
# in the kernel by an earlier cell; it fails on a fresh kernel unless one of
# the cells above has been run first.
#data_to_use = load_data("bike")
#classifier, accuracy = build_classifier(data_to_use, 100, "gradient boosting", 1.0)
#chart_data, feature_ranges, output_details = calculate(data_to_use, classifier, 100, True)
#generate_chart(chart_data, data_to_use)
explanation, _ = evaluate(chart_data, data_to_use, feature_ranges, classifier, output_details, 5000, .95, "simple")

*******
1
('Temperature (F)', 'Hour of Day')
0.7285758589470364
*******
2
('Hour of Day', 'Work Day')
0.873266291018407
*******
3
('Feels Like (F)', 'Hour of Day')
0.9098027166722783
*******
4
('Hour of Day', 'First or Second Year')
0.8552949201373392
*******
5
('Hour of Day', 'Light Precipitation')
0.9216284155559572
*******
6
('Humidity', 'Hour of Day')
0.9314859629111633
*******
7
('Hour of Day', 'Season')
0.9368087989378879
*******
8
('Feels Like (F)', 'Wind Speed')
0.9432563360125976
*******
9
('Humidity', 'Light Precipitation')
0.9462162399169958
*******
10
('Feels Like (F)', 'Work Day')
0.9497786473664296
*******
11
('Feels Like (F)', 'Misty Weather')
0.949079932793873
*******
12
('Humidity', 'Season')
0.9510021779014951


In [None]:
# Draw only the charts whose keys were selected by the most recent evaluate() call.
# NOTE(review): generate_chart reads value["dimension"] and value["df"], which the
# calculate() shown above does not set -- confirm chart_data carries them.
generate_chart(chart_data, data_to_use, fields_subset = explanation)

In [None]:
# Draw every chart in chart_data (no fields_subset filter).
generate_chart(chart_data, data_to_use)