In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import importanceMatrixReg
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor###
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score###
from sklearn.preprocessing import scale
from sklearn.datasets import load_boston###
from matplotlib import pyplot as plt

In [2]:
#Import boston housing data
proj_path = os.path.dirname(os.getcwd())

boston = load_boston()
boston.feature_names
feature_names = boston.feature_names

In [3]:
X = boston.data
y = boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=45)

# train the regressor
rf_clf = RandomForestRegressor(random_state=45, n_estimators=100)
rf_clf.fit(X_train, y_train)

# make predictions and calculate the MAE error
predictions = rf_clf.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print('Mean Absolute Error: {}'.format(round(mae, 2)))
print('Mean Squared Error: {}'.format(round(mse, 2))) 
print('Coefficient of Determination: {}'.format(round(r2, 2))) 

Mean Absolute Error: 2.43
Mean Squared Error: 11.1
Coefficient of Determination: 0.89


In [4]:
###THIS CELL WAS USED TO DEBUG THE CODE
def f_gini(v):
    p = v/v.sum()
    return np.multiply(p, 1-p).sum()

def f_entropy(v):
    return sps.entropy(pk=v)

def f_misclassification(v):
    p = v/v.sum()
    return 1 - p.max()

def f_mse(v):
    return ((v-v.mean())**2).sum()/len(v)

### AGGIUNGERE MSE=MEAN SQUARE ERROR; FRIEDMAN_MSE; MAE = MEAN ABSOLUTE ERROR.

# get the number of classes being predicted by the random forest
###classes = rf_clf.classes_
###n_classes = len(classes)
###print(n_classes, classes)
# init storage for the predictor importances by classes by trees
importance_matrix = []

dec_tree = rf_clf.estimators_[0]


for dec_tree in rf_clf.estimators_:

    # get the criterion used to measure impurity
    criterion = dec_tree.get_params()['criterion']
    if criterion == 'gini':
        f_impurity = f_gini
    elif criterion == 'entropy':
        f_impurity = f_entropy
    elif criterion == 'misclassification':
        f_impurity = f_misclassification
    elif criterion == 'mse':
            f_impurity = f_mse
    else:
        f_impurity = 0
        print('Unassigned impurity measure')

    # get the number of features and nodes in the tree
    feature = dec_tree.tree_.feature
    n_features = dec_tree.tree_.n_features
    n_nodes = dec_tree.tree_.__getstate__()['node_count']
    nodes = dec_tree.tree_.__getstate__()['nodes']
    parent_node_ind = -np.ones(shape=n_nodes, dtype='<i8')
    #parent_node_ind[0] = n_nodes + 1
    #print(parent_node_ind)
    for par_ind,node in enumerate(nodes):
        if node[0] != -1:
            parent_node_ind[node[0]] = par_ind
        if node[1] != -1:
            parent_node_ind[node[1]] = par_ind
    #print(parent_node_ind)

    # identify the leaves of the tree
    is_leaves = np.array([node[0]==-1 and node[1]==-1 for node in nodes])

    leaves_index = np.nonzero(is_leaves)[0]



    values_sorted = dec_tree.tree_.__getstate__()['values']
    #print ('nodes', nodes, len(nodes), len(values_sorted[:,0,:]))
    node_pred = np.argmax(values_sorted[:,0,:], axis=1)
    leaves_class_index = node_pred[is_leaves]

    for par_ind,node in enumerate(nodes):
        print(par_ind,parent_node_ind[par_ind],is_leaves[par_ind],node,
              values_sorted[par_ind], values_sorted[par_ind].sum())
    
    ### TO BE SANITY-CHECKED
    node_unvisited = np.ones((n_classes, n_nodes), dtype=bool)
    tree_importances = np.zeros((n_classes, n_features))
    for leaf_i,leaf_c_i in zip(leaves_index,leaves_class_index):
        ###v1_initialisation###parent_i = parent_node_ind[leaf_i]
        ###v1_initialisation###current_i = leaf_i
        current_i = parent_node_ind[leaf_i]
        print('START from leaf ', leaf_i, 'with class ', leaf_c_i)
        print('whose parent is ', current_i)
        # walk the tree and calculate the importance of the predictor
        ###v1_initialisation###while parent_i != -1 and node_unvisited[leaf_c_i,current_i]:
        while current_i != -1 and node_unvisited[leaf_c_i,current_i]:
            current_node = nodes[current_i]
            left_node = nodes[current_node['left_child']]
            right_node = nodes[current_node['right_child']]
            current_feature = current_node['feature']
            #tree_importances[leaf_c_i,current_feature] += (
            #        current_node['weighted_n_node_samples'] * current_node['impurity'] -
            #        left_node['weighted_n_node_samples'] * left_node['impurity'] -
            #        right_node['weighted_n_node_samples'] * right_node['impurity']
            #        )
            ###NEW HERE
            current_values = values_sorted[current_i,0,:]
            left_values = values_sorted[current_node['left_child'],0,:]
            right_values = values_sorted[current_node['right_child'],0,:]

            current_values_class = np.array([
                current_values[leaf_c_i],
                current_values[np.arange(len(current_values)) != leaf_c_i].sum()
            ])
            left_values_class = np.array([
                left_values[leaf_c_i],
                left_values[np.arange(len(left_values)) != leaf_c_i].sum()
            ])
            right_values_class = np.array([
                right_values[leaf_c_i],
                right_values[np.arange(len(right_values)) != leaf_c_i].sum()
            ])
            print(
                current_values,
                np.array([current_values[leaf_c_i], current_values[np.arange(len(current_values))!=leaf_c_i].sum()])
                 )
            #print(current_values.sum(), left_values.sum(), right_values.sum(),
            #     left_values.sum()/current_values.sum(), right_values.sum()/current_values.sum(),
            #      current_node['weighted_n_node_samples'], left_node['weighted_n_node_samples'],
            #      right_node['weighted_n_node_samples']
            #     )
            tree_importances[leaf_c_i,current_feature] += (
                    current_node['weighted_n_node_samples'] * f_impurity(current_values_class) -
                    left_node['weighted_n_node_samples'] * f_impurity(left_values_class) -
                    right_node['weighted_n_node_samples'] * f_impurity(right_values_class)
                    )
            #print('\n', current_node, (
            #        current_node['weighted_n_node_samples'] * f_importance(current_values) -
            #        left_node['weighted_n_node_samples'] * f_importance(left_values) -
            #        right_node['weighted_n_node_samples'] * f_importance(right_values)
            #        ))
            ###
            node_unvisited[leaf_c_i,current_i] = False
            ###v1###current_i = parent_i
            ###v1###parent_i = parent_node_ind[current_i]
            current_i = parent_node_ind[current_i]
            print('next current is ', current_i)
    importance_matrix.append(tree_importances/nodes[0]['weighted_n_node_samples'])

# average the predictor importances for each class by all of the trees in the forest
importance_matrix = np.mean(importance_matrix, axis = 0)

0 -1 False (1, 174, 12, 9.84500027, 73.47926581, 227, 379.) [[22.72559367]] 22.725593667546175
1 0 False (2, 123, 5, 6.90199995, 67.40315883, 96, 164.) [[29.45853659]] 29.45853658536586
2 1 False (3, 4, 7, 1.57754999, 44.07157334, 69, 113.) [[25.99557522]] 25.995575221238926
3 2 True (-1, -1, -2, -2., 0., 4, 6.) [[50.]] 50.0
4 2 False (5, 74, 5, 6.48149991, 12.42006988, 65, 107.) [[24.64953271]] 24.649532710280365
5 4 False (6, 29, 5, 6.06200004, 5.52067898, 38, 62.) [[22.78870968]] 22.788709677419348
6 5 False (7, 12, 10, 17.54999924, 2.22266529, 13, 22.) [[20.67727273]] 20.677272727272726
7 6 False (8, 11, 1, 50.5, 1.041875, 3, 4.) [[22.575]] 22.574999999999996
8 7 False (9, 10, 6, 45.04999828, 0.14222222, 2, 3.) [[23.13333333]] 23.133333333333336
9 8 True (-1, -1, -2, -2., 0., 1, 2.) [[23.4]] 23.4
10 8 True (-1, -1, -2, -2., -3.97903932e-13, 1, 1.) [[22.6]] 22.6
11 7 True (-1, -1, -2, -2., -2.84217094e-13, 1, 1.) [[20.9]] 20.9
12 6 False (13, 28, 7, 8.6041503, 1.50691358, 10, 18.) [

159 155 True (-1, -1, -2, -2., -2.27373675e-13, 2, 3.) [[35.4]] 35.4
160 150 False (161, 166, 5, 7.17000008, 1.16138889, 6, 12.) [[36.61666667]] 36.61666666666667
161 160 False (162, 165, 5, 7.15149999, 0.07959184, 3, 7.) [[37.45714286]] 37.457142857142856
162 161 False (163, 164, 5, 7.06399989, 0.0016, 2, 5.) [[37.28]] 37.279999999999994
163 162 True (-1, -1, -2, -2., 0., 1, 1.) [[37.2]] 37.2
164 162 True (-1, -1, -2, -2., 0., 1, 4.) [[37.3]] 37.3
165 161 True (-1, -1, -2, -2., -2.27373675e-13, 1, 2.) [[37.9]] 37.9
166 160 False (167, 170, 12, 6.75499988, 0.3024, 3, 5.) [[35.44]] 35.44
167 166 False (168, 169, 6, 45.35000038, 0.02, 2, 3.) [[35.]] 35.0
168 167 True (-1, -1, -2, -2., 0., 1, 2.) [[34.9]] 34.9
169 167 True (-1, -1, -2, -2., 2.27373675e-13, 1, 1.) [[35.2]] 35.2
170 166 True (-1, -1, -2, -2., -4.54747351e-13, 1, 2.) [[36.1]] 36.1
171 149 False (172, 173, 1, 11., 0.1875, 2, 4.) [[43.55]] 43.55
172 171 True (-1, -1, -2, -2., 4.54747351e-13, 1, 3.) [[43.8]] 43.79999999999999
1

296 295 True (-1, -1, -2, -2., 0., 1, 2.) [[19.6]] 19.6
297 295 True (-1, -1, -2, -2., -5.68434189e-14, 1, 3.) [[19.2]] 19.2
298 290 False (299, 304, 11, 393.58000183, 0.26484375, 4, 8.) [[19.9375]] 19.937500000000004
299 298 False (300, 303, 11, 390.19000244, 0.0336, 3, 5.) [[20.32]] 20.32
300 299 False (301, 302, 4, 0.47549999, 0.00222222, 2, 3.) [[20.46666667]] 20.466666666666665
301 300 True (-1, -1, -2, -2., 0., 1, 2.) [[20.5]] 20.5
302 300 True (-1, -1, -2, -2., -1.13686838e-13, 1, 1.) [[20.4]] 20.4
303 299 True (-1, -1, -2, -2., -5.68434189e-14, 1, 2.) [[20.1]] 20.1
304 298 True (-1, -1, -2, -2., 5.68434189e-14, 1, 3.) [[19.3]] 19.3
305 289 True (-1, -1, -2, -2., -1.10844667e-12, 1, 1.) [[15.3]] 15.3
306 288 False (307, 310, 7, 2.08714998, 0.39484375, 5, 8.) [[22.5625]] 22.5625
307 306 False (308, 309, 12, 14.05000019, 0.04, 2, 2.) [[21.7]] 21.7
308 307 True (-1, -1, -2, -2., 0., 1, 1.) [[21.9]] 21.9
309 307 True (-1, -1, -2, -2., -5.68434189e-14, 1, 1.) [[21.5]] 21.5
310 306 Fa

NameError: name 'n_classes' is not defined

In [8]:
dec_tree.tree_

<sklearn.tree._tree.Tree at 0x7f96913f25e0>