# Decision Trees

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Load the iris dataset and keep only the feature columns from index 2 onward
# (the ones named by iris.feature_names[2:]).
iris = load_iris()
X, y = iris.data[:, 2:], iris.target

# fit() returns the estimator itself, so construction and training are chained.
tree_clf = DecisionTreeClassifier().fit(X, y)

# Predictions on the very same samples the tree was trained on.
tree_clf.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [2]:
from sklearn.tree import export_graphviz
# import graphviz 

# Export the fitted iris tree as Graphviz DOT source.
# out_file=None is passed explicitly so the DOT text is always returned as a
# string and stored in tree_data; older scikit-learn releases defaulted to
# writing a "tree.dot" file and returning None instead.
tree_data = export_graphviz(
    tree_clf,
    out_file=None,
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

# graph = graphviz.Source(tree_data)  



In [3]:
# Per-class probability estimates for one sample whose two features are 5 and 1.5.
query_sample = [[5, 1.5]]
tree_clf.predict_proba(query_sample)

array([[ 0.,  0.,  1.]])

In [4]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree trained on the same X / y as the classifier above.
# fit() returns the estimator, so it can be chained onto the constructor.
tree_reg = DecisionTreeRegressor(max_depth=2).fit(X, y)

# Display the fitted estimator's repr as the cell output.
tree_reg

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [5]:
from sklearn.datasets import make_moons
# Generate 10,000 noisy "moons" samples. moon_data is an (X, y) pair that later
# cells index as moon_data[0] / moon_data[1].
# A fixed random_state makes the dataset identical on every Restart & Run All.
moon_data = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the moons samples as a test set.
# A fixed random_state keeps the split stable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    moon_data[0], moon_data[1], test_size=0.2, random_state=42
)

In [7]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 5 depths x 6 leaf-node caps x 2 split criteria.
param_grid = dict(
    max_depth=[2, 4, 6, 8, 10],
    max_leaf_nodes=[None, 5, 10, 15, 20, 30],
    criterion=['gini', 'entropy'],
)

# Note: this rebinds tree_clf, replacing the iris-trained classifier from earlier.
tree_clf = DecisionTreeClassifier()

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=tree_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 4, 6, 8, 10], 'max_leaf_nodes': [None, 5, 10, 15, 20, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
# Show the parameter combination that the grid search selected as best.
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 6, 'max_leaf_nodes': 20}

In [9]:
from sklearn.metrics import accuracy_score
# Predict the held-out test samples with the fitted grid search, then score them.
predictions = grid_search.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# Last expression: show the test accuracy as the cell output.
accuracy

0.86150000000000004

In [26]:
from sklearn.model_selection import ShuffleSplit

# Draw 1000 random subsets of 100 training instances each and fit one shallow
# (max_depth=2) tree per subset.
# test_size=None lets ShuffleSplit use the complement of train_size; an explicit
# test_size=0 is rejected by current scikit-learn releases.
rs = ShuffleSplit(n_splits=1000, train_size=100, test_size=None)

# rs.split() yields (train_indices, test_indices) pairs; only the train part is
# needed here, so the pair is unpacked instead of being indexed with [0].
decision_tree_models = [
    DecisionTreeClassifier(max_depth=2).fit(X_train[subset_index], y_train[subset_index])
    for subset_index, _ in rs.split(X_train)
]

In [27]:
# Test-set accuracy of each individual small tree, in the same order as
# decision_tree_models.
evaluate_accuracies = list(
    map(
        lambda model: accuracy_score(y_test, model.predict(X_test)),
        decision_tree_models,
    )
)

# Display the full list of per-tree accuracies.
evaluate_accuracies

[0.85550000000000004,
 0.84199999999999997,
 0.81999999999999995,
 0.80200000000000005,
 0.85599999999999998,
 0.81100000000000005,
 0.84850000000000003,
 0.84050000000000002,
 0.75900000000000001,
 0.77700000000000002,
 0.80249999999999999,
 0.84450000000000003,
 0.748,
 0.84299999999999997,
 0.73850000000000005,
 0.80200000000000005,
 0.80100000000000005,
 0.80900000000000005,
 0.85199999999999998,
 0.81000000000000005,
 0.83599999999999997,
 0.80649999999999999,
 0.85199999999999998,
 0.84799999999999998,
 0.85699999999999998,
 0.81850000000000001,
 0.81850000000000001,
 0.79000000000000004,
 0.83399999999999996,
 0.84350000000000003,
 0.83850000000000002,
 0.85050000000000003,
 0.83550000000000002,
 0.84250000000000003,
 0.84699999999999998,
 0.8095,
 0.79700000000000004,
 0.85050000000000003,
 0.84499999999999997,
 0.84099999999999997,
 0.85299999999999998,
 0.83399999999999996,
 0.79849999999999999,
 0.76749999999999996,
 0.84450000000000003,
 0.84250000000000003,
 0.854999999999

In [28]:
import numpy as np
# Average test accuracy over all individually trained trees.
mean_accuracy = np.average(evaluate_accuracies)
print(mean_accuracy)

0.8209


In [29]:
from scipy import stats

# mode_aggregator = lambda x: stats.mode([model.predict(x.reshape(1, -1)) for model in decision_tree_models])
def mode_aggregator_2(x):
    """Majority vote of all trees in decision_tree_models for one sample.

    x is a 1-D feature vector; it is reshaped to a single-row 2-D array for
    predict(). Returns a 1-element boolean array that is True when at least
    half of the trees predicted 1 (this relies on the labels being 0/1).
    Kept for single-sample use; the bulk prediction below does not call it.
    """
    votes = [model.predict(x.reshape(1, -1)) for model in decision_tree_models]
    return sum(votes) / len(decision_tree_models) >= 0.5


# Predict the whole test set once per tree instead of once per sample per tree:
# the per-sample version issues len(X_test) * len(decision_tree_models)
# single-row predict() calls, which is needlessly slow.
# Summing the 0/1 predictions and dividing by the number of trees gives, for
# every test sample, the share of trees voting for class 1.
vote_share = sum(model.predict(X_test) for model in decision_tree_models) / len(decision_tree_models)

# Ties (exactly 0.5) go to class 1, as in the per-sample aggregator. The result
# is a 1-D integer label array rather than a list of 1-element boolean arrays.
forest_predictions = (vote_share >= 0.5).astype(int)

print(accuracy_score(y_test, forest_predictions))

0.861
