# Decision tree on moons dataset

In [1]:
from sklearn.datasets import make_moons

data = make_moons(n_samples=10000, noise=0.4)
data

(array([[ 1.69186681,  0.92000982],
        [ 1.02939886,  1.2306858 ],
        [-0.6879181 ,  0.68562509],
        ...,
        [ 1.14706443,  0.1748809 ],
        [ 0.41632218,  0.16419629],
        [ 0.2413514 ,  0.9435107 ]]),
 array([1, 0, 0, ..., 1, 0, 0]))

In [2]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data[0], data[1])
print(x_test.shape)
print(x_train.shape)

(2500, 2)
(7500, 2)


In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42)
params = {'max_depth': [3,4,5,6,7,10,12,15,17], 'max_leaf_nodes': [2,3,4,5,10,15,20], 'min_samples_split': [2,3,4,5,6], }
grid = GridSearchCV(tree, params)
grid.fit(x_train, y_train)
grid.cv_results_

{'mean_fit_time': array([0.00395103, 0.00381408, 0.00369668, 0.00369267, 0.0040473 ,
        0.00478964, 0.00484967, 0.00481663, 0.00471306, 0.0045013 ,
        0.00514216, 0.00519447, 0.00506544, 0.00504794, 0.00497017,
        0.00505133, 0.0050271 , 0.00506501, 0.00504794, 0.0050539 ,
        0.00503159, 0.00504317, 0.00505791, 0.00504293, 0.00507331,
        0.00506582, 0.00507188, 0.00502954, 0.00508032, 0.00504575,
        0.00505338, 0.00508385, 0.00505562, 0.00514436, 0.00500851,
        0.00393901, 0.00373673, 0.00373034, 0.00374947, 0.00374665,
        0.00452046, 0.00441008, 0.0044754 , 0.00472851, 0.00465608,
        0.00509286, 0.00506082, 0.00505552, 0.00515218, 0.00511551,
        0.00562801, 0.00567083, 0.005721  , 0.00562158, 0.00563669,
        0.00623794, 0.00625658, 0.00623531, 0.00625172, 0.00633631,
        0.00624671, 0.00626636, 0.00629406, 0.00625014, 0.00634642,
        0.00628581, 0.00635333, 0.00630212, 0.0062592 , 0.00631142,
        0.00372992, 0.00369654,

In [4]:
grid.best_params_

{'max_depth': 7, 'max_leaf_nodes': 20, 'min_samples_split': 2}

In [5]:
decision_tree = DecisionTreeClassifier(max_depth=3, max_leaf_nodes=4, min_samples_split=2, random_state=42)
decision_tree.fit(x_train, y_train)
decision_tree.score(x_train, y_train)

0.8533333333333334

## Test on test set

In [6]:
decision_tree.score(x_test, y_test)

0.8504

# Apply random forest

In [7]:
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits = 1000, train_size=0.01)
splits = ss.split(data[0], data[1])
forest = []
for split in splits:
    X = data[0][split[0]]
    y = data[1][split[0]]
    tree = DecisionTreeClassifier(max_depth=3, max_leaf_nodes=4, min_samples_split=2)
    tree.fit(X, y)
    forest.append(tree)

## Test trees in forest

In [8]:
scores = 0

for tree in forest:
    scores += tree.score(x_test, y_test)
    
print(scores / len(forest))

0.8256480000000004


## Test random forest as a whole

In [9]:
import numpy as np

forest_preds = np.empty((len(forest), x_test.shape[0]))

for i, tree in enumerate(forest):
    pred = tree.predict(x_test)
    forest_preds[i] = pred

from scipy.stats import mode

yhat, n_votes = mode(forest_preds, axis=0)
print(np.sum(yhat == y_test) / len(y_test))

0.8608
