In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mode
from sklearn.base import clone
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Generate 10,000 noisy two-moons samples and hold out 20% of them for testing.
# Both calls are seeded so the data and the split are identical on every
# Restart & Run All; without a seed each run reports different scores.
X, y = make_moons(n_samples=10_000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Scatter the two input features of every sample, coloured by class label.
# Explicit fig/ax interface with title and axis labels so the figure stands on
# its own; the trailing semicolon hides the return-value repr.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y)
ax.set(title='Moons dataset by class', xlabel='feature 0', ylabel='feature 1');

# Single Tree

In [None]:
# 5-fold cross-validated grid search over the depth and leaf-count limits of a
# single decision tree, scored by accuracy and run on all available cores.
# The tree is seeded so that the selected hyper-parameters (and every estimator
# later cloned from the winner) are reproducible between runs.
models = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    {
        'max_depth': [3, 10, 30],
        'max_leaf_nodes': [3, 10, 30, 100],
    },
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
models.fit(X=X_train, y=y_train)

In [None]:
# GridSearchCV is constructed with its default refit=True, so best_estimator_
# has already been retrained on the whole training set with the winning
# hyper-parameters. Fitting it again would only repeat that work and mutate
# the search object's estimator in place, so it is simply picked up here.
model = models.best_estimator_
model

In [None]:
# Held-out accuracy of the tuned single tree.
# Arguments follow the documented order: accuracy_score(y_true, y_pred).
accuracy_score(y_test, model.predict(X_test))

# Big ol' forest

In [None]:
# Train 1,000 copies of the tuned tree, each on its own random 100-sample
# subset of the training data, and keep every tree's test-set predictions.
# The splitter is seeded so the subsets, and therefore the scores, are
# reproducible between runs.
shuffle_split = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)

ensemble = []
test_scores = []
test_predictions = []
for idxs, _ in shuffle_split.split(X_train):
    # clone() yields a fresh, unfitted estimator with the same hyper-parameters.
    ensemble_model = clone(models.best_estimator_)
    ensemble_model.fit(X_train[idxs, :], y_train[idxs])
    ensemble.append(ensemble_model)

    ensemble_model_predictions = ensemble_model.predict(X_test)

    # Documented argument order is accuracy_score(y_true, y_pred).
    test_scores.append(accuracy_score(y_test, ensemble_model_predictions))
    test_predictions.append(ensemble_model_predictions)

In [None]:
# Majority vote per test sample: stacking puts one tree's predictions on each
# row, so the mode is taken down the rows (axis=0).
# Recent SciPy returns a 1-D mode here, so the old `.mode[0, :]` indexing
# raises IndexError; older releases keep a leading length-1 axis instead.
# np.ravel() flattens either form to one vote per test sample.
test_majority_vote = np.ravel(mode(np.stack(test_predictions), axis=0).mode)

In [None]:
# Compare the mean accuracy of the individual small trees with the accuracy of
# their majority vote. accuracy_score takes (y_true, y_pred) in that order.
print(f'Average test score: {np.mean(test_scores):.2f}')
print(f'Ensemble test score: {accuracy_score(y_test, test_majority_vote):.2f}')