In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Load the iris dataset and keep only the last two feature columns
# (petal length and petal width) as inputs; the target is the species index.
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

# Fit a shallow (depth-2) classification tree.
# random_state is pinned like the other seeded estimators in this notebook:
# scikit-learn visits features in random order at each split, so when two
# candidate splits are equally good the chosen one can otherwise differ
# between runs.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

In [None]:
from sklearn.tree import export_graphviz
# Describe the fitted tree in Graphviz DOT format. No output file is passed,
# so the DOT source comes back as a string and is shown as the cell result.
# Node labels use the names of the two feature columns the tree was trained
# on (columns 2 onward) and the species names.
export_graphviz(
    tree_clf,
    class_names=iris.target_names,
    feature_names=iris.feature_names[2:],
    filled=True,
    rounded=True,
)

'digraph Tree {\nnode [shape=box, style="filled, rounded", color="black", fontname="helvetica"] ;\nedge [fontname="helvetica"] ;\n0 [label="petal length (cm) <= 2.45\\ngini = 0.667\\nsamples = 150\\nvalue = [50, 50, 50]\\nclass = setosa", fillcolor="#ffffff"] ;\n1 [label="gini = 0.0\\nsamples = 50\\nvalue = [50, 0, 0]\\nclass = setosa", fillcolor="#e58139"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="petal width (cm) <= 1.75\\ngini = 0.5\\nsamples = 100\\nvalue = [0, 50, 50]\\nclass = versicolor", fillcolor="#ffffff"] ;\n0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;\n3 [label="gini = 0.168\\nsamples = 54\\nvalue = [0, 49, 5]\\nclass = versicolor", fillcolor="#4de88e"] ;\n2 -> 3 ;\n4 [label="gini = 0.043\\nsamples = 46\\nvalue = [0, 1, 45]\\nclass = virginica", fillcolor="#843de6"] ;\n2 -> 4 ;\n}'

In [None]:
# Per-class probability estimates for a single sample with feature values
# (5, 1.5), given in the same column order the tree was trained on.
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [None]:
# Predicted class index for the same single sample with feature values (5, 1.5).
tree_clf.predict([[5, 1.5]])

array([1])

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-2 regression tree. random_state is pinned like the other seeded
# estimators in this notebook, because equally good candidate splits are
# otherwise tie-broken at random and the fitted tree can vary between runs.
# NOTE(review): if X, y are still the iris petal features and class indices
# from the first cell, this regresses on class labels - confirm that is intended.
tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(X, y)

In [21]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Generate a noisy two-moons dataset (seeded so the run is reproducible)
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

# Hold out 20% of the instances as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Decision Tree with default hyperparameters.
# GridSearchCV fits clones of the estimator it is given, so this fit is
# independent of the search below and only leaves `tree_clf` itself fitted.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

# Fine-tune the Decision Tree: 98 values of max_leaf_nodes x 3 values of
# min_samples_split, each candidate scored with 3-fold cross-validation
param_grid = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(tree_clf, param_grid, cv=3, verbose=1)
grid_search_cv.fit(X_train, y_train)

# Print the best parameters
print("Best parameters: ", grid_search_cv.best_params_)

# With the default refit=True, GridSearchCV has already retrained the best
# candidate on the whole training set, so best_estimator_ is ready to use
# and no additional fit is needed.
best_tree_clf = grid_search_cv.best_estimator_

# Measure the model's performance on the test set
y_pred = best_tree_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy on test set: ", accuracy)

Fitting 3 folds for each of 294 candidates, totalling 882 fits
Best parameters:  {'max_leaf_nodes': 17, 'min_samples_split': 2}
Accuracy on test set:  0.8695


In [None]:
from sklearn.model_selection import ShuffleSplit
from scipy.stats import mode
from sklearn.base import clone
import numpy as np

# Ensemble size and per-tree training-subset size, named once so the splitter,
# the forest and the prediction matrix cannot drift out of sync.
n_trees = 1000
n_instances = 100

# Generate 1,000 random subsets of the training set, 100 instances each.
# Only the "train" side of each shuffle split is used; the rest is discarded.
rs = ShuffleSplit(n_splits=n_trees, train_size=n_instances, random_state=42)
mini_sets = [
    (X_train[mini_train_index], y_train[mini_train_index])
    for mini_train_index, _ in rs.split(X_train)
]

# One unfitted copy of the tuned tree (same hyperparameters) per subset
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

# Row i holds tree i's predictions for every test instance
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)
accuracy_scores = []

for tree_index, (tree, (X_mini_train, y_mini_train)) in enumerate(zip(forest, mini_sets)):
    tree.fit(X_mini_train, y_mini_train)

    # Predict once per tree and reuse the result for both the individual
    # accuracy and the majority vote below.
    y_pred = tree.predict(X_test)
    Y_pred[tree_index] = y_pred
    accuracy_scores.append(accuracy_score(y_test, y_pred))

print("Mean accuracy: ", np.mean(accuracy_scores))

# Keep only the most frequent prediction across the trees for each test instance
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

# Evaluate the majority-vote predictions on the test set. The reshape flattens
# the votes to 1-D in case mode returns them with a leading axis.
accuracy = accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))
print("Accuracy on test set: ", accuracy)

Mean accuracy:  0.805471
Accuracy on test set:  0.872
