In [1]:
"""
===============================================================================
Comparing different split criteria for random forest regression on toy datasets
===============================================================================
An example to compare the different split criteria available for
:class:`sklearn.ensemble.RandomForestRegressor`.
Metrics used to evaluate these splitters include Mean Squared Error (MSE), a
measure of distance between the true target (`y_true`) and the predicted output
(`y_pred`), and runtime.
For visual examples of these datasets, see
:ref:`sphx_glr_auto_examples_datasets_plot_nonlinear_regression_datasets.py`.
"""
# Authors: Vivek Gopalakrishnan <vgopala4@jhu.edu>
#          Morgan Sanchez       <msanch35@jhu.edu>
# License: BSD 3 clause

import time
from itertools import product
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Echo the docstring at the top of this cell.
# NOTE(review): inside a notebook kernel `__doc__` may not be bound to the
# string literal above -- confirm this prints what is intended.
print(__doc__)
# Seed passed to np.random.seed() in `_prep_data` for the initial shuffle of
# the training data.
random_state = 0


  if img.ndim is 0:


ImportError: dlopen(/Users/Celina/scikit-learn/sklearn/svm/libsvm.cpython-38-darwin.so, 2): Symbol not found: _svm_check_parameter
  Referenced from: /Users/Celina/scikit-learn/sklearn/svm/libsvm.cpython-38-darwin.so
  Expected in: flat namespace
 in /Users/Celina/scikit-learn/sklearn/svm/libsvm.cpython-38-darwin.so

In [None]:
###############################################################################
# NOTE(review): `noise` is not read anywhere in the visible notebook -- confirm
# whether it can be dropped.
noise = 100.0
# Registry mapping a simulation name to its
# [X_train, y_train, X_test, y_test] split. Entries start out empty and are
# populated by the dataset-construction step further down; `_prep_data` reads
# from this registry.
# NOTE(review): the key says "Olivetti" but `prepData` loads the Boston housing
# data -- the key is also used as the facet label in the plots; confirm naming.
simulations = {"Olivetti": []}
###############################################################################
def _train_forest(X, y, criterion):
    """Fit a RandomForestRegressor that splits on the given criterion.

    The forest uses 500 trees, ``max_features="sqrt"`` and ``max_depth=5``;
    all remaining settings are left at the estimator's defaults.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.
    criterion : str
        Split criterion forwarded to the estimator.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    forest_kwargs = {
        "n_estimators": 500,
        "criterion": criterion,
        "max_features": "sqrt",
        "max_depth": 5,
    }
    forest = RandomForestRegressor(**forest_kwargs)
    forest.fit(X, y)
    return forest
def _test_forest(X, y, regr):
    """Return the mean squared error of `regr`'s predictions on a held-out set.

    `X` is fed to ``regr.predict`` and the predictions are compared with the
    true targets `y`.
    """
    return mean_squared_error(y, regr.predict(X))
def prepData():
    """Load the Boston housing data and split it into train and test sets.

    A quarter of the samples is held out for testing; the split is seeded with
    ``random_state=0`` so it is the same on every call.

    Returns
    -------
    X_train, y_train, X_test, y_test : arrays
        Note the order: both training arrays come first, unlike the
        ``train_test_split`` return order. The target arrays are 2-D column
        vectors of shape (n_samples, 1).
    """
    X, y = load_boston(return_X_y=True)
    # Keep the target two-dimensional, (n_samples, 1).
    y = y.reshape(-1, 1)
    # NOTE(review): only the single original target is used. An earlier draft
    # also built (but never used) a two-column target [y, X[:, 12]] together
    # with a feature matrix that had column 12 removed -- confirm whether a
    # multi-output target was actually intended here.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )
    return X_train, y_train, X_test, y_test
def _shuffle_in_unison(X, y, seed):
    """Shuffle `X` and `y` in place along their first axis with the same seed.

    The global NumPy RNG is re-seeded before each shuffle so that both calls
    draw the same random sequence; this relies on `X` and `y` having the same
    number of rows to keep them aligned.
    """
    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)


def _prep_data(sim_dict, simulation_name, max_n_samples, n_trials):
    """Generate train and test data for all trials.

    Parameters
    ----------
    sim_dict : dict
        Output mapping; updated in place and also returned.
    simulation_name : str
        Key into the module-level `simulations` registry, whose entry must be
        the sequence [X_train, y_train, X_test, y_test].
    max_n_samples : int
        Currently unused; kept so existing callers keep working.
    n_trials : int
        Number of trials to generate data for.

    Returns
    -------
    sim_dict : dict
        ``sim_dict[simulation_name][i]`` is a tuple
        ``(X_train, y_train, X_test, y_test)`` of independent array copies for
        trial ``i``. The training rows are reshuffled (seeded with ``i``) for
        each trial, on top of the shuffles already applied; the test arrays
        are the same for every trial.

    Raises
    ------
    ValueError
        If the registry entry does not hold exactly four arrays.
    """
    split = simulations[simulation_name]
    if len(split) != 4:
        raise ValueError(
            "simulations[%r] must hold [X_train, y_train, X_test, y_test], "
            "got %d items" % (simulation_name, len(split))
        )
    X_train, y_train, X_test, y_test = split

    # Shuffle private copies: np.random.shuffle works in place, and the arrays
    # stored in `simulations` must not be reordered as a side effect.
    X_train = np.array(X_train, copy=True)
    y_train = np.array(y_train, copy=True)

    # Initial shuffle with the notebook-wide seed.
    _shuffle_in_unison(X_train, y_train, random_state)

    sim_dict[simulation_name] = {}
    for i in range(n_trials):
        # Reshuffle the training data for this trial, seeded by the trial index.
        _shuffle_in_unison(X_train, y_train, i)
        # Snapshot the current ordering so later trials cannot alter it.
        sim_dict[simulation_name][i] = (
            np.copy(X_train),
            np.copy(y_train),
            np.copy(X_test),
            np.copy(y_test),
        )
    return sim_dict

In [2]:
###############################################################################
def main(simulation_name, sim_data, n_samples, criterion, n_iter):
    """Train a forest on a prefix of the training data and score it.

    Parameters
    ----------
    simulation_name : str
        Key from the `simulations` dictionary; passed through to the result.
    sim_data : tuple (X_train, y_train, X_test, y_test)
        Training and testing arrays for this simulation and trial.
    n_samples : int
        Number of leading training rows to fit on.
    criterion : str
        Split criterion handed to `_train_forest`. The notebook sweeps
        "mse", "friedman_mse", "axis" and "oblique".
    n_iter : int
        Index of the current repeat. Only printed; otherwise ignored.

    Returns
    -------
    simulation_name : str
        Same value as the input.
    n_samples : int
        Same value as the input.
    criterion : str
        Same value as the input.
    mse : float
        Mean squared error of the forest's predictions on the test data.
    runtime : float
        Time spent training the forest, in seconds, as measured by
        ``time.process_time``.
    """
    print(simulation_name, n_samples, criterion, n_iter)

    X_train, y_train, X_test, y_test = sim_data

    # Fit on the first `n_samples` training rows only.
    X_subset, y_subset = X_train[:n_samples], y_train[:n_samples]

    # Time the training step alone; scoring is excluded from the runtime.
    t0 = time.process_time()
    forest = _train_forest(X_subset, y_subset, criterion)
    elapsed = time.process_time() - t0

    test_mse = _test_forest(X_test, y_test, forest)
    return simulation_name, n_samples, criterion, test_mse, elapsed

In [3]:
###############################################################################
print("Constructing parameter space...")
# Declare simulation parameters
simulation_names = simulations.keys()
sample_sizes = np.arange(25, 200, 25)
# NOTE(review): "axis" and "oblique" are presumably provided by the
# scikit-learn build this notebook was run against -- confirm availability.
criteria = ["mse", "friedman_mse", "axis", "oblique"]
# Number of times to repeat each simulation setting
n_repeats = 30
# Create the parameter space (a one-shot iterator, consumed by Parallel below)
params = product(simulation_names, sample_sizes, criteria, range(n_repeats))
###############################################################################
print("Constructing training and validation datasets...")
for simulation_name in list(simulations):
    # Assign instead of appending so that re-running this cell replaces the
    # entry rather than growing it past the four arrays `_prep_data` unpacks.
    simulations[simulation_name] = list(prepData())
###############################################################################
print("Running simulation...")
# Generate training and test data for simulations
sim_data = {}
for sim in simulation_names:
    sim_data = _prep_data(sim_data, sim, sample_sizes[-1], n_repeats)
# Run the simulations in parallel
data = Parallel(n_jobs=-2)(
    delayed(main)(sim_name, sim_data[sim_name][n_iter], n, crit, n_iter)
    for sim_name, n, crit, n_iter in params
)
# Save results as a DataFrame
columns = ["simulation", "n_samples", "criterion", "mse", "runtime"]
df = pd.DataFrame(data, columns=columns)
# Plot each recorded metric against the training-set size, one figure per
# metric, and save it to disk before showing it.
for metric, figure_path in (
    ("mse", "splitter_comparison_faces_mse_04_12.png"),
    ("runtime", "splitter_comparison_faces_runtime_04_12.png"),
):
    sns.relplot(
        x="n_samples",
        y=metric,
        hue="criterion",
        col="simulation",
        kind="line",
        data=df,
        facet_kws={"sharey": False, "sharex": True},
    )
    plt.tight_layout()
    plt.savefig(figure_path)
    plt.show()

Constructing parameter space...


NameError: name 'simulations' is not defined