# CC-18 Performance Evaluation



In [1]:
%load_ext lab_black

In [2]:
import sys
import os
from pathlib import Path
import numpy as np
import collections
from tqdm import tqdm
from pathlib import Path
import time
import logging
import json

sys.path.append("../")

from oblique_forests.sporf import ObliqueForestClassifier

# from oblique_forests.ensemble import RandomForestClassifier
from rerf.rerfClassifier import rerfClassifier

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as skrf
import openml

from joblib import Parallel, delayed

from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import cohen_kappa_score

%load_ext autoreload
%autoreload 2

In [3]:
# Never commit credentials to a notebook: take the OpenML API key from the
# OPENML_APIKEY environment variable. When the variable is unset, openml's own
# configuration is left untouched.
# NOTE(review): the key that was previously hard-coded here has been exposed in
# version control and should be revoked/regenerated on openml.org.
_openml_apikey = os.environ.get("OPENML_APIKEY")
if _openml_apikey:
    openml.config.apikey = _openml_apikey

In [4]:
# Fetch the OpenML-CC18 benchmark suite; its `.tasks` attribute supplies the
# task IDs that the cells below iterate over.
benchmark_suite = openml.study.get_suite("OpenML-CC18")  # obtain the benchmark suite

## Helper Functions

In [47]:
def _run_task_helper(clfs, task_id, save_path, overwrite):
    """Download one OpenML task and run ``train_test`` on it, logging failures.

    Parameters
    ----------
    clfs : list of (str, estimator) tuples
        Named classifiers, forwarded unchanged to ``train_test``.
    task_id : int
        OpenML task identifier passed to ``openml.tasks.get_task``.
    save_path : str or path-like
        Results file. If it already exists and ``overwrite`` is false, the task
        is skipped.
    overwrite : bool
        Whether to re-run a task whose results file already exists.

    Returns
    -------
    None
        An exception raised by ``train_test`` is printed and logged but not
        re-raised, so one failing dataset does not abort a whole sweep.
    """
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    dataset = task.get_dataset()  # fetch once; used for both name and features
    task_name = dataset.name

    if not overwrite and os.path.isfile(save_path):
        logging.info(f"APPEND MODE: Skipping {task_name} (already exists)")
        return

    print(f"Running {task_name} ({task_id})")
    logging.info(f"Running {task_name} ({task_id})")

    X, y = task.get_X_and_y()  # get the data

    # Column indices of the nominal (categorical) features, with the target
    # column excluded. train_test one-hot encodes these columns and treats all
    # remaining columns as numeric.
    nominal_indices = dataset.get_features_by_type("nominal", [task.target_name])
    try:
        print("Running Train Test now...")
        train_test(X, y, task_name, task_id, nominal_indices, clfs, save_path)
    except Exception as e:
        import traceback

        failure_msg = (
            f"Test {task_name} ({task_id}) Failed | X.shape={X.shape} | "
            f"{len(nominal_indices)} nominal indices"
        )
        print(failure_msg)
        print(e)
        logging.error(failure_msg)
        logging.error(e)
        # print_exc (not the non-existent sprint_exc) writes the full traceback
        # of the exception currently being handled to stderr.
        traceback.print_exc()

In [48]:
def stratify_samplesizes(y, block_lengths):
    """
    Build an index ordering of ``y`` made of consecutive class-balanced blocks.

    For each block length ``n`` a per-class cutoff
    ``rint((n - n_classes) * class_ratio) + 1`` is computed, and every class
    contributes the slice of its sample indices between the previous block's
    cutoff and this one. The slices are concatenated in block order, class by
    class.

    Parameters
    ----------
    y : 1D array of class labels
    block_lengths : iterable of block sizes (cumulative sample counts); must be
        non-empty because the result is built with ``np.hstack``

    Returns
    -------
    1D integer array of indices into ``y``
    """
    classes, class_counts = np.unique(y, return_counts=True)
    n_classes = len(classes)
    class_fractions = class_counts / class_counts.sum()
    members = [np.where(y == label)[0] for label in classes]

    chunks = []
    starts = np.zeros(n_classes).astype(int)
    for block_len in block_lengths:
        # Per-class cutoffs for this block; they become the next block's starts.
        stops = np.rint((block_len - n_classes) * class_fractions).astype(int) + 1
        chunks.extend(
            member_idx[lo:hi] for member_idx, lo, hi in zip(members, starts, stops)
        )
        starts = stops

    return np.hstack(chunks)

In [68]:
def _check_nested_equality(lst1, lst2):
    if isinstance(lst1, list) and isinstance(lst2, list):
        for l1, l2 in zip(lst1, lst2):
            if not _check_nested_equality(l1, l2):
                return False
    elif isinstance(lst1, np.ndarray) and isinstance(lst2, np.ndarray):
        return np.all(lst1 == lst2)
    else:
        return lst1 == lst2

    return True

In [69]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy scalars, NumPy arrays and dates.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump()`` or ``json.dumps()``.
    """

    def default(self, obj):  # noqa
        """Convert ``obj`` to a JSON-serialisable value.

        NumPy integers, floats and booleans become the matching Python scalar,
        arrays become nested lists, and ``date``/``datetime`` objects become
        ISO-8601 strings. Anything else is delegated to the base class, which
        raises ``TypeError``.
        """
        # Local import: the notebook's import cell does not provide these names.
        from datetime import date, datetime

        # The abstract np.integer / np.floating bases cover every fixed-width
        # NumPy scalar type and avoid aliases such as np.float_ that were
        # removed in NumPy 2.0.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()
        return super().default(obj)

In [70]:
def train_test(X, y, task_name, task_id, nominal_indices, clfs, save_path):
    """Cross-validate each classifier on one dataset and dump the results as JSON.

    For every stratified fold and every ``(name, estimator)`` pair in ``clfs``,
    a ``Pipeline`` (imputation / one-hot preprocessing followed by the
    estimator) is fitted on the first ``n`` training rows for each ``n`` in the
    sample-size schedule. The class probabilities predicted for the held-out
    fold are stored together with wall-clock train and test times.

    Reads the notebook-level globals ``cv``, ``vary_samples``, ``n_estimators``
    and ``overwrite``; they must be defined before this function is called.

    Parameters
    ----------
    X : 2D array of shape (n_samples, n_features)
    y : 1D array of class labels
        Assumed to be integer-coded ``0..n_classes-1``: the labels are fed to
        ``np.bincount`` and compared with ``argmax`` column indices.
    task_name : str
        Dataset name, stored in the results.
    task_id : int
        OpenML task id, stored in the results.
    nominal_indices : list of int
        Column indices of nominal features; all other columns are treated as
        numeric.
    clfs : list of (str, estimator) tuples
        Classifiers to evaluate. Each name becomes a key of the results dict.
    save_path : str or path-like
        JSON output file. It is written only if it does not exist yet or the
        global ``overwrite`` is true.

    Notes
    -----
    Layout of the saved dict, per classifier name ``c``:

    * ``results[c]`` is a list with one entry per fold; each entry is a list
      with one probability array per sample size.
    * ``results[f"{c}_metadata"]`` holds flat ``train_times`` / ``test_times``
      lists, appended fold by fold and, within a fold, size by size.
    """
    # Set up cross-validation. The fixed random_state makes the folds identical
    # on every call, so all classifiers are scored on the same splits.
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=0)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    if vary_samples:
        # Ten log-spaced training-set sizes, starting at two samples per class.
        sample_sizes = np.logspace(
            np.log10(n_classes * 2),
            np.log10(np.floor(len(y) * (cv - 1.1) / cv)),
            num=10,
            endpoint=True,
            dtype=int,
        )
    else:
        # len(y) exceeds any training fold, so the [:n] slices keep every row.
        sample_sizes = [len(y)]

    results_dict = {
        "task": task_name,
        "task_id": task_id,
        "n_samples": n_samples,
        "n_features": n_features,
        "n_classes": n_classes,
        "y": y,
        "test_indices": [],
        "n_estimators": n_estimators,
        "cv": cv,
        "nominal_features": len(nominal_indices),
        "sample_sizes": sample_sizes,
    }

    # Every column that is not nominal is treated as numeric.
    numeric_indices = np.delete(np.arange(X.shape[1]), nominal_indices)

    # Numeric preprocessing
    numeric_transformer = SimpleImputer(strategy="median")

    # Nominal preprocessing
    # NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`;
    # confirm the installed version still accepts `sparse`.
    nominal_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False)),
        ]
    )

    transformers = []
    if len(numeric_indices) > 0:
        transformers += [("numeric", numeric_transformer, numeric_indices)]
    if len(nominal_indices) > 0:
        transformers += [("nominal", nominal_transformer, nominal_indices)]
    preprocessor = ColumnTransformer(transformers=transformers)

    _, n_features_fitted = preprocessor.fit_transform(X, y).shape
    results_dict["n_features_fitted"] = n_features_fitted
    print(
        f"Features={n_features}, nominal={len(nominal_indices)} (After transforming={n_features_fitted})"
    )

    # Create the per-classifier containers once, before the fold loop, so that
    # results accumulate across folds instead of being reset on every fold.
    for clf_name, _ in clfs:
        results_dict[clf_name] = []
        results_dict[f"{clf_name}_metadata"] = {"train_times": [], "test_times": []}

    # split X, y data based on cross-validation
    for train_index, test_index in skf.split(X, y):
        # Store test indices (random state ensures consistency across clfs)
        results_dict["test_indices"].append(test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Reorder the training rows so every prefix of length n is roughly
        # class balanced.
        if vary_samples:
            stratified_sort = stratify_samplesizes(y_train, sample_sizes)
            X_train = X_train[stratified_sort]
            y_train = y_train[stratified_sort]

        for clf_name, clf in clfs:
            pipeline = Pipeline(
                steps=[("Preprocessor", preprocessor), ("Estimator", clf)]
            )
            clf_metadata = results_dict[f"{clf_name}_metadata"]
            probas_vs_sample_sizes = []

            # loop over all sample sizes (named n_train to avoid shadowing
            # the dataset-level n_samples above)
            for n_train in sample_sizes:
                start_time = time.time()
                y_sub = y_train[:n_train]
                # Fix too few samples for internal CV of these methods
                if (
                    clf_name in ["IRF", "SigRF"]
                    and np.min(np.unique(y_sub, return_counts=True)[1]) < 5
                ):
                    print(
                        f"{clf_name} requires more samples of minimum class. Skipping n={n_train}"
                    )
                    # Fall back to the empirical class prior for every test row.
                    # minlength keeps one column per class even when the
                    # highest-numbered classes are absent from the subsample.
                    y_proba = np.repeat(
                        np.bincount(y_sub, minlength=n_classes).reshape(1, -1)
                        / len(y_sub),
                        X_test.shape[0],
                        axis=0,
                    )
                    train_time = time.time() - start_time
                else:
                    pipeline = pipeline.fit(X_train[:n_train], y_sub)
                    train_time = time.time() - start_time
                    y_proba = pipeline.predict_proba(X_test)

                # Elapsed time since training finished.
                test_time = time.time() - (train_time + start_time)

                probas_vs_sample_sizes.append(y_proba)
                clf_metadata["train_times"].append(train_time)
                clf_metadata["test_times"].append(test_time)
                print(
                    f"{clf_name} Time: train_time={train_time:.3f}, test_time={test_time:.3f}, Cohen Kappa={cohen_kappa_score(y_test, y_proba.argmax(1)):.3f}"
                )

            # One entry per fold for this classifier.
            results_dict[clf_name].append(probas_vs_sample_sizes)

    # Save unless a results file already exists and overwriting is disabled.
    if not os.path.isfile(save_path) or overwrite:
        with open(save_path, "w") as fout:
            json.dump(results_dict, fout, cls=NumpyEncoder)

## Set Hyperparameters and Select Which Tasks to Run

In [63]:
# Forest settings shared by the classifiers constructed in the next cell.
n_jobs = -1  # passed as n_jobs to both forests
n_estimators = 500  # trees per forest; also recorded in every results file
feature_combinations = 1.5  # passed to ObliqueForestClassifier only
# NOTE(review): max_features is reassigned to None in the run-configuration cell
# further down, after the oblique forest has already been built with 1.0, so the
# results folder name reads "features=None".
max_features = 1.0

In [64]:
# Disabled alternatives: the rerf package's classifier in its "Base" and "RerF"
# projection modes.
# rerf_rf_clf = rerfClassifier(
#     n_estimators=n_estimators, projection_matrix="Base", n_jobs=n_jobs
# )
# rerf_clf = rerfClassifier(
#     n_estimators=n_estimators, projection_matrix="RerF", n_jobs=n_jobs
# )

# Oblique forest from oblique_forests.sporf, configured from the hyperparameter cell.
cy_of_clf = ObliqueForestClassifier(
    n_estimators=n_estimators,
    feature_combinations=feature_combinations,
    max_features=max_features,
    n_jobs=n_jobs,
)
# rf_clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

# Baseline: scikit-learn's RandomForestClassifier (imported as `skrf`).
skrf_clf = skrf(n_estimators=n_estimators, n_jobs=n_jobs)

# (name, estimator) pairs; each name becomes a key in the saved results and a
# prefix of the per-fold timing printout.
clfs = [("RF", skrf_clf), ("CYSPORF", cy_of_clf)]

In [65]:
# if both start/stop are None, then run on all tasks
start_id = None
stop_id = None

# label of this run; becomes part of the results folder name
name = "hackerman_master"
# re-run and overwrite tasks whose results file already exists
overwrite = True

# number of cross-validation folds
cv = 10

# False: train once on each full training fold; True: sweep over sample sizes
vary_samples = False

# hyperparameters of forest
# NOTE(review): this only affects the folder label below. The oblique forest was
# already constructed with max_features=1.0 in the classifier cell, so the
# "features=None" label may not describe the fitted model. Confirm.
max_features = None

# directory to save the output; set SPORF_DATA_DIR to override the default so
# the notebook does not depend on one user's home directory
data_dir = Path(os.environ.get("SPORF_DATA_DIR", "/home/adam2392/Downloads"))

# folder to save results
folder = data_dir / f"sporf_benchmarks/results_cv{cv}_features={max_features}_{name}"
folder.mkdir(exist_ok=True, parents=True)

In [66]:
# Collect the task IDs that fall inside [start_id, stop_id). A bound that is
# None is not applied. Iteration stops at the first ID that reaches stop_id.
task_ids_to_run = []
for task_id in benchmark_suite.tasks:
    if start_id is not None and task_id < start_id:
        print(f"Skipping task_id={task_id}")
    elif stop_id is not None and task_id >= stop_id:
        print(f"Stopping at task_id={task_id}")
        break
    else:
        task_ids_to_run.append(task_id)

print(f"Analyzing {task_ids_to_run} tasks.")

Analyzing [3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29, 31, 32, 37, 43, 45, 49, 53, 219, 2074, 2079, 3021, 3022, 3481, 3549, 3560, 3573, 3902, 3903, 3904, 3913, 3917, 3918, 7592, 9910, 9946, 9952, 9957, 9960, 9964, 9971, 9976, 9977, 9978, 9981, 9985, 10093, 10101, 14952, 14954, 14965, 14969, 14970, 125920, 125922, 146195, 146800, 146817, 146819, 146820, 146821, 146822, 146824, 146825, 167119, 167120, 167121, 167124, 167125, 167140, 167141] tasks.


## Run Benchmarks

In [None]:
# Run every selected task in turn; tqdm shows overall progress.
for task_id in tqdm(task_ids_to_run):  # iterate over all tasks
    # The task is fetched here only to obtain the dataset name for the output
    # path; _run_task_helper fetches it again by ID.
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    task_name = task.get_dataset().name

    # where to save results
    # NOTE(review): the file gets a .csv extension, but train_test writes it
    # with json.dump. Confirm downstream readers expect JSON.
    save_path = f"{folder}/cc_18_{task_name}_results_dict.csv"
    
    print(f'Going to run {task_name} now and save to {save_path}')
    _run_task_helper(clfs, task_id, save_path, overwrite)

  0%|          | 0/72 [00:00<?, ?it/s]

Going to run kr-vs-kp now and save to /home/adam2392/Downloads/sporf_benchmarks/results_cv10_features=None_hackerman_master/cc_18_kr-vs-kp_results_dict.csv
Running kr-vs-kp (3)
Running Train Test now...
Features=36, nominal=36 (After transforming=73)
RF Time: train_time=1.158, test_time=0.132, Cohen Kappa=0.975
CYSPORF Time: train_time=2.749, test_time=0.156, Cohen Kappa=0.987
RF Time: train_time=1.167, test_time=0.133, Cohen Kappa=0.981
CYSPORF Time: train_time=2.802, test_time=0.158, Cohen Kappa=0.987
RF Time: train_time=1.159, test_time=0.130, Cohen Kappa=0.981
CYSPORF Time: train_time=2.885, test_time=0.149, Cohen Kappa=0.987
RF Time: train_time=1.181, test_time=0.131, Cohen Kappa=0.987
CYSPORF Time: train_time=2.820, test_time=0.153, Cohen Kappa=0.994
RF Time: train_time=1.159, test_time=0.142, Cohen Kappa=0.987
CYSPORF Time: train_time=2.903, test_time=0.136, Cohen Kappa=0.987
RF Time: train_time=1.165, test_time=0.131, Cohen Kappa=1.000
CYSPORF Time: train_time=2.845, test_time=

  1%|▏         | 1/72 [00:45<53:18, 45.05s/it]

CYSPORF Time: train_time=2.943, test_time=0.156, Cohen Kappa=1.000
Going to run letter now and save to /home/adam2392/Downloads/sporf_benchmarks/results_cv10_features=None_hackerman_master/cc_18_letter_results_dict.csv
Running letter (6)
Running Train Test now...
Features=16, nominal=0 (After transforming=16)
RF Time: train_time=1.981, test_time=0.157, Cohen Kappa=0.975
CYSPORF Time: train_time=30.509, test_time=0.189, Cohen Kappa=0.973
RF Time: train_time=1.878, test_time=0.160, Cohen Kappa=0.963
CYSPORF Time: train_time=29.297, test_time=0.195, Cohen Kappa=0.960
RF Time: train_time=1.829, test_time=0.167, Cohen Kappa=0.968
CYSPORF Time: train_time=29.444, test_time=0.190, Cohen Kappa=0.968
RF Time: train_time=1.768, test_time=0.200, Cohen Kappa=0.970
CYSPORF Time: train_time=29.245, test_time=0.195, Cohen Kappa=0.967
RF Time: train_time=1.782, test_time=0.187, Cohen Kappa=0.965
CYSPORF Time: train_time=29.461, test_time=0.185, Cohen Kappa=0.963
RF Time: train_time=1.801, test_time=0.

### Old Runtime Checks

In [12]:
# Legacy sanity check: 3-fold cross-validated accuracy of each forest on the raw
# (un-imputed) data of every task in the suite.
# Defined here so the cell does not depend on a variable left over from an
# earlier kernel session. Maps dataset name -> {label -> list of fold scores}.
task_metrics = {}
for task_id in benchmark_suite.tasks:  # iterate over all tasks
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    X, y = task.get_X_and_y()  # get the data
    dataset_name = task.get_dataset().name
    performance = collections.defaultdict(list)
    print(task)

    # NOTE(review): `rf_clf` is only defined in a commented-out line of the
    # classifier cell, so this loop raises NameError on a fresh kernel until
    # that definition is restored.
    # The loop variable is `clf_label` rather than `name` so that it does not
    # clobber the global run label `name` used for the results folder.
    for clf_label, clf in zip(["SKRf", "Rf"], [skrf_clf, rf_clf]):
        print(clf_label, clf)
        score = cross_val_score(clf, X, y, cv=3)

        print("Data set: %s; Accuracy: %0.2f" % (dataset_name, score.mean()))

        performance[clf_label].extend(score)

    print(f"Completed {task_id}")
    task_metrics[dataset_name] = performance

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3
Task URL.............: https://www.openml.org/t/3
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: kr-vs-kp; Accuracy: 0.94
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: kr-vs-kp; Accuracy: 0.94
Completed 3
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 6
Task URL.............: https://www.openml.org/t/6
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 26
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: letter; Accuracy: 0.96
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: letter; Accuracy: 0.96
Com

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3

Data set: breast-w; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/

Data set: breast-w; Accuracy: nan
Completed 15
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 16
Task URL.............: https://www.openml.org/t/16
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 10
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: mfeat-karhunen; Accuracy: 0.97
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: mfeat-karhunen; Accuracy: 0.96
Completed 16
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 18
Task URL.............: https://www.openml.org/t/18
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 10
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: mfeat-morphological; Accuracy: 0.72
Rf RandomFore

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3

Data set: credit-approval; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/

Data set: credit-approval; Accuracy: nan
Completed 29
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 31
Task URL.............: https://www.openml.org/t/31
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: credit-g; Accuracy: 0.77
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: credit-g; Accuracy: 0.76
Completed 31
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 32
Task URL.............: https://www.openml.org/t/32
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 10
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: pendigits; Accuracy: 0.98
Rf RandomForestClassifier(n_e

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3

Data set: eucalyptus; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/

Data set: eucalyptus; Accuracy: nan
Completed 2079
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3021
Task URL.............: https://www.openml.org/t/3021
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3

Data set: sick; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/

Data set: sick; Accuracy: nan
Completed 3021
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3022
Task URL.............: https://www.openml.org/t/3022
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 6
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: vowel; Accuracy: 0.60
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: vowel; Accuracy: 0.59
Completed 3022
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3481
Task URL.............: https://www.openml.org/t/3481
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 26
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: isolet; Accuracy: 0.94
Rf RandomForestClassifier(n_estimator

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    else:
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validatio

Data set: jm1; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    else:
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/va

Data set: jm1; Accuracy: nan
Completed 3904
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3913
Task URL.............: https://www.openml.org/t/3913
Estimation Procedure.: crossvalidation
Target Feature.......: problems
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: kc2; Accuracy: 0.80
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: kc2; Accuracy: 0.79
Completed 3913
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3917
Task URL.............: https://www.openml.org/t/3917
Estimation Procedure.: crossvalidation
Target Feature.......: defects
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: kc1; Accuracy: 0.81
Rf RandomForestClassifier(n_estimators=50

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-p

Data set: adult; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8

Data set: adult; Accuracy: nan
Completed 7592
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 9910
Task URL.............: https://www.openml.org/t/9910
Estimation Procedure.: crossvalidation
Target Feature.......: target
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: Bioresponse; Accuracy: 0.79
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: Bioresponse; Accuracy: 0.79
Completed 9910
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 9946
Task URL.............: https://www.openml.org/t/9946
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: wdbc; Accuracy: 0.96
Rf RandomForestClassifier(

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-p

Data set: cylinder-bands; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8

Data set: cylinder-bands; Accuracy: nan
Completed 14954
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 14965
Task URL.............: https://www.openml.org/t/14965
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: bank-marketing; Accuracy: 0.63
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: bank-marketing; Accuracy: 0.63
Completed 14965
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 14969
Task URL.............: https://www.openml.org/t/14969
Estimation Procedure.: crossvalidation
Target Feature.......: Phase
# of Classes.........: 5
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: GesturePhaseSegmentationPro

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-p

Data set: dresses-sales; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8

Data set: dresses-sales; Accuracy: nan
Completed 125920
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 125922
Task URL.............: https://www.openml.org/t/125922
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 11
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: texture; Accuracy: 0.98
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: texture; Accuracy: 0.98
Completed 125922
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 146195
Task URL.............: https://www.openml.org/t/146195
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 3
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: connect-4; Accuracy: 0.57
Rf Random

Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-p

Data set: MiceProtein; Accuracy: nan
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)


Traceback (most recent call last):
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/adam2392/Documents/manifold_random_forests/oblique_forests/ensemble/_forest.py", line 308, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/adam2392/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/adam2392/.local/lib/python3.8

Data set: MiceProtein; Accuracy: nan
Completed 146800
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 146817
Task URL.............: https://www.openml.org/t/146817
Estimation Procedure.: crossvalidation
Target Feature.......: target
# of Classes.........: 7
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: steel-plates-fault; Accuracy: 0.59
Rf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: steel-plates-fault; Accuracy: 0.58
Completed 146817
OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 146819
Task URL.............: https://www.openml.org/t/146819
Estimation Procedure.: crossvalidation
Target Feature.......: outcome
# of Classes.........: 2
Cost Matrix..........: Available
SKRf RandomForestClassifier(n_estimators=500, n_jobs=-1)
Data set: climate-model

In [None]:
# Benchmark the oblique forest on every task of the OpenML benchmark suite
# using 3-fold cross-validation.
#
# Relies on names created in other cells: `benchmark_suite` (the suite
# object), `cy_of_clf` (the classifier under test) and `task_metrics` (a
# mapping that collects the per-fold scores, keyed by dataset name).
for task_id in benchmark_suite.tasks:  # iterate over all tasks
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    X, y = task.get_X_and_y()  # get the data
    performance = collections.defaultdict(list)
    print(task)

    # Resolve the dataset name once per task; it is needed for every log line
    # below and as the key under which the scores are stored.
    dataset_name = task.get_dataset().name

    for name, clf in zip(["Of"], [cy_of_clf]):
        print(name, clf)
        scores = cross_val_score(clf, X, y, cv=3)

        # With its default `error_score`, cross_val_score reports a fold whose
        # fit raised as NaN instead of propagating the exception. Call those
        # folds out explicitly so a "nan" accuracy is not mistaken for a
        # genuine result. The raw scores are still stored unchanged.
        n_failed = int(np.isnan(scores).sum())
        if n_failed:
            print(
                f"WARNING: {n_failed}/{len(scores)} CV folds failed for "
                f"{dataset_name} ({task_id}); their scores are NaN"
            )

        print(f"Data set: {dataset_name}; Accuracy: {scores.mean():0.2f}")

        performance[name].extend(scores)

    print(f"Completed {task_id}")
    task_metrics[dataset_name] = performance