### Load in datasets

In [1]:
import os
from datetime import datetime

import openml
import sklearn
import sklearn.impute
import sklearn.pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

# Fetch both OpenML benchmark suites up front (CC18 first, then OpenML100);
# the classifier cells below iterate over each suite's `.tasks`.
benchmark_suite_CC18, benchmark_suite_100 = (
    openml.study.get_suite('OpenML-CC18'),
    openml.study.get_suite('OpenML100'),
)

#### The following sections use the OpenML-CC18 and OpenML100 suites and classify each dataset using sklearn's RandomForest and ExtraTrees classifiers with default parameters. The task IDs, accuracies, and runtimes for each dataset are compiled into .txt files for later analysis.

### RFClassifier

In [2]:
# Build the scikit-learn pipeline once: impute missing values (SimpleImputer's
# default strategy), then fit a default-parameter random forest.
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed in
# scikit-learn 0.22.
clf = sklearn.pipeline.make_pipeline(sklearn.impute.SimpleImputer(),
                                     sklearn.ensemble.RandomForestClassifier())

# Take the OpenML API key from the environment instead of committing it to the
# notebook. When OPENML_APIKEY is unset, whatever key openml already has
# configured is left untouched. Set once here rather than on every iteration.
api_key = os.environ.get('OPENML_APIKEY')
if api_key:
    openml.config.apikey = api_key

# (suite, results file) pairs -- both suites are processed identically, so a
# single loop body serves both instead of two copy-pasted loops.
rf_runs = (
    (benchmark_suite_CC18, "RF_accuracies_CC-18_21020.txt"),
    (benchmark_suite_100, "RF_accuracies_100_21020.txt"),
)

for suite, results_path in rf_runs:
    for task_id in suite.tasks:  # iterate over all tasks in the suite
        try:
            startTime = datetime.now()
            task = openml.tasks.get_task(task_id)  # download the OpenML task
            run = openml.runs.run_model_on_task(clf, task)  # run classifier on the task's splits
            score = run.get_metric_fn(sklearn.metrics.accuracy_score)  # accuracy scores; averaged below
            dataset_name = task.get_dataset().name  # looked up once, reused for print and file
            elapsed = datetime.now() - startTime  # one measurement so console and file agree
            print('Data set: %s; Accuracy: %0.2f' % (dataset_name, score.mean()))
            print('Time: ' + str(elapsed))
            # Open the results file only when there is a row to append; `with`
            # guarantees the handle is closed even if the write fails.
            with open(results_path, "a") as f:
                f.write('%i,%s,%0.4f,%s,\n' % (task_id, dataset_name, score.mean(), str(elapsed)))
        except Exception as err:
            # Best-effort: report the failing task and move on to the next one.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
            print('Error in' + str(task_id) + ': ' + repr(err))

Data set: kr-vs-kp; Accuracy: 0.99
Time: 0:00:01.526601
Data set: letter; Accuracy: 0.94
Time: 0:00:02.926931
Data set: balance-scale; Accuracy: 0.82
Time: 0:00:00.724396
Data set: mfeat-factors; Accuracy: 0.95
Time: 0:00:01.880905
Data set: mfeat-fourier; Accuracy: 0.80
Time: 0:00:01.663376
Data set: breast-w; Accuracy: 0.97
Time: 0:00:00.717504
Data set: mfeat-karhunen; Accuracy: 0.92
Time: 0:00:01.648532
Data set: mfeat-morphological; Accuracy: 0.70
Time: 0:00:00.839850
Data set: mfeat-zernike; Accuracy: 0.73
Time: 0:00:01.438814
Data set: cmc; Accuracy: 0.51
Time: 0:00:00.921855
Data set: optdigits; Accuracy: 0.97
Time: 0:00:01.522000
Data set: credit-approval; Accuracy: 0.86
Time: 0:00:00.890651
Data set: credit-g; Accuracy: 0.74
Time: 0:00:01.000023
Data set: pendigits; Accuracy: 0.99
Time: 0:00:01.846530
Data set: diabetes; Accuracy: 0.76
Time: 0:00:00.731372
Data set: spambase; Accuracy: 0.94
Time: 0:00:01.357202
Data set: splice; Accuracy: 0.93
Time: 0:00:02.529708
Data set: t

Data set: tamilnadu-electricity; Accuracy: 0.99
Time: 0:00:06.546566
Data set: hill-valley; Accuracy: 0.57
Time: 0:00:01.717879
Data set: ilpd; Accuracy: 0.69
Time: 0:00:00.736610
Data set: madelon; Accuracy: 0.64
Time: 0:00:04.173558
Data set: nomao; Accuracy: 0.97
Time: 0:00:08.921232
Data set: ozone-level-8hr; Accuracy: 0.94
Time: 0:00:01.336889
Data set: cardiotocography; Accuracy: 1.00
Time: 0:00:00.974932
Data set: climate-model-simulation-crashes; Accuracy: 0.91
Time: 0:00:00.797164
Data set: cnae-9; Accuracy: 0.91
Time: 0:00:02.925310
Data set: eeg-eye-state; Accuracy: 0.90
Time: 0:00:02.880070
Data set: first-order-theorem-proving; Accuracy: 0.61
Time: 0:00:02.526635
Data set: gas-drift; Accuracy: 0.99
Time: 0:00:10.358647
Data set: banknote-authentication; Accuracy: 0.99
Time: 0:00:00.762282
Data set: blood-transfusion-service-center; Accuracy: 0.75
Time: 0:00:00.747457
Data set: artificial-characters; Accuracy: 0.92
Time: 0:00:01.545461
Data set: bank-marketing; Accuracy: 0.

### ExtraTrees Classifier

In [3]:
# Build the scikit-learn pipeline once: impute missing values (SimpleImputer's
# default strategy), then fit a default-parameter extra-trees ensemble.
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed in
# scikit-learn 0.22.
clf = sklearn.pipeline.make_pipeline(sklearn.impute.SimpleImputer(),
                                     sklearn.ensemble.ExtraTreesClassifier())

# Take the OpenML API key from the environment instead of committing it to the
# notebook. When OPENML_APIKEY is unset, whatever key openml already has
# configured is left untouched. Set once here rather than on every iteration.
api_key = os.environ.get('OPENML_APIKEY')
if api_key:
    openml.config.apikey = api_key

# (suite, results file) pairs -- both suites are processed identically, so a
# single loop body serves both instead of two copy-pasted loops.
et_runs = (
    (benchmark_suite_CC18, "ET_accuracies_CC-18_21020.txt"),
    (benchmark_suite_100, "ET_accuracies_100_21020.txt"),
)

for suite, results_path in et_runs:
    for task_id in suite.tasks:  # iterate over all tasks in the suite
        try:
            startTime = datetime.now()
            task = openml.tasks.get_task(task_id)  # download the OpenML task
            run = openml.runs.run_model_on_task(clf, task)  # run classifier on the task's splits
            score = run.get_metric_fn(sklearn.metrics.accuracy_score)  # accuracy scores; averaged below
            dataset_name = task.get_dataset().name  # looked up once, reused for print and file
            elapsed = datetime.now() - startTime  # one measurement so console and file agree
            print('Data set: %s; Accuracy: %0.2f' % (dataset_name, score.mean()))
            print('Time: ' + str(elapsed))
            # Open the results file only when there is a row to append; `with`
            # guarantees the handle is closed even if the write fails.
            with open(results_path, "a") as f:
                f.write('%i,%s,%0.4f,%s,\n' % (task_id, dataset_name, score.mean(), str(elapsed)))
        except Exception as err:
            # Best-effort: report the failing task and move on to the next one.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
            print('Error in' + str(task_id) + ': ' + repr(err))

Data set: kr-vs-kp; Accuracy: 0.99
Time: 0:00:02.298066
Data set: letter; Accuracy: 0.95
Time: 0:00:02.917889
Data set: balance-scale; Accuracy: 0.82
Time: 0:00:00.714775
Data set: mfeat-factors; Accuracy: 0.96
Time: 0:00:01.477624
Data set: mfeat-fourier; Accuracy: 0.80
Time: 0:00:01.063864
Data set: breast-w; Accuracy: 0.96
Time: 0:00:00.714220
Data set: mfeat-karhunen; Accuracy: 0.91
Time: 0:00:01.046754
Data set: mfeat-morphological; Accuracy: 0.69
Time: 0:00:00.857871
Data set: mfeat-zernike; Accuracy: 0.75
Time: 0:00:00.990645
Data set: cmc; Accuracy: 0.49
Time: 0:00:00.912858
Data set: optdigits; Accuracy: 0.97
Time: 0:00:01.376934
Data set: credit-approval; Accuracy: 0.87
Time: 0:00:00.902440
Data set: credit-g; Accuracy: 0.74
Time: 0:00:00.988418
Data set: pendigits; Accuracy: 0.99
Time: 0:00:01.255706
Data set: diabetes; Accuracy: 0.73
Time: 0:00:00.693852
Data set: spambase; Accuracy: 0.95
Time: 0:00:01.202287
Data set: splice; Accuracy: 0.92
Time: 0:00:02.347314
Data set: t

Data set: tamilnadu-electricity; Accuracy: 0.99
Time: 0:00:03.618403
Data set: hill-valley; Accuracy: 0.57
Time: 0:00:01.090678
Data set: ilpd; Accuracy: 0.72
Time: 0:00:00.718591
Data set: madelon; Accuracy: 0.56
Time: 0:00:02.800138
Data set: nomao; Accuracy: 0.97
Time: 0:00:07.482910
Data set: ozone-level-8hr; Accuracy: 0.94
Time: 0:00:01.015006
Data set: cardiotocography; Accuracy: 1.00
Time: 0:00:00.805691
Data set: climate-model-simulation-crashes; Accuracy: 0.90
Time: 0:00:00.707234
Data set: cnae-9; Accuracy: 0.92
Time: 0:00:02.848758
Data set: eeg-eye-state; Accuracy: 0.91
Time: 0:00:01.779308
Data set: first-order-theorem-proving; Accuracy: 0.61
Time: 0:00:01.608508
Data set: gas-drift; Accuracy: 0.99
Time: 0:00:03.499359
Data set: banknote-authentication; Accuracy: 1.00
Time: 0:00:00.992518
Data set: blood-transfusion-service-center; Accuracy: 0.74
Time: 0:00:01.561361
Data set: artificial-characters; Accuracy: 0.95
Time: 0:00:01.357129
Data set: bank-marketing; Accuracy: 0.

#### The following sections use the OpenML-CC18 and OpenML100 suites and classify each dataset using sklearn's RandomForest and ExtraTrees classifiers with optimized hyperparameters. The task IDs, accuracies, and runtimes for each dataset are compiled into .txt files for later analysis.