# OpenML CC-18

### Load in datasets

In [1]:
import openml
import sklearn
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
import math
from math import log

import warnings
# Install a global filter that ignores all warnings raised from here on.
warnings.filterwarnings('ignore')

# Fetch the OpenML-CC18 suite; its ``tasks`` attribute is iterated further down.
benchmark_suite = openml.study.get_suite('OpenML-CC18')  # obtain the benchmark suite

### Hyperparameter optimization function

In [2]:
def hyperparameter_optimization_grid(X, y, *argv):
    """
    Find the best hyperparameters for each classifier with an exhaustive grid search.

    Every (classifier, hyperparameters) pair is searched independently with
    GridSearchCV using 10-fold cross-validation.

    Parameters
    ----------
    X : numpy.ndarray
        Input data, shape (n_samples, n_features)
    y : numpy.ndarray
        Output data, shape (n_samples, n_outputs)
    *argv : tuples (classifier, hyperparameters)
        Any number of (classifier, hyperparameters) tuples:
        classifier : sklearn-compliant classifier
            For example sklearn.ensemble.RandomForestClassifier, rerf.rerfClassifier, etc
        hyperparameters : dictionary of hyperparameter grids
            See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html.

    Returns
    -------
    clf_best_params : dictionary
        Maps each classifier object that was passed in to the dictionary of
        its best hyperparameters. Empty when no pairs are given.
    """
    import inspect

    clf_best_params = {}

    # Nothing to search: return early so no search machinery is touched.
    if not argv:
        return clf_best_params

    # The ``iid`` argument was deprecated in scikit-learn 0.22 and removed in
    # 0.24, where passing it raises TypeError. Only forward ``iid=False`` when
    # the installed GridSearchCV still accepts it, which keeps the averaging
    # behaviour identical on old releases and matches the default on new ones.
    search_kwargs = {"cv": 10}
    if "iid" in inspect.signature(GridSearchCV).parameters:
        search_kwargs["iid"] = False

    # Iterate over all (classifier, hyperparameters) pairs
    for clf, params in argv:

        # Run grid search
        grid_search = GridSearchCV(clf, param_grid=params, **search_kwargs)
        grid_search.fit(X, y)

        # Save results, keyed by the classifier object itself
        clf_best_params[clf] = grid_search.best_params_

    return clf_best_params

### Find optimized hyperparameters

In [None]:
# Shape of every processed data set, and the search result for each task.
dimen_CC18 = []
best_params = []

# Only the first five tasks of the suite are processed here; widen the slice
# to cover more of the benchmark.
for task_id in benchmark_suite.tasks[0:5]:
    # Per-task timer; the reported time covers download plus grid search.
    startTime = datetime.now()

    task = openml.tasks.get_task(task_id)  # download the OpenML task
    dataset_name = task.get_dataset().name
    X_CC18, y_CC18 = task.get_X_and_y()  # get the data

    data_shape = np.shape(X_CC18)
    dimen_CC18.append(data_shape)
    n_samples = data_shape[0]
    n_features = data_shape[1]

    # build a classifier
    clf = rerfClassifier(n_estimators=100)

    # specify max_depth and min_samples_split ranges: roughly ten evenly
    # spaced integers up to log(n_samples), de-duplicated after rounding.
    # NOTE(review): the step is zero or negative when log(n_samples) <= 2
    # (n_samples <= 7) — confirm no such data set is expected.
    log_n_samples = math.log(n_samples)
    step = (log_n_samples - 2) / 10

    max_depth_array = np.unique(
        np.round(np.arange(2, log_n_samples, step))
    ).astype(int)
    # None is appended as an extra max_depth candidate.
    max_depth_range = np.append(max_depth_array, None)

    min_sample_splits_range = np.unique(
        np.round(np.arange(1, log_n_samples, step))
    ).astype(int)

    # specify the parameter grid to search over
    param_dist = {
        "n_estimators": np.arange(100, 550, 25),
        "max_depth": max_depth_range,
        "min_samples_split": min_sample_splits_range,
        "feature_combinations": [1, 2, 3, 4, 5],
        "max_features": ["auto", "sqrt", "log2", None, n_features**2],
    }

    clf_best_params = hyperparameter_optimization_grid(
        X_CC18, y_CC18, (clf, param_dist)
    )
    best_params.append(clf_best_params)

    # The result is keyed by the classifier object, not by position.
    tuned = clf_best_params[clf]
    elapsed = datetime.now() - startTime

    print(task_id)
    print('Data set: %s: ' % dataset_name)
    print(clf_best_params)
    print('Time: ' + str(elapsed))

    # Open the log only for the write so the handle is always closed, even if
    # a later iteration fails. Values are written with %s because max_depth
    # may be None and max_features may be a string.
    with open("SPORF_accuracies_CC-18_hyperpara.txt", "a") as f:
        f.write(
            '%i,%s,%s,%s,%s,%s,%s,%s\n'
            % (
                task_id,
                dataset_name,
                str(elapsed),
                tuned["n_estimators"],
                tuned["max_depth"],
                tuned["min_samples_split"],
                tuned["feature_combinations"],
                tuned["max_features"],
            )
        )



### Run SPORF with optimized hyperparameters

In [2]:
# clf = sklearn.pipeline.make_pipeline(sklearn.preprocessing.Imputer(), rerfClassifier())

# for task_id in benchmark_suite.tasks[68:]:  # iterate over all tasks
#     try:
#         f = open("SPORF_accuracies_CC-18.txt","a")
#         startTime = datetime.now()
#         task = openml.tasks.get_task(task_id)  # download the OpenML task
#         openml.config.apikey = '204cdba18d110fd68ad24b131ea92030'  # set the OpenML Api Key
#         run = openml.runs.run_model_on_task(clf, task)  # run the classifier on the task
#         score = run.get_metric_fn(sklearn.metrics.accuracy_score)  # print accuracy score
#         print(task_id)
#         print('Data set: %s; Accuracy: %0.4f' % (task.get_dataset().name,score.mean()))
#         print('Time: '+ str(datetime.now() - startTime))
#         f.write('%i,%s,%0.4f,%s\n' % (task_id,task.get_dataset().name,score.mean(),str(datetime.now() - startTime)))
#         f.close()
#     except:
#         print('Error in' + str(task_id))


167124
Data set: CIFAR_10; Accuracy: 0.4848
Time: 4:38:49.795906
167125
Data set: Internet-Advertisements; Accuracy: 0.9771
Time: 0:08:30.094706
167140
Data set: dna; Accuracy: 0.9513
Time: 0:01:19.666527
167141
Data set: churn; Accuracy: 0.9380
Time: 0:01:06.676688
