# Plot CC_18 Benchmarks

In [1]:
%load_ext lab_black

In [26]:
import sys
from pathlib import Path
import numpy as np
import collections
import pickle
from pathlib import Path

import pandas as pd

from sklearn.metrics import cohen_kappa_score

sys.path.append("../")

from oblique_forests.sporf import ObliqueForestClassifier
from rerf.rerfClassifier import rerfClassifier

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Get the data

In [5]:
# if both start/stop are None, then run on all tasks
start_id = None
stop_id = None

name = "hackerman_master"
overwrite = True

# cross validation
cv = 10

vary_samples = False

# hyperparameters of forest
max_features = None

# directory to save the output
data_dir = Path("/home/adam2392/Downloads")

# folder to save results
folder = data_dir / f"sporf_benchmarks/results_cv{cv}_features={max_features}"

result_files = [f for f in folder.glob("*.pkl")]
print(len(result_files))

24


In [55]:
tasks = []
n_samples = []
n_classes = []
sporf_cohens = []
rf_cohens = []
task_ids = []

for fpath in result_files:
    with open(fpath, "rb") as fin:
        result_dict = pickle.load(fin)

    # number of stratified cross-validations
    cv = result_dict["cv"]
    fold_test_inds = result_dict["test_indices"]
    y = result_dict["y"]

    # extract metadata of benchmark experiment
    tasks.append(result_dict["task"])
    n_samples.append(result_dict["n_samples"])
    n_classes.append(result_dict["n_classes"])
    task_ids.append(result_dict["task_id"])
    # compute cohen kappa for both classifiers
    for clf in ["RF", "SPORF"]:
        clf_cohens = []
        fold_probas = result_dict[clf]

        # compute statistic on each fold
        for ifold in range(cv):
            y_proba = fold_probas[ifold][0]
            y_test = y[fold_test_inds[ifold]]
            kappa_score = cohen_kappa_score(y_test, y_proba.argmax(1))
            clf_cohens.append(kappa_score)

        if clf == "RF":
            rf_cohens.append(clf_cohens)
        else:
            sporf_cohens.append(clf_cohens)

In [59]:
print(max(task_ids))

3022


In [34]:
print(np.array(rf_cohens).shape)

(24, 10)


In [47]:
rf_df = pd.DataFrame(rf_cohens)
rf_df["clf"] = "rf"
rf_df = pd.concat((rf_df, result_df), axis=1)

sporf_df = pd.DataFrame(sporf_cohens)
sporf_df["clf"] = "sporf"
sporf_df = pd.concat((sporf_df, result_df), axis=1)

diff_arr = np.array(sporf_cohens) - np.array(rf_cohens)
diff_df = pd.DataFrame(diff_arr)
diff_df = pd.concat((diff_df, result_df), axis=1)

# now form the final dataframe
data_df = pd.concat((rf_df, sporf_df), axis=0)

# print(rf_df.shape)
# display(rf_df.head())
print(data_df.shape)
print(diff_df.shape)
display(diff_df.head())
display(data_df.head())

(48, 14)
(24, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,task,n_samples,n_classes
0,-0.021755,-0.03132,-0.00447,-0.015673,-0.029481,-0.013011,-0.02472,-0.02384779,-0.033643,-0.027564,electricity,45312,2
1,0.013521,0.0104,0.01352,0.01352,0.01404,0.02028,0.016121,0.0249607,0.02548,0.014561,letter,20000,26
2,0.022222,0.033333,0.027778,0.055556,0.022222,0.044444,0.011111,-1.110223e-16,0.022222,0.038889,mfeat-zernike,2000,10
3,0.032409,-0.036712,-0.002091,-0.076747,-0.013029,0.086387,0.01479,0.01796782,0.017947,0.034055,eucalyptus,736,5
4,0.007549,0.0058,0.007847,0.028841,-0.000197,0.005583,-0.000227,0.009911275,0.030455,-0.007639,satimage,6430,6


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,clf,task,n_samples,n_classes
0,0.866237,0.858064,0.838017,0.838281,0.86891,0.856792,0.846496,0.85881,0.861563,0.868185,rf,electricity,45312,2
1,0.957357,0.951638,0.954758,0.952678,0.949558,0.945398,0.944357,0.940198,0.936558,0.949557,rf,letter,20000,26
2,0.716667,0.744444,0.722222,0.705556,0.75,0.711111,0.783333,0.761111,0.727778,0.716667,rf,mfeat-zernike,2000,10
3,0.501857,0.62069,0.598869,0.581527,0.439924,0.566846,0.594836,0.645545,0.527225,0.608102,rf,eucalyptus,736,5
4,0.88411,0.882472,0.876588,0.888363,0.888526,0.898105,0.884393,0.899682,0.884786,0.899701,rf,satimage,6430,6


In [54]:
# melt the dataframe
diff_df_melt = pd.melt(
    diff_df,
    id_vars=["task", "n_samples", "n_classes"],
    value_name="delta_cohen_kappa",
    var_name="cv_fold",
)

print(diff_df.shape)
print(diff_df_melt.shape)
display(diff_df_melt)

(24, 13)
(240, 5)


Unnamed: 0,task,n_samples,n_classes,cv_fold,delta_cohen_kappa
0,electricity,45312,2,0,-0.021755
1,letter,20000,26,0,0.013521
2,mfeat-zernike,2000,10,0,0.022222
3,eucalyptus,736,5,0,0.032409
4,satimage,6430,6,0,0.007549
...,...,...,...,...,...
235,cmc,1473,3,9,-0.005567
236,balance-scale,625,3,9,0.130816
237,sick,3772,2,9,0.021209
238,credit-approval,690,2,9,-0.030978


# Create Plots