# Analysis of Rerf-sporf, Py-sporf and Cythonized-sporf Performance

Here, we are interested in looking at the runtime of each implementation of SPORF on a fixed classification task.

Namely, we will utilize the orthant and sparse-parity tasks in the original SPORF paper.

In [1]:
%load_ext lab_black

In [2]:
import sys
from pathlib import Path
import numpy as np
import collections

# from sklearn.ensemble import RandomForestClassifier as rfc

sys.path.append("../")

from oblique_forests.sporf import ObliqueForestClassifier, PythonObliqueForestClassifier
from rerf.rerfClassifier import rerfClassifier

%load_ext autoreload
%autoreload 2

In [3]:
def load_data(n, data_path, exp_name):
    """Function to load in data as a function of sample size."""
    ftrain = data_path / f"{exp_name}_train_{n}.npy"
    ftest = data_path / f"{exp_name}_test.npy"

    dftrain = np.load(ftrain)
    dftest = np.load(ftest)

    X_train = dftrain[:, :-1]
    y_train = dftrain[:, -1]

    X_test = dftest[:, :-1]
    y_test = dftest[:, -1]

    return X_train, y_train, X_test, y_test

In [4]:
def test_rf(n, reps, n_estimators, exp_name):
    """Test traditional RF classifier"""

    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = rerfClassifier(
            n_estimators=n_estimators, projection_matrix="Base", n_jobs=8
        )

        clf.fit(X_train, y_train)

        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output_ali/rf_{exp_name}_preds_{n}.npy", preds)
    return acc


def test_rerf(n, reps, n_estimators, feature_combinations, max_features, exp_name):
    """Test SPORF rerf implemnetation."""
    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = rerfClassifier(
            n_estimators=n_estimators,
            projection_matrix="RerF",
            feature_combinations=feature_combinations,
            max_features=max_features,
            n_jobs=8,
        )

        clf.fit(X_train, y_train)

        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output_ali/rerf_{exp_name}_preds_" + str(n) + ".npy", preds)
    return acc


def test_cython_of(n, reps, n_estimators, feature_combinations, max_features, exp_name):
    """Test SPORF rerf implemnetation."""
    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = ObliqueForestClassifier(
            n_estimators=n_estimators,
            feature_combinations=feature_combinations,
            max_features=max_features,
            n_jobs=8,
        )

        clf.fit(X_train, y_train)

        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output_ali/cythonof_{exp_name}_preds_" + str(n) + ".npy", preds)
    return acc


def test_python_of(n, reps, n_estimators, feature_combinations, max_features, exp_name):
    """Test PySporf."""
    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = PythonObliqueForestClassifier(
            n_estimators=n_estimators,
            feature_combinations=feature_combinations,
            max_features=max_features,
            n_jobs=8,
        )

        clf.fit(X_train, y_train)
        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output_ali/of_{exp_name}_preds_" + str(n) + ".npy", preds)
    return acc

In [5]:
data_path = Path("./data/")

In [6]:
# How many samples to train on
n = 1000

# How many repetitions
reps = 3

# experiment name
exp_name = "sparse_parity"
# exp_name = 'orthant'

# Tree parameters
n_estimators = 100
feature_combinations = 2
max_features = 1.0

# Test Classification Performance

In [7]:
acc = test_python_of(n, reps, n_estimators, feature_combinations, max_features, exp_name)
print(acc)

KeyboardInterrupt: 

In [8]:
acc = test_rerf(n, reps, n_estimators, feature_combinations, max_features, exp_name)
print(acc)

NameError: name 'rfc' is not defined

In [None]:
acc = test_rf(n, reps, n_estimators, exp_name)
print(acc)

In [19]:
acc = test_cython_of(n, reps, n_estimators, feature_combinations, max_features, exp_name)
print(acc)

Inside build...
finished resizing...
splitter:  <oblique_forests.tree._oblique_splitter.ObliqueSplitter object at 0x7f73c0063be0>
Splitter initialized...
Got to nogil part.
Inside build...
finished resizing...
splitter:  <oblique_forests.tree._oblique_splitter.ObliqueSplitter object at 0x7f74140644c0>
Splitter initialized...
Got to nogil part.
Inside build...
finished resizing...
splitter:  <oblique_forests.tree._oblique_splitter.ObliqueSplitter object at 0x7f7408063e90>
Splitter initialized...
Got to nogil part.
Inside build...
finished resizing...
splitter:  <oblique_forests.tree._oblique_splitter.ObliqueSplitter object at 0x7f741c064110>
Splitter initialized...
Got to nogil part.
Inside build...
finished resizing...
splitter:  <oblique_forests.tree._oblique_splitter.ObliqueSplitter object at 0x7f73e80641a0>
Splitter initialized...
Got to nogil part.
Inside build...
finished resizing...
splitter:  <oblique_forests.tree._oblique_splitter.ObliqueSplitter object at 0x7f744c064180>
Split

ValueError: This DecisionTreeClassifier estimator requires y to be passed, but the target y is None.

# Test Actual Runtime

In [7]:
# keep track of a list of runtimes
n_list = collections.defaultdict(list)

In [8]:
clf = rerfClassifier(n_estimators=n_estimators, projection_matrix="Base", n_jobs=8)
rerf_clf = rerfClassifier(n_estimators=n_estimators, projection_matrix="RerF", n_jobs=8)
py_of_clf = PythonObliqueForestClassifier(
    n_estimators=n_estimators,
    feature_combinations=feature_combinations,
    max_features=max_features,
    n_jobs=8,
)
cy_of_clf = ObliqueForestClassifier(
    n_estimators=n_estimators,
    feature_combinations=feature_combinations,
    max_features=max_features,
    n_jobs=8,
)

## 1000 sample size

In [9]:
# run on higher sample size now
# How many samples to train on
n = 1000
X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

In [12]:
time = %timeit -n 1 -o clf.fit(X_train, y_train)
n_list["BaseRF"].append(np.mean(time.timings))

32.5 ms ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
time = %timeit -n 1 -o rerf_clf.fit(X_train, y_train)
n_list["ReRF-Sporf"].append(np.mean(time.timings))

36.9 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
time = %timeit -n 1 -o py_of_clf.fit(X_train, y_train)
n_list["Py-Sporf"].append(np.mean(time.timings))

19.4 s ± 319 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# with pointer arrays (old)
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

311 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

209 ms ± 4.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# with std::vectors
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

212 ms ± 9.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 5000 sample size

In [11]:
# run on higher sample size now
# How many samples to train on
n = 5000

In [12]:
X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

In [18]:
time = %timeit -n 1 -o clf.fit(X_train, y_train)
n_list["BaseRF"].append(np.mean(time.timings))

188 ms ± 12.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
time = %timeit -n 1 -o rerf_clf.fit(X_train, y_train)
n_list["ReRF-Sporf"].append(np.mean(time.timings))

178 ms ± 2.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

582 ms ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# with pointer arrays 
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

680 ms ± 6.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# with std::vectors
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

579 ms ± 8.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 10000 sample size

In [14]:
# run on higher sample size now
# How many samples to train on
n = 10000

In [15]:
X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

In [23]:
time = %timeit -n 1 -o clf.fit(X_train, y_train)
n_list["BaseRF"].append(np.mean(time.timings))

371 ms ± 39.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
time = %timeit -n 1 -o rerf_clf.fit(X_train, y_train)
n_list["ReRF-Sporf"].append(np.mean(time.timings))

350 ms ± 7.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

1.04 s ± 20.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

1.18 s ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# with pointer arrays (c++ std vector)
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

1.04 s ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
