In [1]:
# import 
import numpy as np
import random
from proglearn.sims import generate_gaussian_parity
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sc
import sklearn.ensemble
from sklearn import metrics 
import math
from scipy.stats import ttest_ind

In [2]:
# Seed NumPy's global RNG so Restart & Run All draws the same samples each time.
# NOTE(review): assumes generate_gaussian_parity samples from NumPy's global
# RNG -- confirm against the installed proglearn version.
np.random.seed(0)

# generate XOR train and test data (angle_params=0, i.e. no rotation)
X_train, y_train = generate_gaussian_parity(100, angle_params=0)
X_test, y_test = generate_gaussian_parity(10, angle_params=0)

# generate R-XOR test data: rotation angle of 45 degrees (pi/4 radians)
X_test_rxor, y_test_rxor = generate_gaussian_parity(10, angle_params=np.pi/4)

In [3]:
# Random forest with 100 trees. random_state is fixed so the fitted trees (and
# therefore the leaf indices / polytopes inspected below) are the same on every run.
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)

In [4]:
# train the forest on the XOR training set, then label the XOR test points
# (fit returns the estimator itself, so the two steps can be chained)
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [5]:
# accuracy on the XOR test set: share of test points whose predicted label equals the true one
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

ACCURACY OF THE MODEL:  0.9


In [6]:
# Predict labels for the R-XOR test points with the same forest, which was
# fitted on the XOR training data only.
y_pred_rxor = clf.predict(X_test_rxor)

In [7]:
# accuracy on the R-XOR test set: share of rotated test points whose predicted label equals the true one
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_true=y_test_rxor, y_pred=y_pred_rxor))

ACCURACY OF THE MODEL:  0.7


In [8]:
# XOR: for each test point and each tree in the forest, get the index of the
# leaf the point ends up in. The result has one row per test point and one
# column per tree (100 columns, since n_estimators = 100).
# NOTE(review): this name is spelled `leaf_indx_xor`, while the R-XOR
# counterpart is `leaf_idx_rxor`.
leaf_indx_xor = clf.apply(X_test)

In [9]:
# R-XOR: leaf index reached by each rotated test point in each tree
# (one row per test point, one column per tree).
leaf_idx_rxor = clf.apply(X_test_rxor)

In [10]:
# XOR: predicted class probabilities for each test point
# (one row per test point, one column per class).
probas_xor = clf.predict_proba(X_test)

In [11]:
# show the XOR class-probability matrix
probas_xor

array([[0.84, 0.16],
       [0.02, 0.98],
       [0.06, 0.94],
       [0.15, 0.85],
       [0.09, 0.91],
       [0.95, 0.05],
       [0.11, 0.89],
       [0.75, 0.25],
       [0.64, 0.36],
       [0.79, 0.21]])

In [12]:
# R-XOR: predicted class probabilities for each rotated test point
# (one row per test point, one column per class).
probas_rxor = clf.predict_proba(X_test_rxor)

In [13]:
# show the R-XOR class-probability matrix
probas_rxor

array([[0.13, 0.87],
       [0.22, 0.78],
       [0.71, 0.29],
       [0.63, 0.37],
       [0.62, 0.38],
       [0.64, 0.36],
       [0.83, 0.17],
       [0.14, 0.86],
       [0.73, 0.27],
       [0.72, 0.28]])

In [14]:
# Two-sample t-test comparing XOR vs R-XOR predicted probabilities, computed
# separately for each column (ttest_ind works along axis 0 by default).
# The two columns of each matrix sum to 1 per row, so the second test mirrors
# the first: same p-value, statistic with the opposite sign.
ttest_ind(probas_xor, probas_rxor)

Ttest_indResult(statistic=array([-0.65874711,  0.65874711]), pvalue=array([0.51839675, 0.51839675]))

## Extract polytopes per label for X_test and X_test_rxor

In [26]:
def getPolytopes(X, y, forest=None):
    """Group samples by label and find the distinct polytopes each label occupies.

    A sample's polytope is identified by the row of leaf indices it reaches,
    one per tree in the forest; two samples share a polytope only if they land
    in the same leaf of every tree.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to push through the forest.
    y : array-like of shape (n_samples,)
        Label of each sample in ``X``.
    forest : fitted forest, optional
        Object whose ``apply(X)`` returns an (n_samples, n_estimators) array
        of leaf indices, e.g. a fitted ``RandomForestClassifier``. Defaults to
        the notebook-level ``clf`` so existing two-argument calls keep working.

    Returns
    -------
    polytope_label_dict : dict
        label -> 2-D array of the unique leaf-index rows (one row per polytope).
    total_polytope_label_dict : dict
        label -> number of unique polytopes for that label.
    total_sample_label_dict : dict
        label -> number of samples carrying that label.

    Notes
    -----
    Prints the polytope count and the sample count for every label.
    """
    if forest is None:
        # fall back to the forest fitted earlier in the notebook
        forest = clf

    X = np.asarray(X)
    y = np.asarray(y)

    # dict to store polytopes per label
    polytope_label_dict = {}

    # dict to store total number of polytopes per label
    total_polytope_label_dict = {}

    # dict to store total samples per label
    total_sample_label_dict = {}

    for label in np.unique(y):
        # samples that carry this label
        X_ = X[np.where(y == label)[0]]

        # forest-level apply gives an n_samples x n_estimators array of leaf ids
        predicted_leaf_ids_across_trees = forest.apply(X_)

        # each distinct row is one polytope; np.unique returns them sorted
        polytopes = np.unique(predicted_leaf_ids_across_trees, axis=0)

        # add the polytopes at each label to the dictionary
        polytope_label_dict[label] = polytopes

        total_polytopes_this_label = len(polytopes)
        total_polytope_label_dict[label] = total_polytopes_this_label
        print('total polytopes at label', label, ':', total_polytopes_this_label)

        total_samples_this_label = X_.shape[0]
        total_sample_label_dict[label] = total_samples_this_label
        print('total samples at label', label, ':', total_samples_this_label)

    return polytope_label_dict, total_polytope_label_dict, total_sample_label_dict

In [27]:
# XOR test set: per-label polytopes, number of polytopes per label, and number
# of samples per label (three dicts keyed by label).
polytope_label_dict, total_polytope_label_dict, total_sample_label_dict = getPolytopes(X_test, y_test)

total polytopes at label 0 : 4
total samples at label 0 : 4
total polytopes at label 1 : 6
total samples at label 1 : 6


In [28]:
# XOR: label -> array of unique polytopes; each row lists the leaf index
# reached in every tree.
# NOTE(review): this prints every row in full; showing only the array shapes
# would keep the output short.
polytope_label_dict

{0: array([[ 1, 12, 25,  8, 36,  3,  8,  2, 12, 18, 29,  6, 28, 41,  8,  3,
          3, 16,  7,  9, 29,  8,  7,  3,  5,  4,  5, 28, 28, 19, 38, 25,
         30, 18,  5, 37,  3, 18, 23,  3,  5,  1, 31,  7,  3, 17, 23,  3,
          5,  3, 19, 33,  5,  3, 39,  3,  4,  4, 23, 22, 12,  5,  5,  1,
         10, 19,  3,  7, 24,  3, 20, 29, 30,  4,  5,  2,  9, 15, 11,  7,
          4,  1,  8, 11,  5, 10,  1,  3, 27, 13, 16,  2, 17, 38,  3,  4,
          8, 10,  6, 35],
        [32,  4, 14, 30, 27, 14, 38, 31,  7, 10, 20, 13,  9, 17, 22, 18,
         13, 24, 29, 31, 34, 24, 25, 11, 43, 19, 27, 17, 11, 11, 14, 12,
         26, 13, 25,  9, 25,  8, 12, 24, 17, 25,  4, 11, 12,  8, 12,  9,
         19, 24, 13, 22, 25, 23, 14, 25, 18, 15, 13, 13, 30, 16, 31, 21,
         23,  6,  9, 43, 21, 15,  5, 18, 26, 19, 25, 23, 23, 30, 36, 24,
         30, 26, 27, 22, 30, 28, 32, 48, 14,  5,  6, 25, 13, 30, 11, 34,
         30, 16, 33, 17],
        [38, 21, 20, 30, 33, 22, 46, 31, 29, 27, 42, 45, 33, 30, 30, 

In [29]:
# XOR: label -> number of unique polytopes
total_polytope_label_dict

{0: 4, 1: 6}

In [30]:
# XOR: label -> number of test samples with that label
total_sample_label_dict

{0: 4, 1: 6}

## Get polytope stats for R-XOR

In [31]:
# R-XOR test set: per-label polytopes, number of polytopes per label, and
# number of samples per label (three dicts keyed by label).
polytope_label_dict_rxor, total_polytope_label_dict_rxor, total_sample_label_dict_rxor = getPolytopes(X_test_rxor, y_test_rxor)

total polytopes at label 0 : 8
total samples at label 0 : 8
total polytopes at label 1 : 2
total samples at label 1 : 2


In [32]:
# R-XOR: label -> array of unique polytopes; each row lists the leaf index
# reached in every tree.
# NOTE(review): this prints every row in full; showing only the array shapes
# would keep the output short.
polytope_label_dict_rxor

{0: array([[ 1, 11,  3,  7,  7,  2,  8,  2, 11, 17, 22,  4, 22, 23,  6,  2,
          2,  4,  5,  8, 16,  7,  6,  2,  4,  2,  4, 27, 15, 18, 23, 17,
          8, 17,  4, 14,  2, 13, 21,  2,  5,  1, 14,  5,  2, 15,  4,  2,
          4,  2,  3, 11,  5,  2, 27,  2,  3,  4, 17, 21,  9,  4,  4,  1,
          8,  3,  2,  4,  8,  2, 19,  7, 10,  3,  4,  2,  8, 10, 11,  4,
          3,  1,  4,  6,  5,  9,  1,  2, 11, 12, 12,  2,  9, 17,  3,  3,
          7,  7,  6, 22],
        [ 1, 12,  4,  8, 14,  2,  8,  2, 11, 17, 29,  4, 24, 23,  6,  3,
          3, 16,  6,  9, 24,  8,  7,  2,  5,  2,  5, 28, 15, 19, 23, 23,
          9, 17,  4, 15,  3, 13, 23,  2,  5,  1, 17,  7,  2, 15,  5,  3,
          5,  2, 17, 11,  5,  2, 35,  3,  3,  4, 23, 21,  9,  5,  4,  1,
         10,  3,  3,  4, 11,  3, 19,  7, 11,  4,  5,  2,  9, 15, 11,  4,
          4,  1,  5,  6,  5, 10,  1,  3, 11, 13, 14,  2, 11, 21,  3,  4,
          7, 10,  6, 31],
        [ 1, 12,  4,  8, 16,  2,  8,  2, 11, 18, 29,  4, 24, 23,  8, 

In [33]:
# R-XOR: label -> number of unique polytopes
total_polytope_label_dict_rxor

{0: 8, 1: 2}

In [34]:
# R-XOR: label -> number of test samples with that label
total_sample_label_dict_rxor

{0: 8, 1: 2}