In [1]:
# import 
import numpy as np
import random
from proglearn.sims import generate_gaussian_parity
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sc
import sklearn.ensemble
from sklearn import metrics 
import math
from scipy.stats import ttest_ind
from scipy.spatial import distance

In [2]:
# number of samples
n_samples = 100

# generate xor 
X_xor, y_xor = generate_gaussian_parity(n_samples, angle_params=0)

# generate rxor, 45 degrees
X_rxor, y_rxor = generate_gaussian_parity(n_samples, angle_params=np.pi/4)

In [3]:
# xor rf
clf_xor = sklearn.ensemble.RandomForestClassifier(n_estimators=1, min_samples_leaf=int(n_samples/10))

# rxor rf
clf_rxor = sklearn.ensemble.RandomForestClassifier(n_estimators=1, min_samples_leaf=int(n_samples/10))

In [4]:
# fit the model using the train data 
clf_xor.fit(X_xor, y_xor)

# fit rxor model
clf_rxor.fit(X_rxor, y_rxor)

RandomForestClassifier(min_samples_leaf=10, n_estimators=1)

In [5]:
# index of each array that each sample ends up in
xor_leaves_xor_rf = clf_xor.apply(X_xor)
rxor_leaves_xor_rf = clf_xor.apply(X_rxor)

# repeat for rxor rf
xor_leaves_rxor_rf = clf_rxor.apply(X_xor)
rxor_leaves_rxor_rf = clf_rxor.apply(X_rxor)

In [6]:
# function get % of samples at each label for each polytope (xor or rxor)
def percentLabels(leaves, true_labels):
    # dict to hold the %'s 
    # polytope as key, value = [% samples at this polytope with label 0, % samples at this polytope with label 1]
    perc_labels_at_poly = {}
    
    for uni in np.unique(leaves):
        # idx of each unique polytope 
        poly_to_test = np.where(leaves == uni)[0]
        num_0 = 0
        num_1 = 0

        # sum the number of each label at each poly/leaf
        for i in poly_to_test:
            if true_labels[i] == 0: 
                num_0+=1
            else:
                num_1+=1

        # calc % of each label at each polytope/leaf
        total_samples_at_poly = num_0 + num_1
        perc_0 = num_0 / total_samples_at_poly
        perc_1 = num_1 / total_samples_at_poly

        perc_labels_at_poly[uni] = [perc_0, perc_1]
        
    return perc_labels_at_poly

In [7]:
# get % of samples in each label for each polytope for xor rf
perc_labels_at_poly_xor_xor_rf = percentLabels(xor_leaves_xor_rf, y_xor)
perc_labels_at_poly_rxor_xor_rf = percentLabels(rxor_leaves_xor_rf, y_rxor)

# repeat for rxor rf
perc_labels_at_poly_xor_rxor_rf = percentLabels(xor_leaves_rxor_rf, y_xor)
perc_labels_at_poly_rxor_rxor_rf = percentLabels(rxor_leaves_rxor_rf, y_rxor)

In [8]:
# calc l2 distance using the first percent for each polytope xor vs rxor 
def calcL2(xor_poly_percs, rxor_poly_percs):
    xor_label0_percs = []
    rxor_label0_percs = []
    
    # get each label 0 percent into a list for l2 calculation
    for key in xor_poly_percs.keys():
        xor_label0_percs.append(xor_poly_percs[key][0])
        rxor_label0_percs.append(rxor_poly_percs[key][0])
        
    
    return distance.euclidean(xor_label0_percs, rxor_label0_percs)

In [11]:
# calc l2's
xor_rf_l2 = calcL2(perc_labels_at_poly_xor_xor_rf, perc_labels_at_poly_rxor_xor_rf)
rxor_rf_l2 = calcL2(perc_labels_at_poly_xor_rxor_rf, perc_labels_at_poly_rxor_rxor_rf)

In [12]:
print('l2 distance between xor and rxor with xor rf:', xor_rf_l2)
print('l2 distance between xor and rxor with rxor rf', rxor_rf_l2)

l2 distance between xor and rxor with xor rf: 1.4053042857359253
l2 distance between xor and rxor with rxor rf 1.1062146160273891
