In [18]:
# import 
import numpy as np
import random
from proglearn.sims import generate_gaussian_parity
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sc
import sklearn.ensemble
from sklearn import metrics 
import math
from scipy.stats import ttest_ind
from scipy.spatial import distance
import random
import pandas as pd

In [2]:
# number of samples drawn for each task
n_samples = 100

# Seed NumPy's global RNG once so that Restart & Run All regenerates the same
# data (and, while the forests keep random_state=None, the same forests).
# NOTE(review): assumes generate_gaussian_parity draws from NumPy's global
# RNG -- confirm against the proglearn implementation.
SEED = 0
np.random.seed(SEED)

# generate xor (angle 0)
X_xor, y_xor = generate_gaussian_parity(n_samples, angle_params=0)

# generate rxor, 45 degrees (pi/4 radians)
X_rxor, y_rxor = generate_gaussian_parity(n_samples, angle_params=np.pi/4)

In [3]:
# Split each task's samples by class so the training set can take the same
# number of samples from class 0 and from class 1.
X_xor_0 = X_xor[y_xor == 0]
X_xor_1 = X_xor[y_xor == 1]

X_rxor_0 = X_rxor[y_rxor == 0]
X_rxor_1 = X_rxor[y_rxor == 1]

# The first 35 samples of each class go to training (70 in total when both
# classes have at least 35 samples); the rest is held out for prediction.
n_train_per_class = 35


def _stack_train(X_class0, X_class1, n_per_class):
    """Stack the first n_per_class rows of each class and build matching labels.

    The labels are sized from the rows actually taken, so X and y stay aligned
    even if a class holds fewer than n_per_class samples (nothing visible in
    this notebook guarantees at least 35 samples per class).
    """
    head0 = X_class0[:n_per_class]
    head1 = X_class1[:n_per_class]
    X_train = np.concatenate((head0, head1))
    y_train = np.concatenate((np.zeros(len(head0)), np.ones(len(head1))))
    return X_train, y_train


# xor training set
X_xor_train, y_xor_train = _stack_train(X_xor_0, X_xor_1, n_train_per_class)

# repeat for rxor
X_rxor_train, y_rxor_train = _stack_train(X_rxor_0, X_rxor_1, n_train_per_class)

In [4]:
# sanity check: size of the xor and of the rxor training set
len(X_xor_train), len(X_rxor_train)

(70, 70)

In [5]:
# Build the two forests (xor first, rxor second) from one shared configuration:
# 10 trees, each leaf required to hold at least int(n_samples/7) samples.
clf_xor, clf_rxor = (
    sklearn.ensemble.RandomForestClassifier(
        n_estimators=10,
        min_samples_leaf=int(n_samples/7),
    )
    for _ in range(2)
)

In [6]:
# train rfs
# fit the xor forest on the xor training set
clf_xor.fit(X_xor_train, y_xor_train)

# fit the rxor forest on the rxor training set; being the cell's last
# expression, its return value is what the notebook displays
clf_rxor.fit(X_rxor_train, y_rxor_train)

RandomForestClassifier(min_samples_leaf=14, n_estimators=10)

In [7]:
# Held-out samples: everything after the first 35 rows of each class
# (the first 35 rows per class were used for training).
# xor test
X_xor_test = np.concatenate([rows[35:] for rows in (X_xor_0, X_xor_1)], axis=0)

# rxor test
X_rxor_test = np.concatenate([rows[35:] for rows in (X_rxor_0, X_rxor_1)], axis=0)

# class probabilities of the xor test samples under the xor forest
xor_proba_xorRF = clf_xor.predict_proba(X_xor_test)

# TODO (original author's note): this is wrong and needs to be changed
# rxor_proba_xorRF = clf_xor.predict_proba(X_rxor_test)

# class probabilities of the xor test samples under the rxor forest
xor_proba_rxorRF = clf_rxor.predict_proba(X_xor_test)

# TODO (original author's note): this is wrong and needs to be changed
# rxor_proba_rxorRF = clf_rxor.predict_proba(X_rxor_test)

In [10]:
# shape of the rxor training matrix (rows = samples, columns = features)
X_rxor_train.shape

(70, 2)

In [101]:
# push the rxor training samples through the xor forest to get the leaf each
# sample lands in (one leaf index per estimator)
rxorSamples_xorRf_leaves = clf_xor.apply(X_rxor_train)

# push the xor training samples through the rxor forest in the same way
xorSamples_rxorRf_leaves = clf_rxor.apply(X_xor_train)

In [102]:
# convert both leaf-index arrays to DataFrames (percentLabels iterates over
# the columns, treating each column as one estimator)
# rxor train samples pushed through xor rf
df_rxorSamples_xorRf_leaves = pd.DataFrame(rxorSamples_xorRf_leaves)

# xor train samples pushed through rxor rf
df_xorSamples_rxorRf_leaves = pd.DataFrame(xorSamples_rxorRf_leaves)

In [94]:
# function to get the fraction of samples with each label at each polytope/leaf (xor or rxor)
def percentLabels(leaves, true_labels):
    '''
    Compute, for every estimator, the label fractions of the samples in each leaf.

    leaves = DataFrame (or 2-D array-like) of leaf ids: one row per sample,
             one column per estimator
    true_labels = true labels from generate_gaussian_parity(), aligned by
                  position with the rows of leaves. A label equal to 0 counts
                  as label 0; any other value counts as label 1.
    Returns dictionary (keys = estimator) of dictionaries (keys = unique leaf
    id in estimator, value = [fraction of samples with label 0, fraction of
    samples with label 1] among the samples that reached that leaf).
    '''
    # Normalise the inputs so that everything below is purely positional:
    # a raw 2-D array of leaf ids works as well as a DataFrame, and a pandas
    # Series of labels is read by position instead of by index label.
    leaves = pd.DataFrame(leaves)
    labels = np.asarray(true_labels)

    # estimator -> {leaf id -> [fraction label 0, fraction label 1]}
    perc_labels_at_poly = {}

    # each column holds the leaf ids of one estimator
    for col in leaves:
        col_leaves = leaves[col].to_numpy()
        fractions_by_leaf = {}

        for uni in np.unique(col_leaves):
            # positions of the samples that landed in this polytope/leaf;
            # never empty because uni comes from the column itself
            idx = np.flatnonzero(col_leaves == uni)
            total_samples_at_poly = idx.size

            # count label 0; every other label is counted as label 1
            num_0 = np.count_nonzero(labels[idx] == 0)
            num_1 = total_samples_at_poly - num_0

            fractions_by_leaf[uni] = [num_0 / total_samples_at_poly,
                                      num_1 / total_samples_at_poly]

        perc_labels_at_poly[col] = fractions_by_leaf

    return perc_labels_at_poly

In [121]:
# for each unique leaf of each estimator, get the fraction of samples with label 0 and with label 1
# rxor training samples pushed through the xor rf, with their rxor labels
rxorSamples_xorRf_label_percs = percentLabels(df_rxorSamples_xorRf_leaves, y_rxor_train)

# xor training samples pushed through the rxor rf, with their xor labels
xorSamples_rxorRf_leaves_percs = percentLabels(df_xorSamples_rxorRf_leaves, y_xor_train)

In [96]:
# inspect: {estimator: {leaf id: [fraction label 0, fraction label 1]}} for rxor samples in the xor rf
rxorSamples_xorRf_label_percs

{0: {1: [0.85, 0.15],
  3: [0.08823529411764706, 0.9117647058823529],
  4: [0.9375, 0.0625]},
 1: {1: [0.13636363636363635, 0.8636363636363636],
  2: [0.6666666666666666, 0.3333333333333333]},
 2: {1: [0.38461538461538464, 0.6153846153846154],
  2: [0.8333333333333334, 0.16666666666666666]},
 3: {2: [0.6666666666666666, 0.3333333333333333],
  3: [0.5172413793103449, 0.4827586206896552],
  4: [0.23529411764705882, 0.7647058823529411]},
 4: {1: [0.13636363636363635, 0.8636363636363636],
  2: [0.6666666666666666, 0.3333333333333333]},
 5: {1: [0.4166666666666667, 0.5833333333333334],
  2: [0.6818181818181818, 0.3181818181818182]},
 6: {1: [0.20833333333333334, 0.7916666666666666],
  2: [0.6521739130434783, 0.34782608695652173]},
 7: {1: [0.09523809523809523, 0.9047619047619048],
  2: [0.673469387755102, 0.32653061224489793]},
 8: {1: [0.05263157894736842, 0.9473684210526315],
  3: [0.6896551724137931, 0.3103448275862069],
  4: [0.6363636363636364, 0.36363636363636365]},
 9: {1: [0.1363636

In [122]:
# inspect: {estimator: {leaf id: [fraction label 0, fraction label 1]}} for xor samples in the rxor rf
xorSamples_rxorRf_leaves_percs

{0: {2: [0.4444444444444444, 0.5555555555555556],
  3: [0.4, 0.6],
  4: [0.6071428571428571, 0.39285714285714285]},
 1: {1: [0.4444444444444444, 0.5555555555555556],
  2: [0.5348837209302325, 0.46511627906976744]},
 2: {1: [0.625, 0.375],
  3: [0.7058823529411765, 0.29411764705882354],
  4: [0.14285714285714285, 0.8571428571428571]},
 3: {1: [0.4444444444444444, 0.5555555555555556],
  3: [0.4444444444444444, 0.5555555555555556],
  4: [0.5588235294117647, 0.4411764705882353]},
 4: {1: [0.625, 0.375],
  3: [0.7058823529411765, 0.29411764705882354],
  4: [0.14285714285714285, 0.8571428571428571]},
 5: {1: [0.625, 0.375],
  3: [0.7058823529411765, 0.29411764705882354],
  4: [0.14285714285714285, 0.8571428571428571]},
 6: {1: [0.6111111111111112, 0.3888888888888889],
  2: [0.38235294117647056, 0.6176470588235294]},
 7: {1: [0.625, 0.375],
  3: [0.7058823529411765, 0.29411764705882354],
  4: [0.14285714285714285, 0.8571428571428571]},
 8: {2: [0.6285714285714286, 0.37142857142857144],
  3: [

In [None]:
# # gets first item in each sublist 
# def Extract(lst):
#     return list(next(zip(*lst)))

In [118]:
# push the held-out rxor samples through the xor rf to get their leaf indices
rxor_test_xorRF_leaves = clf_xor.apply(X_rxor_test)

# push the held-out xor samples through the rxor rf to get their leaf indices
xor_test_rxorRF_leaves = clf_rxor.apply(X_xor_test)

In [119]:
def predict(test_leaves, proba_dict):
    '''
    Predict a label for each test sample by majority vote over the estimators.

    test_leaves = leaf indices of the test data: one row per sample, one
                  entry per estimator, in the same order as proba_dict's keys
    proba_dict = output from percentLabels:
                 {estimator: {leaf id: [fraction label 0, fraction label 1]}}
    Returns a list with one entry per sample: 0 or 1 for the winning vote,
    or 3 when the vote is tied (including when no estimator could vote).

    An estimator votes 0 when its leaf's label-0 fraction is strictly greater
    than the label-1 fraction, and 1 otherwise. When a sample lands in a leaf
    that has no entry in proba_dict (no reference sample reached it), that
    estimator abstains instead of raising KeyError.
    '''
    # list to hold the predicted labels
    labels = []

    # estimator keys are paired positionally with each sample's leaf indices
    estimators = list(proba_dict)

    # iterate through each sample
    for sample_nodes in test_leaves:
        # count how many estimators vote 0 and how many vote 1
        zeros = 0
        ones = 0

        # iterate through each estimator for this sample
        for sample_node_at_est, est in zip(sample_nodes, estimators):
            leaf_fractions = proba_dict[est].get(sample_node_at_est)

            # no label statistics for this leaf: the estimator abstains
            if leaf_fractions is None:
                continue

            # vote for whichever label has the greater fraction (ties vote 1)
            if leaf_fractions[0] > leaf_fractions[1]:
                zeros += 1
            else:
                ones += 1

        # classify the sample as 0 or 1 based on all estimators
        # 3 marks a tied vote (zeros == ones), used for testing
        if zeros > ones:
            labels.append(0)
        elif ones > zeros:
            labels.append(1)
        else:
            labels.append(3)

    return labels

In [123]:
# predict labels for the held-out rxor samples from their leaves in the xor rf
y_rxor_test_samples_xorRF = predict(rxor_test_xorRF_leaves, rxorSamples_xorRf_label_percs)

# predict labels for the held-out xor samples from their leaves in the rxor rf
y_xor_test_samples_rxorRF = predict(xor_test_rxorRF_leaves, xorSamples_rxorRf_leaves_percs)

In [125]:
# predicted labels for the rxor test samples and how many there are (3 would mark a tied vote)
y_rxor_test_samples_xorRF, len(y_rxor_test_samples_xorRF)

([0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  0],
 30)

In [126]:
# predicted labels for the xor test samples and how many there are (3 marks a tied vote)
y_xor_test_samples_rxorRF, len(y_xor_test_samples_rxorRF)

([1,
  1,
  0,
  0,
  1,
  0,
  3,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0],
 30)