In [1]:
import numpy as np
from scipy.stats import multiscale_graphcorr
from sklearn.preprocessing import OneHotEncoder
from scipy._lib._util import MapWrapper

In [136]:
def k_sample_test(X, y,score_func):
    """Return the MGC k-sample statistic for the rows of ``X`` grouped by ``y``.

    The rows of ``X`` are regrouped so that rows sharing a label are
    contiguous (labels in sorted order, original row order kept inside each
    group), the labels are one-hot encoded, and the MGC statistic between the
    regrouped data and the one-hot matrix is returned.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features) or (n_samples,)
        Data matrix. A flat vector is treated as a single feature column, so
        callers may pass ``X[:, j]`` directly.
    y : array-like, length n_samples
        Group label of every row of ``X``.
    score_func : str
        Name of the test to run. Currently ignored: every value runs MGC,
        because it is the only independence test this notebook takes from scipy.

    Returns
    -------
    float
        The ``stat`` attribute of the ``multiscale_graphcorr`` result.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not describe the same number of samples.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        # A single feature handed over as a flat vector becomes one column;
        # the row/column indexing below needs a 2-D matrix.
        X = X.reshape(-1, 1)
    y = np.asarray(y).ravel()
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "X and y must have the same number of samples, got "
            f"{X.shape[0]} and {y.shape[0]}"
        )

    # codes[i] is the position of y[i] among the sorted unique labels.
    labels, codes = np.unique(y, return_inverse=True)
    codes = codes.ravel()
    # A stable sort keeps the original row order inside each group, which
    # matches stacking the per-label sub-matrices one after another.
    order = np.argsort(codes, kind="stable")
    X_grouped = X[order]

    one_hot = np.zeros((y.shape[0], labels.shape[0]))
    one_hot[np.arange(y.shape[0]), codes[order]] = 1.0

    stat = multiscale_graphcorr(X_grouped, one_hot).stat
    # The notebook's comparison cells show this printed value in their output.
    print(stat)
    return stat

In [33]:
def fit_transform(X,y,k,score_func):
    """Greedily select ``k`` feature columns of ``X`` by forward selection.

    Each round scores every not-yet-selected feature together with the
    features already chosen, using ``k_sample_test``, and keeps the candidate
    whose group scores highest.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data matrix.
    y : array-like, length n_samples
        Group labels, forwarded unchanged to ``k_sample_test``.
    k : int
        Number of features to select, ``0 <= k <= n_features``.
    score_func : str
        Forwarded unchanged to ``k_sample_test``.

    Returns
    -------
    ndarray, shape (n_samples, k)
        The selected columns of ``X``, in the order they were selected.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``[0, n_features]``.
    """
    X_new = np.asarray(X)
    if X_new.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_features = X_new.shape[1]
    if not 0 <= k <= n_features:
        raise ValueError(f"k must be between 0 and {n_features}, got {k}")

    best_features = []
    remaining = list(range(n_features))  # candidate feature indices still available
    while len(best_features) < k:
        scores = []
        for j in remaining:
            # Build a fresh list per candidate so the running selection is
            # never mutated; the list index keeps the matrix 2-D even for a
            # single column.
            columns = sorted(best_features + [j])
            scores.append(k_sample_test(X_new[:, columns], y, score_func))
        # scores is aligned with `remaining`, so map the winning position
        # back to the real feature index.
        best = remaining[int(np.argmax(scores))]
        best_features.append(best)
        remaining.remove(best)
    return X_new[:, best_features]

In [48]:
def fit_transform_p(X,y,k,score_func,workers = -1):
    """Parallel variant of greedy forward feature selection.

    Each round maps a ``_Parallel`` scorer over every not-yet-selected feature
    index and keeps the feature with the highest score.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data matrix.
    y : array-like, length n_samples
        Group labels, handed to the ``_Parallel`` scorer.
    k : int
        Number of features to select, ``0 <= k <= n_features``.
    score_func : str
        Handed to the ``_Parallel`` scorer.
    workers : int or map-like callable, optional
        Forwarded to scipy's ``MapWrapper``. Per its documentation, ``1`` runs
        serially and ``-1`` (the default) uses all available cores.

    Returns
    -------
    ndarray, shape (n_samples, k)
        The selected columns of ``X``, in the order they were selected.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``[0, n_features]``.
    """
    X_new = np.asarray(X)
    if X_new.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_features = X_new.shape[1]
    if not 0 <= k <= n_features:
        raise ValueError(f"k must be between 0 and {n_features}, got {k}")

    # A real list, because selected indices are removed from it round by round.
    features = list(range(n_features))
    best_features = []
    # One wrapper (and therefore at most one worker pool) serves every round.
    with MapWrapper(workers) as mapwrapper:
        while len(best_features) < k:
            # Hand the helper its own copy so nothing it does to the list can
            # leak back into the running selection.
            parallel = _Parallel(X_new=X_new, y=y, score_func=score_func,
                                 best_features=list(best_features))
            scores = list(mapwrapper(parallel, features))
            # scores is aligned with `features`; keep the index an int so it
            # can be used for column indexing.
            best = features[int(np.argmax(scores))]
            best_features.append(best)
            features.remove(best)
    return X_new[:, best_features]

In [45]:
class _Parallel:
    """Callable helper that scores one candidate feature.

    An instance stores the data, the labels, the test name and the features
    already selected. Calling it with a feature index returns the
    ``k_sample_test`` statistic of the selected features plus that candidate.
    """

    def __init__(self, X_new, y,score_func,best_features):
        self.X_new = X_new
        self.y = y
        self.score_func = score_func
        self.best_features = best_features

    def __call__(self, index):
        """Return the statistic for ``best_features`` extended by ``index``."""
        # Build a new list: appending to self.best_features would leak every
        # candidate into the caller's running selection.
        columns = list(self.best_features) + [index]
        # Indexing with a list keeps X_j 2-D even when only one column is
        # selected, which is the first-feature case.
        X_j = self.X_new[:, columns]
        return k_sample_test(X_j, self.y, self.score_func)

In [167]:
from numpy.testing import assert_almost_equal, assert_raises
from hyppo.ksample import KSample

def k_sample_equals_shuffle(X,y,score_func):
    """Assert that permuting the columns of ``X`` leaves the statistic unchanged.

    The columns of ``X`` are put in a random order (global NumPy RNG), then
    ``k_sample_test`` is evaluated on the original and on the shuffled matrix.
    The two statistics must agree to 8 decimals.
    """
    column_order = np.random.permutation(X.shape[1])
    shuffled = X[:, column_order]
    stat_original = k_sample_test(X, y, score_func)
    stat_shuffled = k_sample_test(shuffled, y, score_func)
    assert_almost_equal(stat_original, stat_shuffled, decimal=8)
# loose check (1 decimal) for any number of groups
def k_sample_equals_k_matrix_K_Sample(X,y,score_func):
    """Assert that ``k_sample_test`` roughly matches hyppo's ``KSample("MGC")``.

    ``X`` is split into one matrix per distinct label in ``y`` (labels taken
    in sorted order, whatever their values are), all of those matrices are
    passed to ``KSample("MGC").test``, and the first element of its result is
    compared with ``k_sample_test(X, y, score_func)`` to 1 decimal.

    Raises
    ------
    AssertionError
        If the two statistics differ beyond the tolerance.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Use the labels that actually occur instead of assuming 0..k-1.
    matrices = [X[np.where(y == label)[0], :] for label in np.unique(y)]
    # Pass every group, not just the first two. Indexing [0] works whether
    # the result carries (stat, pvalue) or (stat, pvalue, extra).
    true_stat = KSample("MGC").test(*matrices)[0]
    assert_almost_equal(k_sample_test(X, y, score_func), true_stat, decimal=1)
 
# strict check (8 decimals) for any number of groups
def k_sample_equals_k_matrix_K_Sample_2(X,y,score_func):
    """Assert that ``k_sample_test`` matches hyppo's ``KSample("MGC")`` statistic.

    ``X`` is split into one matrix per distinct label in ``y`` (sorted label
    order), all of those matrices are passed to ``KSample("MGC").test``, the
    reference statistic is printed, and it is compared with
    ``k_sample_test(X, y, score_func)`` to 8 decimals.

    Raises
    ------
    AssertionError
        If the two statistics differ beyond the tolerance.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    matrices = [X[np.where(y == label)[0], :] for label in np.unique(y)]
    # Unpack every group so datasets with fewer or more than four classes
    # work. Indexing [0] works whether the result carries (stat, pvalue) or
    # (stat, pvalue, extra).
    true_stat = KSample("MGC").test(*matrices)[0]
    print(true_stat)
    # Pass X in its original row order: k_sample_test groups rows by y
    # itself, so re-ordering X here would mismatch rows and labels whenever
    # y is not already sorted.
    assert_almost_equal(k_sample_test(X, y, score_func), true_stat, decimal=8)
    

In [168]:
from hyppo.tools import rot_ksamp
# 1 feature ("step" simulation, seeded, noise switched off)
np.random.seed(123456789)
X, y, z, w = rot_ksamp("step", 100, p=1, k=4, degree=[90, 180, 60], noise=False)
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

0.4367732824159251
0.43673638127084224


AssertionError: 
Arrays are not almost equal to 8 decimals
 ACTUAL: 0.43673638127084224
 DESIRED: 0.4367732824159251

In [162]:
from hyppo.tools import rot_ksamp
# 3 features ("step" simulation)
X, y, z, w = rot_ksamp("step", 100, p=3, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

KeyboardInterrupt: 

In [156]:
from hyppo.tools import rot_ksamp
# 3 features ("step" simulation)
X, y, z, w = rot_ksamp("step", 100, p=3, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.0029437166179376827
-0.0029437166179380266


In [142]:
from hyppo.tools import rot_ksamp
# 10 features ("step" simulation)
X, y, z, w = rot_ksamp("step", 100, p=10, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.014673715675868006
-0.014673715675868768


In [143]:
from hyppo.tools import rot_ksamp
# 1 feature ("joint_normal" simulation)
X, y, z, w = rot_ksamp("joint_normal", 100, p=1, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

0.021249720181672108
0.0212494872825279


AssertionError: 
Arrays are not almost equal to 8 decimals
 ACTUAL: 0.0212494872825279
 DESIRED: 0.021249720181672108

In [144]:
from hyppo.tools import rot_ksamp
# 3 features ("joint_normal" simulation)
X, y, z, w = rot_ksamp("joint_normal", 100, p=3, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.01133729553684774
-0.011337295536847564


In [145]:
from hyppo.tools import rot_ksamp
# 10 features ("joint_normal" simulation)
X, y, z, w = rot_ksamp("joint_normal", 100, p=10, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.024555340604163197
-0.024555340604162704


In [146]:
from hyppo.tools import rot_ksamp
# 1 feature ("linear" simulation)
X, y, z, w = rot_ksamp("linear", 100, p=1, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

0.09272154581419821
0.09269377557127946


AssertionError: 
Arrays are not almost equal to 8 decimals
 ACTUAL: 0.09269377557127946
 DESIRED: 0.09272154581419821

In [147]:
from hyppo.tools import rot_ksamp
# 3 features ("linear" simulation)
X, y, z, w = rot_ksamp("linear", 100, p=3, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

0.02051560417538855
0.020528759567557953


AssertionError: 
Arrays are not almost equal to 8 decimals
 ACTUAL: 0.020528759567557953
 DESIRED: 0.02051560417538855

In [148]:
from hyppo.tools import rot_ksamp
# 10 features ("linear" simulation)
X, y, z, w = rot_ksamp("linear", 100, p=10, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.014229205680157937
-0.014229205680158336


In [149]:
 from hyppo.tools import rot_ksamp
# 1 feature ("spiral" simulation)
X, y, z, w = rot_ksamp("spiral", 100, p=1, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

0.04374273632886501
0.043742736328865


In [151]:
 from hyppo.tools import rot_ksamp
# 2 features ("spiral" simulation)
X, y, z, w = rot_ksamp("spiral", 100, p=2, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

0.11057447867645621
0.11058383086975929


AssertionError: 
Arrays are not almost equal to 8 decimals
 ACTUAL: 0.11058383086975929
 DESIRED: 0.11057447867645621

In [150]:
 from hyppo.tools import rot_ksamp
# 5 features ("spiral" simulation)
X, y, z, w = rot_ksamp("spiral", 100, p=5, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

0.042269097954361165
0.042503453160113


AssertionError: 
Arrays are not almost equal to 8 decimals
 ACTUAL: 0.042503453160113
 DESIRED: 0.042269097954361165

In [152]:
from hyppo.tools import rot_ksamp
# 10 features ("spiral" simulation)
X, y, z, w = rot_ksamp("spiral", 100, p=10, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.007395186167755928
-0.007395186167756051


In [153]:
from hyppo.tools import rot_ksamp
# 1 feature ("square" simulation)
X, y, z, w = rot_ksamp("square", 100, p=1, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.0033869681799878334
-0.003386968179987396


In [154]:
from hyppo.tools import rot_ksamp
# 3 features ("square" simulation)
X, y, z, w = rot_ksamp("square", 100, p=3, k=4, degree=[90, 180, 60])
# One float label vector per sample: 0.0 for X, 1.0 for y, 2.0 for z, 3.0 for w
zeros, ones, twos, threes = (
    np.full(sample.shape[0], float(label)) for label, sample in enumerate((X, y, z, w))
)
X = np.concatenate([X, y, z, w])  # join the four samples along axis 0
y = np.concatenate([zeros, ones, twos, threes])  # from here on y holds the labels
k_sample_equals_k_matrix_K_Sample_2(X, y, "mgc")

-0.010204751829171847
-0.01020475182917201


In [159]:
from sklearn.datasets import load_wine
# Wine data: 13 features and 3 classes per scikit-learn's load_wine docs
# (the earlier "9 features" note did not match the dataset).
# Compare k_sample_test against hyppo's KSample on the real labels.
X, y = load_wine(return_X_y=True)
k_sample_equals_k_matrix_K_Sample_2(X,y,"mgc")

0.5632754151709067
0.563275415170903


In [160]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Iris data: 4 features. Compare k_sample_test against hyppo's KSample on the real labels.
# NOTE(review): the recorded run of this cell ends in an AssertionError, with the
# two statistics differing at the 3rd decimal; the cause is not visible here.
X, y = load_iris(return_X_y=True)
k_sample_equals_k_matrix_K_Sample_2(X,y,"mgc")


0.8129670949672501
0.8143480944918309


AssertionError: 
Arrays are not almost equal to 8 decimals
 ACTUAL: 0.8143480944918309
 DESIRED: 0.8129670949672501