In [47]:
import numpy as np
from scipy.stats import multiscale_graphcorr
from sklearn.preprocessing import OneHotEncoder
from scipy._lib._util import MapWrapper

In [9]:
def k_sample_test(X, y, score_func):
    """Compute a k-sample test statistic for ``X`` grouped by the labels ``y``.

    The class labels are one-hot encoded, and the k-sample problem is then
    scored as an independence test between ``X`` and the encoded labels.

    Parameters
    ----------
    X : ndarray
        Sample data, passed unchanged to ``multiscale_graphcorr``.
    y : array-like
        Class label of every sample. Any array-like is accepted; it is
        converted with ``np.asarray`` and flattened into a single column
        before encoding.
    score_func : str
        Name of the test to run. Only ``"mgc"`` is implemented, and every
        other value falls back to MGC as well because no alternative test is
        wired in.

    Returns
    -------
    The ``stat`` attribute of the ``multiscale_graphcorr`` result.
    """
    # OneHotEncoder expects a 2-D column; np.asarray lets lists and other
    # array-likes without a ``reshape`` method through.
    label_column = np.asarray(y).reshape(-1, 1)
    y_one_hot = OneHotEncoder().fit_transform(label_column).toarray()
    # The "mgc" branch and the fallback branch were identical, so a single
    # call covers every value of score_func.
    return multiscale_graphcorr(X, y_one_hot).stat

In [33]:
def fit_transform(X, y, k, score_func):
    """Greedily select ``k`` feature columns of ``X`` by forward selection.

    Each round scores every not-yet-selected feature together with the
    features already chosen (via ``k_sample_test``) and keeps the candidate
    whose group has the highest statistic.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix; converted with ``np.asarray``.
    y : array-like
        Class labels, forwarded to ``k_sample_test``.
    k : int
        Number of features to select, ``0 <= k <= n_features``.
    score_func : str
        Test name forwarded to ``k_sample_test``.

    Returns
    -------
    ndarray of shape (n_samples, k)
        The selected columns of ``X`` in the order they were chosen.

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of features.
    """
    X_arr = np.asarray(X)
    labels = np.asarray(y)
    n_features = X_arr.shape[1]
    if not 0 <= k <= n_features:
        raise ValueError(
            "k must be between 0 and the number of features "
            "(%d); got %r" % (n_features, k)
        )

    selected = []
    remaining = list(range(n_features))
    while len(selected) < k:
        # ``selected + [j]`` builds a new list for every candidate, so the
        # running selection is never mutated while candidates are scored.
        scores = [
            k_sample_test(X_arr[:, selected + [j]], labels, score_func)
            for j in remaining
        ]
        # scores is aligned with ``remaining``, so translate the winning
        # position back into a real feature index.
        best = remaining[int(np.argmax(scores))]
        selected.append(best)
        remaining.remove(best)
    return X_arr[:, selected]

In [48]:
def fit_transform_p(X, y, k, score_func, workers=-1):
    """Greedy forward selection of ``k`` features, scoring candidates in parallel.

    Each round maps a ``_Parallel`` scorer over the not-yet-selected feature
    indices with ``MapWrapper(workers)`` and keeps the index with the highest
    score.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix; converted with ``np.asarray``.
    y : array-like
        Class labels, handed to ``_Parallel``.
    k : int
        Number of features to select, ``0 <= k <= n_features``.
    score_func : str
        Test name handed to ``_Parallel``.
    workers : int or map-like callable, optional
        Passed straight to ``MapWrapper``; defaults to ``-1``.

    Returns
    -------
    ndarray of shape (n_samples, k)
        The selected columns of ``X`` in the order they were chosen.

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of features.
    """
    X_arr = np.asarray(X)
    n_features = X_arr.shape[1]
    if not 0 <= k <= n_features:
        raise ValueError(
            "k must be between 0 and the number of features "
            "(%d); got %r" % (n_features, k)
        )

    selected = []
    # A real list (not a ``range``) so chosen indices can be removed.
    remaining = list(range(n_features))
    while len(selected) < k:
        # Hand the scorer its own copy of the selection so nothing it does
        # can alter the list that is being built here.
        parallel = _Parallel(
            X_new=X_arr,
            y=y,
            score_func=score_func,
            best_features=list(selected),
        )
        with MapWrapper(workers) as mapwrapper:
            scores = list(mapwrapper(parallel, remaining))
        # scores is aligned with ``remaining``; keep the index an int so it
        # stays usable for column indexing.
        best = remaining[int(np.argmax(scores))]
        selected.append(best)
        remaining.remove(best)
    return X_arr[:, selected]

In [45]:
class _Parallel:
    """Helper callable that scores one candidate feature for a parallel map.

    An instance stores the data, the labels, the test name and the feature
    indices selected so far. Calling it with a candidate index returns the
    ``k_sample_test`` statistic of the selected columns plus that candidate.
    """

    def __init__(self, X_new, y, score_func, best_features):
        self.X_new = X_new
        self.y = y
        self.score_func = score_func
        self.best_features = best_features

    def __call__(self, index):
        """Return the test statistic for ``best_features`` plus ``index``."""
        # Build a fresh list on every call: appending to self.best_features
        # would leak each candidate into all later calls (and into the
        # caller's list, which is shared by reference).
        columns = list(self.best_features) + [index]
        # With no features selected yet this is a single-column 2-D slice.
        X_j = self.X_new[:, columns]
        return k_sample_test(X_j, self.y, self.score_func)

In [31]:
from numpy.testing import assert_almost_equal, assert_raises
from hyppo.ksample import KSample

def k_sample_equals_shuffle(X, y, score_func):
    """Assert that ``k_sample_test`` is unaffected by reordering the columns of ``X``.

    The columns of ``X`` are put in a random order (``np.random.permutation``,
    unseeded) and the statistic of the shuffled matrix is compared with the
    statistic of the original to one decimal place. Returns ``None``;
    ``assert_almost_equal`` raises ``AssertionError`` on a mismatch.
    """
    n_columns = X.shape[1]
    column_order = np.random.permutation(n_columns)
    X_shuffle = X[:, column_order]
    stat_original = k_sample_test(X, y, score_func)
    stat_shuffled = k_sample_test(X_shuffle, y, score_func)
    assert_almost_equal(stat_original, stat_shuffled, decimal=1)
def k_sample_equals_k_matrix_K_Sample(X, y, score_func):
    """Assert that ``k_sample_test`` agrees with hyppo's ``KSample("MGC")``.

    ``X`` is split into one sub-matrix per distinct label in ``y`` (in sorted
    label order). All sub-matrices are passed to ``KSample("MGC").test`` and
    its statistic is compared with ``k_sample_test(X, y, score_func)`` to one
    decimal place.

    Works for any number of groups and any label values; it is no longer
    limited to exactly three groups labelled ``0, 1, 2``.

    Returns ``None``; ``assert_almost_equal`` raises ``AssertionError`` on a
    mismatch.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Boolean row mask per label; np.unique yields each label once, sorted.
    matrices = [X[y == label] for label in np.unique(y)]
    # Take the first field by position instead of unpacking a fixed-length
    # tuple, so extra fields after the statistic (p-value, ...) are ignored.
    true_stat = KSample("MGC").test(*matrices)[0]
    assert_almost_equal(k_sample_test(X, y, score_func), true_stat, decimal=1)
        

In [32]:
# Run the two sanity checks on the iris data set (features X, class labels y).
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
# Both helpers end in an assertion and have no return statement, so each
# print shows None when its check passes.
print(k_sample_equals_shuffle(X,y,"mgc"))

print(k_sample_equals_k_matrix_K_Sample(X,y,"mgc"))

None
None
