In [150]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [222]:
%%cython --annotate

cimport numpy as np
import numpy as np
import warnings
from numpy cimport ndarray, int64_t, float64_t

ctypedef fused ordered:
    np.int32_t
    np.int64_t
    np.float32_t
    np.float64_t

from scipy.stats import multiscale_graphcorr
cpdef np.float64_t k_sample_test(float64_t [:, :] X_view , float64_t [:] y_view, str score_func = 'mgc'):
    """Nonparametric `K`-Sample Testing test statistic.
     
    A k-sample test tests equality in distribution among groups. Groups
    can be of different sizes, but must have the same dimensionality.
    This implementation reduces the k-sample testing to an 
    independence testing problem, and leverages notable and powerful
    multivariate independence tests.
    
    Read more in the :ref:`User Guide <multivariate_feature_selection>`.
    
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample vectors.
    y : ndarray of shape (n_samples,)
        The target vector.
    score_func : string that refers to a multivariate independence test from scipy
        The default and only existing test is multiscale graph correlation.
    
    Returns
    -------
    stat : float that refers to the computed k-sample test statistic
    
    Notes
    -----
    1. The k-sample testing problem can be thought of as a generalization of
    the two sample testing problem. 
    
    2. By manipulating the inputs of the k-sample test, we create
    concatenated versions of the inputs and a label matrix which are
    paired. Then, any multivariate nonparametric test can be performed on
    this data.
    
    3. Multivariate feature selection uses k-sample test score function to
    calculate a test statistic for each feature not already selected as a 
    best feature. For each feature in that sub-section, inputted is a data matrix 
    with best features selected and that additional feature.
    
    References
    ----------
    .. [1] Sambit Panda, Cencheng Shen, Ronan Perry, Jelle Zorn, Antoine Lutz, 
           Carey E. Priebe, and Joshua T. Vogelstein. Nonpar MANOVA via 
           Independence Testing. arXiv:1910.08883 [cs, stat], April 2021. 

    """
    # extract data matrix of shape (_samples,_features) for each group

     # unsure if needs to be rewritten in cython
    cdef list matrices = []
    cdef int64_t shape = len(X_view.shape)
    cdef list indices_view  = []
    cdef float64_t [:,:] xi_view 
    cdef int64_t [:] unique_view = np.array(np.unique(y_view), dtype = ('int64'))
    cdef int64_t [:] y_int_view = np.array(y_view,dtype = np.int64)
    for j in unique_view:
        for i in y_int_view:
            if i == j:
                indices_view.append(i)
        if shape == 1:
            xi_view = X_view.base[indices_view]
        else:
            xi_view = X_view.base[indices_view,:]
        matrices.append(xi_view)
    X_view = np.concatenate(matrices)
    # one hot encode y for multivariate independence test
    cdef list vs = []
    cdef int64_t n
    cdef float64_t [:, :] encode_view  
    cdef int64_t classes = unique_view.size
    for k in np.arange(classes, dtype = np.int64):
        n = matrices[k].shape[0]
        encode_view = np.zeros(shape=(n, classes), dtype = np.float64)
        encode_view[:, k] = np.ones(shape=n, dtype = np.float64)
        vs.append(encode_view)
    y_view = np.concatenate(vs)
    # default, which is mgc case
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        mgc = multiscale_graphcorr(X_view,y_view,reps = 0) #unsure of cythonizing
    cdef float64_t stat = mgc.stat 
    return stat

In [223]:
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
k_sample_test(X.astype('float64'),y.astype('float64'))

TypeError: only size-1 arrays can be converted to Python scalars

Exception ignored in: '_cython_magic_b0733ecf45153bf2740c1cd325a161cc.k_sample_test'
TypeError: only size-1 arrays can be converted to Python scalars


0.0

In [211]:
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
import numpy as np
from scipy.stats import multiscale_graphcorr
from scipy.sparse import isspmatrix
import warnings
from sklearn.utils.validation import check_is_fitted
from joblib import Parallel, delayed

######################################################################
# Scoring function

# The following is a rewriting of hyppo.ksample.KSample
# from hyppo.neurodata.io
def k_sample_test_2(X, y,score_func="mgc"):
    """Nonparametric `K`-Sample Testing test statistic.
     
    A k-sample test tests equality in distribution among groups. Groups
    can be of different sizes, but must have the same dimensionality.
    This implementation reduces the k-sample testing to an 
    independence testing problem, and leverages notable and powerful
    multivariate independence tests.
    
    Read more in the :ref:`User Guide <multivariate_feature_selection>`.
    
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample vectors.
    y : ndarray of shape (n_samples,)
        The target vector.
    score_func : string that refers to a multivariate independence test from scipy
        The default and only existing test is multiscale graph correlation.
    
    Returns
    -------
    stat : float that refers to the computed k-sample test statistic
    
    Notes
    -----
    1. The k-sample testing problem can be thought of as a generalization of
    the two sample testing problem. 
    
    2. By manipulating the inputs of the k-sample test, we create
    concatenated versions of the inputs and a label matrix which are
    paired. Then, any multivariate nonparametric test can be performed on
    this data.
    
    3. Multivariate feature selection uses k-sample test score function to
    calculate a test statistic for each feature not already selected as a 
    best feature. For each feature in that sub-section, inputted is a data matrix 
    with best features selected and that additional feature.
    
    References
    ----------
    .. [1] Sambit Panda, Cencheng Shen, Ronan Perry, Jelle Zorn, Antoine Lutz, 
           Carey E. Priebe, and Joshua T. Vogelstein. Nonpar MANOVA via 
           Independence Testing. arXiv:1910.08883 [cs, stat], April 2021. 

    """
    # extract data matrix of shape (_samples,_features) for each group
    k_array = np.unique(y)
    matrices = []
    for i in k_array:
        indices = np.where(y == i)[0] 
        if len(X.shape) == 1:
            xi = X[indices]
        else:
            xi = X[indices,:]
        matrices.append(xi)
    X = np.concatenate(matrices)
    # one hot encode y for multivariate independence test
    vs = []
    for i in range(len(np.unique(y))):
        n = matrices[i].shape[0]
        encode = np.zeros(shape=(n, len(matrices)))
        encode[:, i] = np.ones(shape=n)
        vs.append(encode)
    y = np.concatenate(vs)
    
    # default, which is mgc case
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        mgc = multiscale_graphcorr(X,y,reps = 0)
    stat = mgc.stat 
    return stat

from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
k_sample_test_2(X,y)

0.6621577707289195

[0 1 2 ... 8 9 8]


TypeError: only size-1 arrays can be converted to Python scalars