In [2]:
%load_ext Cython

In [14]:
%%cython --annotate

cimport numpy as np
import numpy as np
import warnings

from scipy.stats import multiscale_graphcorr
cpdef double k_sample_test(np.ndarray[double, ndim=2] X, np.ndarray[double, ndim=1] y,str score_func="mgc"):
    """Nonparametric `K`-Sample Testing test statistic.
     
    A k-sample test tests equality in distribution among groups. Groups
    can be of different sizes, but must have the same dimensionality.
    This implementation reduces the k-sample testing to an 
    independence testing problem, and leverages notable and powerful
    multivariate independence tests.
    
    Read more in the :ref:`User Guide <multivariate_feature_selection>`.
    
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample vectors.
    y : ndarray of shape (n_samples,)
        The target vector.
    score_func : string that refers to a multivariate independence test from scipy
        The default and only existing test is multiscale graph correlation.
    
    Returns
    -------
    stat : float that refers to the computed k-sample test statistic
    
    Notes
    -----
    1. The k-sample testing problem can be thought of as a generalization of
    the two sample testing problem. 
    
    2. By manipulating the inputs of the k-sample test, we create
    concatenated versions of the inputs and a label matrix which are
    paired. Then, any multivariate nonparametric test can be performed on
    this data.
    
    3. Multivariate feature selection uses k-sample test score function to
    calculate a test statistic for each feature not already selected as a 
    best feature. For each feature in that sub-section, inputted is a data matrix 
    with best features selected and that additional feature.
    
    References
    ----------
    .. [1] Sambit Panda, Cencheng Shen, Ronan Perry, Jelle Zorn, Antoine Lutz, 
           Carey E. Priebe, and Joshua T. Vogelstein. Nonpar MANOVA via 
           Independence Testing. arXiv:1910.08883 [cs, stat], April 2021. 

    """
    # extract data matrix of shape (_samples,_features) for each group
    cdef double [:, :] X_view = X
    cdef double [:] y_view = y
    cdef list arr = []
    for i in range(y_view.size):
        if y_view[i] not in arr:
            arr.append(i)
    cdef double [:] unique_view = arr # unsure if needs to be rewritten in cython
    cdef list matrices = []
    cdef list indices = []
    cdef int shape = len(X_view.shape)
    cdef int [:] indices_view 
    cdef double [:,:] xi_view 
    for j in unique_view:
        indices_view = np.where(y == j)[0] 
        if shape == 1:
            xi_view = X_view.base[indices_view]
        else:
            xi_view = X_view.base[indices_view,:]
        matrices.append(xi_view)
    X_view = np.concatenate(matrices)
    # one hot encode y for multivariate independence test
    cdef list vs = []
    cdef int n
    cdef double [:, :] encode_view  
    for k in range(unique_view.size):
        n = matrices[k].shape[0]
        encode_view = np.zeros(shape=(n, len(matrices)))
        encode_view[:, j] = np.ones(shape=n)
        vs.append(encode_view)
    y_view = np.concatenate(vs) #unsure if needs to be rewritten cython
    
    # default, which is mgc case
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        mgc = multiscale_graphcorr(X_view,y_view,reps = 0) #unsure of cythonizing
    cdef double stat = mgc.stat 
    print(stat)
    return stat

In [15]:
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
k_sample_test(X,y)

ValueError: Buffer dtype mismatch, expected 'double' but got 'long'