In [131]:
import numpy as np
from scipy.stats import multiscale_graphcorr
from scipy._lib._util import MapWrapper
import warnings

In [213]:
def k_sample_test(X, y,score_func="mgc"):
    """Compute the multivariate independence test statisitic for the provided sample.
    Read more in the :ref:`User Guide <multivariate_feature_selection>`.
    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The set of regressors that will be tested sequentially.
    y : ndarray of shape (n_samples,)
        The target vector.
    score_func : string that refers to a k_sample multivariate independence test from scipy
                The default and only existing test is multiscale graph correlation.
    Returns
    -------
    stat : float that refers to the computed k_sample test statistic
    """
    k_array = np.unique(y)
    matrices = []
    for i in k_array:
        indices = np.where(y == i)[0] 
        if len(X.shape) == 1:
            xi = X[indices]
        else:
            xi = X[indices,:]
        matrices.append(xi)
    X = np.concatenate(matrices)
    vs = []
    for i in range(len(np.unique(y))):
        n = matrices[i].shape[0]
        encode = np.zeros(shape=(n, len(matrices)))
        encode[:, i] = np.ones(shape=n)
        vs.append(encode)
    y = np.concatenate(vs)
    warnings.filterwarnings("ignore")
    mgc = multiscale_graphcorr(X,y,reps = 0)
    stat = mgc.stat #mgc case
    return(stat)

In [214]:
def fit_transform(X,y,k,workers = -1): 
    features = np.arange(X.shape[1])#features of matrix X(n samples by p features, creates an array of feature indexes
    best_features = []
    while (len(best_features) < k): # outerloop to obtain up to k features 
        X_new = np.array(X)# establishes matrix of data as np array
        parallel = _Parallel(X_new=X_new, y=y,best_features = best_features)#establishes parallel operation helper class object
        with MapWrapper(workers) as mapwrapper:
            scores = list(mapwrapper(parallel, features)) #maps in parallel the parallel operation that calcs score with the iterable features list to test, with best_features already obtained
        scores_index = np.zeros((len(features),2)) #temp array
        scores_index[:,0] = features #input features as first column, all features tested(exclude best_features)
        scores_index[:,1] = scores #input scores in second column
        sorted_index = scores_index[scores_index[:, 1].argsort()] #sort by scores column
        best = sorted_index[len(scores)-1,0] #find best of the scores
        best_features.append(int(best)) #append new best feature column index 
        features = np.delete(features,np.where(features == best))
    return X_new[:,best_features] # obtain

In [215]:
class _Parallel:
    """Helper function to calculate parallel test value."""

    def __init__(self, X_new, y,best_features):
        self.X_new = X_new
        self.y = y
        self.best_features = best_features

    def __call__(self, index):
        if np.var(self.X_new[:,index]) == 0:
            stat = -1000.0
        else:   
            if len(self.best_features)==0:
                X_j =  self.X_new[:,index] #each feature from j to last feature 
                stat= k_sample_test(X_j,self.y)
            else:
                columns = self.best_features #construct array for indexing 
                columns.append(index)
                X_j = self.X_new[:,columns]#perform test with obtained features against every feature after to then obtain the best group of features with one additional feature 
                stat= k_sample_test(X_j,self.y)
        return stat

In [137]:
# test on dataset from last sem in CMM, high amount of dimensions, low samples select very few features, time about 5 mins
import pandas as pd
expr_data = pd.read_csv('expr.txt', sep= "\t", header='infer')
pheno_data = pd.read_csv('pheno.txt', sep= "\t", header='infer')
pheno_data['RelapseGroup_new'] = np.zeros(pheno_data.shape[0])
i = 0
while (i < pheno_data.shape[0]):
    if pheno_data['RelapseGroup'][i] == 'Relapse':
        pheno_data['RelapseGroup_new'][i] = 1
        i = i + 1
    else:
        pheno_data['RelapseGroup_new'][i] = 0
        i = i + 1
expr_data = expr_data.T
expr_data.drop('GENE', axis = 0, inplace=True)
expr_data = expr_data.astype(float)
expr_data['Phenotype'] = np.zeros(pheno_data.shape[0])
pheno_data.set_index('GEO_ACCESSION',drop = True, inplace = True)
phenotype = pheno_data.loc[:,'RelapseGroup_new']
expr_data.index.name = 'GEO_ACCESSION'
expr_data['Phenotype'] = phenotype
data = np.array(expr_data)
X = data[:,0:22215]
y = data[:,22215]
fit_transform(X,y,3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


array([[13.5745, 12.5243, 14.6717],
       [13.2476, 11.8767, 14.715 ],
       [12.8502, 11.3969, 14.1845],
       [13.0014, 11.621 , 13.6435],
       [12.9124, 11.8306, 15.0428],
       [13.0333, 11.9655, 14.4929],
       [12.2114, 11.3213, 14.0294],
       [12.6004, 10.997 , 14.8682],
       [12.4893, 11.6024, 14.455 ],
       [12.3688, 11.4932, 14.6224],
       [12.8225, 11.8076, 14.5764],
       [11.7899, 11.4094, 14.3924],
       [12.2337, 11.2851, 14.2587],
       [11.7723, 11.5346, 14.0707],
       [13.0272, 11.7531, 14.5536],
       [12.8708, 11.7705, 14.2633],
       [12.4727, 11.1859, 14.0748],
       [12.1196, 12.1531, 13.7019],
       [13.0948, 11.9245, 14.9422],
       [12.2289, 10.9051, 14.3723],
       [12.4988, 11.6073, 14.7345],
       [11.858 , 10.7416, 14.3773],
       [13.047 , 11.3527, 14.1223],
       [12.8655, 11.9285, 13.8704],
       [13.041 , 11.35  , 15.072 ],
       [12.9453, 11.5694, 14.1182],
       [13.0688, 10.9557, 13.8358],
       [12.9483, 11.5178, 14

In [216]:
# test on sklearn dataset, medium-high samples, medium-low features, select like a third of features, 5 mins to run
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
fit_transform(X,y,20)

array([[ 5.,  1.,  8., ..., 10.,  0.,  0.],
       [ 0., 16.,  0., ..., 16., 16., 16.],
       [ 1.,  5.,  0., ..., 11., 13., 15.],
       ...,
       [ 0., 16.,  5., ..., 13., 16., 16.],
       [ 0.,  0.,  7., ..., 16., 14., 14.],
       [ 0.,  4.,  0., ..., 14., 15., 15.]])