In [44]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical
import random
import pandas as pd
from sys import exit
from sklearn import metrics
import matplotlib.pyplot as plt
import csv
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import cPickle as pickle
import time
import json
import sys
import time
import copy


# USUAL Classifier    
def simple_classifier(X, Y, num_samples, mean,deviation, ddof, method, feature_selection, model_params,
               num_features, step, verbose_train):
    model = PLSRegression(n_components=model_params['n_comp'], scale=False) 
    model_temp = copy.copy(model)
    model_temp.fit(X, Y)
    rfe = RFE(estimator=model, n_features_to_select=num_features, step=step)
    fit = rfe.fit(X, Y)
    features = fit.support_
    X_new = X[:, features]
    model.fit(X_new, Y) #ACTUALLY getting the classifier model, fit model to data
    pred = model.predict(X_new)
    coeff = model.coef_
    diff = Y - pred
    train_error = sum(diff*diff) #calculate square error
    train_error_temp = mean_squared_error(y_true=Y, y_pred=pred)
    #fpr, tpr, auc = get_roc_auc(labels=Y_temp, predictions=pred_temp)
    auc = 1.0
    return features, model, train_error, auc, coeff
    
def classifier(X, Y, num_samples, mean,deviation, ddof, method, feature_selection, model_params,
               num_features, step, verbose_train):
    
    #choose, which classifier to use
    if method== 'rf':#build a generic Random Forest
        model = RandomForestRegressor(n_jobs=12, n_estimators=model_params['n_trees'], random_state=0, 
                                      max_features=model_params['max_features'], max_depth=model_params['max_depth'], 
                                      min_samples_leaf=model_params['min_samples_leaf'])
    if method == 'pls':
        model = PLSRegression(n_components=model_params['n_comp'], scale=False) #initialize a generic PLS model with parameters
        
    if method == 'svm':
        model = SVR(C= model_params['C'], epsilon=model_params['epsilon'], kernel=model_params['kernel'], 
                   gamma = model_params['gamma'], degree=model_params['degree'])
        
    #choose a method for feature selection
    if feature_selection == 'FromModel':
        model_temp = model
        model_temp.fit(X, Y)
        model_temp = SelectFromModel(model, prefit=True)
        features = model_temp.get_support()
    elif feature_selection == 'rfecv':
        rfe = RFECV(n_jobs=12, estimator=model, cv=20, step=step)
        fit = rfe.fit(X, Y)
        features = fit.support_
    elif feature_selection == 'rfe':
        model_temp = model
        model_temp.fit(X, Y)
        rfe = RFE(estimator=model, n_features_to_select=num_features, step=step)
        fit = rfe.fit(X, Y)
        features = fit.support_
    elif feature_selection is None:
        features = [True for f in range(X.shape[1])]
    X_new = X[:, features]
    model.fit(X_new, Y) #ACTUALLY getting the classifier model, fit model to data
    pred = model.predict(X_new) #predict the values on the train set
    if method == 'rf':
        coeff = model.feature_importances_.flatten()
    elif method == 'pls' or method == 'svm':
        coeff = model.coef_.flatten()
    #process the predictions
    pred_temp = pred.flatten()
    pred_temp = descale_data(matrix=pred_temp, deviation=deviation, ddof=ddof ) #descale
    pred_temp = pred_temp+mean #add mean
    
    #process the labels
    Y_temp = np.array(Y)
    Y_temp = descale_data(matrix=Y_temp, deviation=deviation, ddof=ddof) #descale
    Y_temp = Y_temp+mean                             #add mean
    #for j in range(len(Y_temp)):
    #    if Y_temp[j]>1:
    #        Y_temp[j]=1 
    #    elif Y_temp[j]<0:
    #         Y_temp[j]=0
    #Y_temp=Y_temp.astype(int)
    
    pred_temp = np.rint(pred_temp)
    
    diff = Y_temp - pred_temp
    train_error = sum(diff*diff) #calculate square error
    train_error_temp = mean_squared_error(y_true=Y_temp, y_pred=pred_temp)
    fpr, tpr, auc = get_roc_auc(labels=Y_temp, predictions=pred_temp)#AUC if needed
    
    if verbose_train:
        print "Training error: ", train_error/num_samples
        print "Training error computed in library: ", train_error_temp
        print "Training auc: ", auc
        plot_roc_curve(fpr, tpr, auc)
    
    return features, model, train_error, auc, coeff

In [45]:
def center_data(matrix, mean=None):
    if mean is None:
        matrix = matrix - np.mean(a=matrix, axis=0)
    else:
        matrix = matrix - mean
    return matrix

In [46]:
def scale_data(matrix, deviation, ddof):
    if deviation is None:
        deviation= np.std(a=matrix, axis=0, ddof=ddof)
    matrix_temp = matrix.copy()
    matrix = matrix/deviation
    if np.isnan(matrix).any():
        matrix = matrix_temp
    return matrix

In [47]:
def descale_data(matrix, deviation, ddof):
    if deviation is None:
        matrix = matrix*np.std(a=matrix, axis=0, ddof=ddof)
    else:
        matrix = matrix*deviation
    return matrix

In [48]:
def get_roc_auc(labels, predictions):
    #compute precision-recall curve
    fpr, tpr, thresholds = metrics.roc_curve(y_true=labels, y_score=predictions, drop_intermediate=False) 
    #compute area under the curve for this run
    auc = metrics.roc_auc_score(y_true=labels, y_score=predictions, average='macro', sample_weight=None)
    return fpr, tpr, auc

In [49]:
def plot_roc_curve(fpr, tpr, auc):
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [50]:
#MULTILEVEL PLS
def perform_multilevel_pls(X, Y, subj, unique_subj, num_unique_subj, num_subj, scaling, par, ddof, method, 
                           feature_selection, num_features, step, model_params, verbose_train):
    
    #mean-centering
    X_centered = center_data(matrix=X, mean=None) #mean centering of data
    Y_centered = center_data(matrix=Y, mean=None) 

    #split matrix into between and within subject variations
    if par!='all':
        Xb, Xw = split_between_within_subject_matrix(X=X_centered, subj=subj, unique_subj=unique_subj, 
                                                 num_unique_subj=num_unique_subj, num_subj=num_subj)
    else:
        Xb = None
        Xw = None
    
    #scaling (if necessary and specified)
    if scaling: #scale the data, if the flag is true
        X_scaled = scale_data(matrix=X_centered, deviation=None, ddof=ddof)
        Y_scaled = scale_data(matrix=Y_centered, deviation=None, ddof=ddof)                          
        if par!= 'all':
            Xb_scaled = scale_data(matrix=Xb, deviation=None, ddof=ddof)
            Xw_scaled = scale_data(matrix=Xw, deviation=None, ddof=ddof)
    else:
        X_scaled = X_centered
        Y_scaled = Y_centered
        if par!= 'all':
            Xb_scaled = Xb
            Xw_scaled = Xw
    
    #which matrix are we interested in
    if par=='all':
        X_target = X_scaled
    elif par=='between':
        X_target = Xb_scaled
    elif par=='within':
        X_target = Xw_scaled
    
    start = time.time()
    #perform classifier (PLS or other) on target data
    features, model, train_error, train_auc, coeff = simple_classifier(X=X_target, Y=Y_scaled, num_samples=num_subj,
                                                           mean=np.mean(Y), deviation= np.std(Y, axis=0, ddof=ddof), 
                                                           ddof=ddof, method=method, 
                                                           feature_selection=feature_selection, 
                                                           num_features=num_features, step=step, 
                                                           model_params=model_params, verbose_train=verbose_train)
    end = time.time()
    print "Time elapsed for simple classifier: ", (end - start)
    return features, model, train_error, train_auc, Xb, Xw, Y_scaled, X_scaled, coeff

In [51]:
def separate_train_test(X, Y, subj, train_subj, test_subj, num_subj):
    
    X_train = None #initialization
    X_test = None
    Y_train = []
    Y_test = []
    subj_train = []
    subj_test = []  
    
    #create the test and train dataset from matrix X with chosen subjects
    mask = np.isin(subj,train_subj)
    inverse_mask = np.invert(mask)

    subj_train = subj[mask]
    subj_test = subj[inverse_mask]
    X_train = X[mask]
    Y_train = Y[mask]
    X_test = X[inverse_mask]
    Y_test = Y[inverse_mask]

    num_train_subj = len(subj_train)          #how many entries in train dataset
    num_test_subj= num_subj - num_train_subj  #how many entries in test dataset
        
    return X_train, X_test, Y_train, Y_test, subj_train, subj_test, num_train_subj, num_test_subj

In [52]:
# FULL SCRIPT
def full_script(num_folds, num_repeats, scaling, num_permutations, par, filename, verbose, ddof, method, 
                feature_selection,num_features,step, model_params,verbose_train, mean):
    
    #READ OR MAKE UP DATA
    X, Y, subj, IDs, NetCalc, frQ = read_data(filename, mean)
    
    #create list of unique subjects
    unique_subj = np.unique(subj) 
    #the number of entries = number of subjects corresponding to entries (1 subject per entry)
    num_subj = len(subj)    
    #the number of unique subjects
    num_unique_subj = len(unique_subj) 
    
    
    #initialization of error
    error = 0 
    permutation_error=None
    permutation_auc=None 
    permutation_Q=None
    crossval_error=None
    crossval_auc=None
    crossval_Q=None
    full_train_error=None
    full_train_auc=None
    crossval_error=None
    crossval_auc=None
    crossval_Q=None
    crossval_train_err=None
    crossval_train_auc=None
    

    #PERFORMING MULTILEVEL PLS ON WHOLE DATASET
    start = time.time()


    features, full_model, full_train_error, full_train_auc, Xb, Xw, Y_scaled, X_scaled, coeff = \
    perform_multilevel_pls(X=X, Y=Y, subj=subj, unique_subj=unique_subj, num_subj=num_subj, 
                           num_unique_subj=num_unique_subj, scaling=scaling, par=par, 
                           ddof=ddof, method=method, feature_selection=feature_selection, 
                           num_features=num_features, step=step, model_params=model_params, verbose_train=verbose_train)
    end = time.time()
    print "Time elapsed for whole multilevel: ", (end - start)
    #print "Features chosen: ", IDs[features]
    #print "Biomarkers chosen: ", NetCalc[features]
    #print "rfQ chosen: ", frQ[features]
    #print "Coefficients: ", coeff
    
    if verbose: 
        print "CROSS_VALIDATION ON ACTUAL DATA: "
        
    #CROSS-VALIDATION
    crossval_error, crossval_auc, crossval_Q, crossval_train_err, crossval_train_auc =\
    cross_validation(X=X, Y=Y, subj=subj, unique_subj=unique_subj, num_subj=num_subj, num_unique_subj=num_unique_subj, 
                     num_folds=num_folds, num_repeats=num_repeats, scaling=scaling, par=par, 
                     verbose=verbose, ddof=ddof, method=method, feature_selection=feature_selection, 
                     num_features = num_features, step=step, model_params=model_params, verbose_train=verbose_train)
    
    if verbose: 
        print "PERMUTATED DATA CROSS_VALIDATION: "
        
    #PERMUTATE
    permutation_error, permutation_auc, permutation_Q, permutation_train_error, permutation_train_auc =\
                                                        validate_permutation(X=X, Y=Y, subj=subj,
                                                                             unique_subj=unique_subj, 
                                                                             num_subj=num_subj,
                                                                             num_unique_subj=num_unique_subj,
                                                                             num_folds=num_folds,
                                                                             num_repeats=num_repeats, 
                                                                             scaling=scaling,
                                                                             num_permutations=num_permutations, 
                                                                             par=par, verbose=verbose, 
                                                                             ddof=ddof,method=method,
                                                                             feature_selection=feature_selection, 
                                                                             num_features=num_features, step=step,
                                                                             model_params=model_params, 
                                                                             verbose_train=verbose_train)
    
    results = {'num_folds':num_folds,'num_repeats':num_repeats,'scaling':scaling,'num_permutations':num_permutations,
               'par':par, 'filename':filename, 'verbose':verbose, 'ddof':ddof, 'method':method, 
               'feature_selection':feature_selection, 'num_features':num_features, 'step':step, 
               'n_trees':model_params['n_trees'], 'max_depth':model_params['max_depth'], 
               'max_features':model_params['max_features'], 'min_samples_leaf':model_params['min_samples_leaf'],
               'num_comp':model_params['n_comp'], 'full_train_error':full_train_error, 'full_train_auc':full_train_auc, 
               'crossval_error':crossval_error, 'crossval_auc':crossval_auc, 'crossval_Q':crossval_Q, 
               'crossval_train_err': crossval_train_err, 'crossval_train_auc':crossval_train_auc, 
               'permutation_error': permutation_error, 'permutation_auc':permutation_auc, 'permutation_Q':permutation_Q, 
               'C': model_params['C'], 'gamma': model_params['gamma'], 'epsilon': model_params['epsilon'], 
               'degree': model_params['degree'], 'kernel': model_params['kernel'] }
    
    return results

In [53]:
#CROSS-VALIDATION
def cross_validation(X, Y, subj, unique_subj, num_subj, num_unique_subj, num_folds, num_repeats, scaling, par, 
                     verbose, ddof, method, feature_selection, num_features, step, model_params,
                     verbose_train):
    #initialization
    error       = 0 
    auc         = 0
    train_error = 0
    train_auc   = 0
    
    #create array of labels for each subject
    Y_temp = [Y[np.where(subj==unique_subj[i])[0][0]] for i in range(num_unique_subj)]
    Y=np.array(Y)
    
    #repeat cross_validation as many times as specified
    for i in range(num_repeats): 
        kf = StratifiedKFold(n_splits=num_folds) #KFold cross validation that keeps the ratio between classes
        kf.get_n_splits(X, Y)
        
        for train_index, test_index in kf.split(X[:num_unique_subj], Y_temp):#repeat with every fold as test set
            train_subj = unique_subj[train_index]
            test_subj = unique_subj[test_index]
        
            X_train, X_test, Y_train, Y_test, subj_train, subj_test, num_train_subj, num_test_subj = \
                                                    separate_train_test(X=X, Y=Y, subj=subj, train_subj=train_subj,
                                                                        test_subj=test_subj, num_subj=num_subj)
                        
            num_unique_train_subj=len(train_subj)
            num_unique_test_subj=len(test_subj)
            
            X_train_mean = np.mean(X_train, axis=0)
            Y_train_mean = np.mean(Y_train, axis=0)
            
            features, model, train_error_temp, train_auc_temp, Xb_train, Xw_train, Y_scaled_train, X_scaled_train, coeff = \
                                       perform_multilevel_pls(X=X_train, Y=Y_train, subj=subj_train, 
                                                              unique_subj=train_subj, 
                                                              num_unique_subj=num_unique_train_subj, 
                                                              num_subj=num_train_subj, scaling=scaling, par=par, 
                                                              ddof=ddof, method=method, 
                                                              feature_selection=feature_selection, 
                                                              num_features=num_features, step=step,
                                                              model_params=model_params, 
                                                              verbose_train=verbose_train)

            X_centered_test = center_data(X_test, X_train_mean) #mean centering of data
            Y_centered_test = center_data(Y_test, Y_train_mean)   

            #split test data into between and within subject variation
            if par!= 'all':
                Xb_test, Xw_test = split_between_within_subject_matrix(X=X_centered_test, subj=subj_test, 
                                                                   unique_subj=test_subj,
                                                                   num_unique_subj=num_unique_test_subj, 
                                                                   num_subj=num_test_subj )

                
            if scaling: #scale the data, if the flag is true
                X_train_deviation = np.std(X_train, axis = 0, ddof=ddof)
                Y_train_deviation = np.std(Y_train, axis = 0, ddof=ddof)
                if par!= 'all':
                    Xb_train_deviation = np.std(Xb_train, axis = 0, ddof=ddof)
                    Xw_train_deviation = np.std(Xw_train, axis = 0, ddof=ddof)

                X_scaled_test = scale_data(matrix=X_centered_test, deviation=X_train_deviation, ddof=ddof)
                Y_scaled_test = scale_data(matrix=Y_centered_test, deviation=Y_train_deviation, ddof=ddof)
                if par!= 'all':
                    Xb_scaled_test = scale_data(matrix=Xb_test, deviation=Xb_train_deviation, ddof=ddof)
                    Xw_scaled_test = scale_data(matrix=Xw_test, deviation=Xw_train_deviation, ddof=ddof)
            else:

                X_scaled_test = X_centered_test
                Y_scaled_test = Y_centered_test
                if par!= 'all':
                    Xb_scaled_test = Xb_test
                    Xw_scaled_test = Xw_test

            #if only between or only within subject variations chosen, predict on that part of the matrix
            if par=='all':
                X_target = X_scaled_test
            elif par=='between':
                X_target = Xb_scaled_test
            elif par=='within':
                X_target = Xw_scaled_test
            X_target = X_target[:, features]
                     
            pred = model.predict(X_target) #predict test data with model trained on the training set
             
            #process the predicted values
            pred=pred.flatten()
            pred= pred.reshape(np.product(pred.shape),)
            pred = descale_data(matrix=pred, deviation=Y_train_deviation, ddof=ddof)
            pred = pred+Y_train_mean

            #compute the cross-validation cumulative error (squeared error)
            diff = np.array(Y_test - pred) 
            err_temp= sum(diff*diff)

            #get and plot ROC curve, if want
            fpr, tpr, auc_temp = get_roc_auc(labels=Y_test, predictions=pred)        
            if i%10==0 and verbose:
                plot_roc_curve(fpr=fpr, tpr=tpr, auc=auc_temp)
                
            auc = auc + auc_temp            
            error = error + err_temp #compute square error
            train_error =train_error+train_error_temp
            train_auc = train_auc+train_auc_temp
    
        
    error = float(error)/(num_repeats*num_subj) #mean error for cross-validation
    auc = auc/(num_repeats*num_folds)     #mean AUC score for cross validation
    train_error = float(train_error)/(num_repeats*num_subj)
    train_auc = float(train_auc)/(num_repeats*num_folds) 
    Q = 1 - error/(sum(Y*Y)/float(num_subj))
    if verbose:
        print "Mean cross-validation error: ", error
        print "Mean cross-validation AUC: ", auc
        print "Cross-validation Q: ", Q
    if verbose_train:
        print "Mean train cross-validation error: ", error
        print "Mean train cross-validation AUC: ", auc
        
    return error, auc, Q, train_error, train_auc

In [54]:
#split data matrix nito between and within subject variations
def split_between_within_subject_matrix(X, subj, unique_subj, num_unique_subj, num_subj): #SPLIT MATRIX 
    Xb = np.zeros(X.shape) #initialization
    Xw = np.zeros(X.shape)
    means = np.zeros((num_unique_subj, X.shape[1]))
    for j in range(num_unique_subj): #go through all unique subjects
        #find indexes of all entries for a certain subject (unique_subj[j])
        idx = np.where(np.array(subj)==unique_subj[j]) 
        #calculate mean for each subject unique_subj[j]      
        means[j] = np.mean(X[idx[0]], axis=0)     
    for i in range(num_subj): #go through subjects of all entries
        #find the index of the subject corresponding to subj[i] in unique_subj
        k = np.where(unique_subj==subj[i])
        #create a matrix where all entries for subject = mean (between subject variation) 
        Xb[i] = means[k[0][0]]                  
    Xw = X - Xb #get the within subjects matrix             
    return Xb, Xw

In [55]:
#PERMUTATION
def validate_permutation(X, Y, subj, unique_subj, num_subj, num_unique_subj, num_folds, num_repeats, scaling, 
                         num_permutations, par, verbose, ddof, method, feature_selection, num_features, 
                         step, model_params, verbose_train):
    
    err       = [] #initialization
    auc       = []
    Q         = []
    train_err = []
    train_auc = []
    
    for i in range(num_permutations):#perform permutations as many times as specified
        Y_temp = Y.copy()  #create temporary labels which then shuffle/permutate, so original is left untouched
        
        np.random.shuffle(Y_temp) #randomly shuffle the Y(labels) vector. Checks if your results will be similar
        
        #perform cross-validation for each permutation and get an array of accuracies when permutated
        err_temp, auc_temp, Q_temp, train_error_temp, train_auc_temp  = cross_validation(X=X, Y=Y_temp, subj=subj, 
                                                                            unique_subj=unique_subj,
                                                                            num_subj=num_subj, 
                                                                            num_unique_subj=num_unique_subj, 
                                                                            num_folds=num_folds, 
                                                                            num_repeats=num_repeats, 
                                                                            scaling=scaling, 
                                                                            par=par, 
                                                                            verbose=verbose,ddof=ddof,method=method, 
                                                                            feature_selection=feature_selection, 
                                                                            num_features=num_features, step=step, 
                                                                            model_params=model_params, 
                                                                            verbose_train=verbose_train)
        
        #save all the error and metrics values
        err.append(err_temp)
        auc.append(auc_temp)
        Q.append(Q_temp)
        train_err.append(train_err_temp)
        train_auc.append(train_auc_temp)
        
    if len(err) >0 and verbose :
        print "Mean permutated squared error : ", np.mean(err)
        print "Mean permutated auc : ", np.mean(auc)
        print "Mean permutated Q: ", np.mean(Q)
    elif verbose:
        print "No permutations performed"
    return err, auc, Q, train_err, train_auc

In [56]:
def read_data(file_name=None, mode='delta1'):
    if file_name is None: #if no input file specified, make up data
        num_subj = 30
        num_feat = 1000
        X_out = np.random.rand(num_subj, num_feat)
        for i in range(3*num_subj):
            if i%2==0:
                X[i, 0] = X[i, 0]+(np.random.rand()+0.75)*15
                X[i, 3] = X[i, 3]+(np.random.rand()+0.75)*20
                X[i, 6] = X[i, 6]+(np.random.rand()+0.75)*17
        Y =    [1 if i%2 == 0 else 0 for i in range(3*num_subj)]
        subjects = [1+i%num_subj for i in range(3*num_subj)]

    else:
        data = pd.read_excel(file_name) #read the excel file
        #take relevant values as subject id's, convert to int
        subjects = data.head().iloc[-2].values[0:148].astype(np.int64) 
        #take relevant values as labels, convert to int
        
        X = {}
        subjects = {}
        Y = {}
        
        #take the matabolite matrix, transpose, convert to int
        X['all'] = data.values[7:, :-9].transpose().astype(np.int64)  
        Y['all'] = (data.head().iloc[-3].values[0:148]=='risk').astype(np.int64)
        subjects['all'] = data.head().iloc[-2].values[0:148].astype(np.int64)
        
        unique_subj = np.unique(subjects['all']) 
        num_unique_subj = len(unique_subj)
        num_subj = len(subjects['all'])
        subject_time = {}
        mask_temp = np.array([subjects['all'][index+1]-subjects['all'][index] for index in range(num_subj-1)])
        lines = np.where(mask_temp<0)[0]
        X['delta1'] = []
        X['delta2'] = []
        X['delta3'] = []
        subjects['delta1'] = []
        subjects['delta2'] = []
        subjects['delta3'] = []
        Y['delta1'] = []
        Y['delta2'] = []
        Y['delta3'] = []
        for s in unique_subj:
            indeces_temp = np.where(subjects['all']==s)[0]
            subject_time[s]= {'t0':None, 't1':None, 't2':None}
            for el in indeces_temp:
                    if el>lines[1]:
                        subject_time[s]['t2']= el
                    elif el <= lines[0]:
                        subject_time[s]['t0']= el
                    else:
                        subject_time[s]['t1']= el
            if (subject_time[s]['t1'] is not None) and (subject_time[s]['t0'] is not None):
                X['delta1'].append((X['all'][subject_time[s]['t1']]-X['all'][subject_time[s]['t0']]).reshape(1, X['all'].shape[1]))
                subjects['delta1'].append(s)
                Y['delta1'].append(Y['all'][indeces_temp[0]])
            if (subject_time[s]['t2'] is not None) and (subject_time[s]['t1'] is not None):
                X['delta2'].append((X['all'][subject_time[s]['t2']]-X['all'][subject_time[s]['t1']]).reshape(1, X['all'].shape[1]))
                subjects['delta2'].append(s)
                Y['delta2'].append(Y['all'][indeces_temp[0]])
            if (subject_time[s]['t2'] is not None and subject_time[s]['t0'] is not None):
                X['delta3'].append((X['all'][subject_time[s]['t2']]-X['all'][subject_time[s]['t0']]).reshape(1, X['all'].shape[1]))
                subjects['delta3'].append(s)
                Y['delta3'].append(Y['all'][indeces_temp[0]])
        X['delta1'] = np.vstack( X['delta1'])
        X['delta2'] = np.vstack( X['delta2'])
        X['delta3'] = np.vstack( X['delta3'])
        
        import xlrd
 
        loc = (file_name)

        # To open Workbook
        wb = xlrd.open_workbook(loc)
        sheet = wb.sheet_by_index(0)

        # For row 0 and column 0
        IDs = np.array(sheet.col_values(3)[8:])
        NetCalc = np.array(sheet.col_values(6)[8:])
        frQ = np.array(sheet.col_values(9)[8:])
         
    return X[mode], np.array(Y[mode]), np.array(subjects[mode]), IDs, NetCalc, frQ

In [57]:
file_name = 'OGTT_INPUT.xlsx' #name of the file with data (with path, if not in the same folder)
matrix = 'all'          #which matrix to work with(depends on experiment setup)-'between','within','all', 'delta1', 'delta2', 'delta3'           
mean = 'delta1'
verbose = False                #if True, will plot ROC curves. Else, put False
verbose_train = False       #print AUC and error for training data
num_comp = 1                  #number of components for PLS
method = 'pls'                #which method to use for classifying. 'rf' or 'pls'

feature_selection = 'rfe'    #which method to use for feature selection 'rfe', 'rfecv', "FromModel" or None
num_features = 150           #how many features to select (if method has this parameter)
step = 0.3                  #how many features to discard in one iteration, if applicable

n_trees = 350 #how many trees there will be in the ensemble classifier Random Forest (if applicable)
max_depth = 2   #maximal depth of the decision trees in Random Forest
max_features = 1 #maximal number of features concidered at each split by the decision tree in Random Forest
min_samples_leaf=1 #minimal number of samples to be left after split (in a leaf) of decision tree in Random Forest

C = 1
epsilon = 1
kernel = 'linear'
gamma = 'auto'
degree=3

scaling = True               #whether to scale the data. True or False  
num_folds = 20              #How many folds for k-fold cross-validation
num_repeats = 1         #How many times to repeat k-fold cross-validation
num_permutations = 0         #How many permutations to make (sanity check for model, 
                              #if permutated=randomly shuffled data results are too similar - BAD )
ddof= 1  #parameter for computing standard deviation,relevant only if scaling = True. 

model_params = {}
model_params['pls'] = {}

model_params['pls']['C'] = C
model_params['pls']['epsilon'] = epsilon
model_params['pls']['kernel'] = kernel
model_params['pls']['gamma'] = gamma
model_params['pls']['degree'] = degree
model_params['pls']['n_trees'] = None
model_params['pls']['max_depth'] = None
model_params['pls']['max_features'] = None
model_params['pls']['min_samples_leaf'] = None
model_params['pls']['n_comp'] = 2

#res = full_script(num_folds,num_repeats,scaling, num_permutations, matrix, file_name, 
#                                           verbose, ddof, method, feature_selection, num_features, step, 
#                                           model_params[method], verbose_train)
#print "Results: ", res

matrix = 'all'
res = full_script(num_folds,num_repeats,scaling, num_permutations, matrix, file_name, 
                                           verbose, ddof, method, feature_selection, num_features, step, 
                                           model_params[method], verbose_train, mean)
#print "Results: ", res

Time elapsed for simple classifier:  95.6659998894
Time elapsed for whole multilevel:  95.743999958
Time elapsed for simple classifier:  97.8040001392
Time elapsed for simple classifier:  95.2589998245


ValueError: operands could not be broadcast together with shapes (44,) (45,) 

In [69]:
b = {'id':[i for i in range(8000)], 'delta1':[0.1 for i in range(8000)], 'delta2':[0.7 for i in range(8000)]}
print "Type: ", type(b['delta1'])
dest_file = "t_stuff.csv"
print "Type of values are: ", type(b.values())
with open(dest_file, 'w') as csvfile:
    writer = csv.writer(csvfile)
    #writer.writeheader()
    writer.writerow(b.keys())
    writer.writerows(zip(*b.values()))

Type:  <type 'list'>
Type of values are:  <type 'list'>


In [275]:
np.power(np.e, -1.5)

0.22313016014842985

In [None]:
arr1 = [27106,
105321,
126660,
115477,
105371,
141743,
111233,
124210,
143869,
123297,
145408,
98783,
110631,
116150,
144914,
114368,
55564,
108997,
151744,
61031,
144007,
130687,
70656,
143287,
111268,
143689,
133428,
116684,
56977,
167862,
114684,
176474,
38745,
113884,
118805,
101025,
99988,
77782,
137624,
133451,
146956,
146587,
147223,
151351,
147353,
53154,
157135,
131956,
117846,
109380,
148226,
114713,
111846,
105099,
41791,
143598,
160588,
160162,
149503,
111308,
135131,
121102,
99759,
118373,
140558,
133646,
150408,
144329,
147111,
25378,
26440,
122867,
81741,
120258,
133826,
114451,
123300,
156797,
143764,
173188,
72767,
140464,
133371,
55674,
101847,
46429,
158312,
148683,
117161,
102078,
107153,
165258,
134953,
134826,
117812,
161209,
172297,
97013,
138437,
113384,
64418,
101171,
133430,
129606,
69565,
144012,
147886,
106130
91957
78910
168924
132854
149525
142397
54469
81194
152638
136217
102071
109189
132690
84139
57622
134992
116388
46949
105895
126667
123808
175673
159126
159222
61572
112401
62597
66586
109893
112171
117133
144035
93876
101797
157654
156409
105832
172100
75611
122104
111856
76912
90322
83709
99724
132331
113328
108871
139402
148039
114194
49544
114268
106671
151593
103372
144395
166516
94848
41012
133631
126557
87247
143847
120961
83971
164989
133992
148861
125532
158277
81993
153432
140036
123016
152261
140235
115198
118088
93554
144322
141866
75234
116141
120674
89149
174883
117044
120459
153530
118013
184329
133810
91913
131636
161386
34067
116765
116969
95618
96311
70486
120259
169783
108399
116452
118150
104612
111945
115738
86401
150517
112685
151009
153224
115143
56201
106307
140721
86504
117583
176937
145919
122478
83231
137802
99860
94215
120669
158755
144820
78839
156017
146842
106519
156284
158804
98538
134658
125941
111905
91894
169965
145817
110192
132130
129183
133553
185709
114803
175066
170398
24102
122442
103707
157989
53126
161317
178875
114201
55040
72241
160226
133997
120639
101234
143145
148528
172174
118560
170927
97619
92065
94997
180769
89932
45822
123011
111652
117664
65937
118568
114213
144951
150151
120735
105372
23204
116613
75714
174279
119578
76415
86756
122054
129522
110757
104326
128750
41131
141850
165086
192530
122508
141075
159266
152181
27836
98225
147917
133929
173147
100418
180201
79004
139889
142471
151173
91437
154395
132134
124782
98351
152433
138484
145664
128156
156326
140495
88089
160577
179103
94873
152891
166110
136197
134480
109826
145249
156301
46468
112335
114199
133993
138644
103674
127758
98806
138190
135112
157557
122803
114526
134154
144127
126225
97056
120289
149273
143665
143395
93477
145623
133623
120761
85974
94895
103675
151299
120670
74704
86897
174979
96732
149030
100728
142786
138780
153667
141582
134644
124628
136238
141199
157153
136635
143274
147760
126393
108622
138061
33607
158654
165739
97581
125320
134358
123793
93456
123087
118349
123912
115801
173199
83147
112431
134281
84512
171466
160346
96777
134960
158015
108827
121614
99130
126381
96358
116874
103657
156972
73158
157541
98206
129885
114111
111272
142284
150296
153926
120293
160580
108345
125429
135723
118014
84867
121381
99833
124392
172528
106375
133737
102054
106468
126611
94999
151337
150153
160205
88722
15941
123046
124758
125737
168699
131804
131037
161732
136140
149649
29130
133438
83051
152526
112635
102982
137656
110274
129979
112437
92918
156122
38860
104914
88233
156253
95739
149189
63581
117109
100764
57571
136303
116092
135837
118641
109664
156294
93108
70020
136091
174794
93015
147427
140056
136815
117573
102537
116193
108904
123324
60576
129222
156926
111866
143546
145389
138660
89192
115231
118862
121029
163107
150305


In [None]:
a = [[10, 9, 11, 2, 5, 6, 7, 3, 0, 4],[10, 9, 11, 2, 5, 6, 7, 3, 0, 4],[10, 9, 11, 2, 5, 6, 7, 3, 0, 4]]
b = [1, 3, 2, 7]