In [5]:
import classification
import os
import sys
import pandas as pd
import numpy as np
import itertools
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import KFold
# Subjects included in the analysis.
subList = [
    'MSC01', 'MSC02', 'MSC03', 'MSC04',
    'MSC05', 'MSC06', 'MSC07', 'MSC10',
]

# Project folders, all rooted at the analysis directory under the user's home.
thisDir = os.path.expanduser('~/Desktop/MSC_Alexis/analysis/')
dataDir = thisDir + 'data/mvpa_data/'
outDir = thisDir + 'output/mL/'

# Every ordered (train_sub, test_sub) pair of two distinct subjects.
subsComb = list(itertools.permutations(subList, 2))

taskList = ['mixed', 'motor', 'mem']

# Cross product of the subject pairs with the task names.
DSvars = list(itertools.product(subsComb, taskList))

In [6]:
def classifyAll():
    """
    Run the train/test classification for every subject pair while varying
    how many samples ("days") are used for training.

    For each sample count from 30 down to 4, a random subset of that many
    distinct sample indices is drawn and passed to ``model`` for every
    (train_sub, test_sub) pair in ``subsComb``. The same subset is shared by
    all pairs for a given sample count.

    Parameters
    -------------

    Returns
    -------------
    df : DataFrame
        One row per (train_sub, test_sub, Days) with columns
        'train_sub', 'test_sub', 'Within' (same-subject cross validation
        accuracy), 'Between' (accuracy on the test subject) and 'Days'
        (number of samples used).

    """
    n_samples = 30  # samples available per condition
    min_days = 3    # exclusive lower bound on the number of samples used

    per_day_frames = []
    for days in range(n_samples, min_days, -1):
        # Draw WITHOUT replacement: sampling with replacement (randint) can
        # repeat an index, so fewer than `days` distinct samples would be used
        # and a duplicated row could sit in both the training and validation
        # fold of the downstream K-fold split.
        idx = np.random.choice(n_samples, size=days, replace=False)

        between_scores = []
        within_scores = []
        for train_sub, test_sub in subsComb:
            diff_score, same_score = model(idx, train_sub=train_sub, test_sub=test_sub)
            between_scores.append(diff_score)
            within_scores.append(same_score)

        day_df = pd.DataFrame(subsComb, columns=['train_sub', 'test_sub'])
        day_df['Within'] = within_scores
        day_df['Between'] = between_scores
        day_df['Days'] = days
        per_day_frames.append(day_df)

    # Concatenate once instead of growing a DataFrame inside the loop.
    allDay = pd.concat(per_day_frames)
    return allDay
    
def model(idx, train_sub, test_sub):
    """
    Load the task and rest data of two subjects and score a ridge classifier
    trained on the first subject.

    Parameters
    -------------
    idx : array_like
            Sample indices forwarded unchanged to ``CV_folds``
    train_sub : str
            Subject name for training
    test_sub : str
            Subject name for testing

    Returns
    -------------
    diff_score : float
            First value returned by ``CV_folds`` (score on the test subject)
    same_score : float
            Second value returned by ``CV_folds`` (cross validation score on
            the training subject)

    """

    def load_subject(sub):
        # Read the three task files (mem, mixed, motor, stacked in that order)
        # followed by the rest file for one subject.
        fname = sub + '_parcel_corrmat.mat'
        task_parts = [
            classification.matFiles(dataDir + task_dir + fname)
            for task_dir in ('mem/', 'mixed/', 'motor/')
        ]
        rest_part = classification.matFiles(
            dataDir + 'rest/corrmats_timesplit/thirds/' + fname
        )
        return np.concatenate(tuple(task_parts)), rest_part

    taskFC, restFC = load_subject(train_sub)
    test_taskFC, test_restFC = load_subject(test_sub)

    ridge = RidgeClassifier()
    scores = CV_folds(idx, train_sub, ridge, taskFC, restFC, test_taskFC, test_restFC)
    diff_score, same_score = scores
    return diff_score, same_score

def CV_folds(idx, train_sub, clf, taskFC, restFC, test_taskFC, test_restFC):
    """
    5-fold cross validation on the training subject, scoring each fold's
    model on its held-out rows and on the test subject.

    Task rows are labelled 1 and rest rows are labelled 0.

    Parameters
    -----------
    idx : array_like of int
        Row indices selecting which rows of taskFC and restFC are used for
        training / within-subject validation
    train_sub : str
        Training subject name. Not used by the computation; kept so existing
        callers keep working
    clf : obj
        Machine learning algorithm exposing ``fit(X, y)`` and ``score(X, y)``
    taskFC, restFC, test_taskFC, test_restFC : array_like
        Input arrays, training and testing set of task and rest FC, one row
        per sample. Fold indices derived from test_taskFC are also applied to
        test_restFC, so test_restFC needs at least as many rows as
        test_taskFC
    Returns
    -----------
    diff_sub_score : float
        Accuracy on the test subject: for each outer fold the mean accuracy
        over the test-subject folds, averaged over the outer folds
    same_sub_score : float
        Mean accuracy on the held-out rows of the training subject
    """

    # NOTE(review): KFold is used without shuffling and X is ordered
    # task-then-rest, so each validation fold is a contiguous block of X.
    # Confirm this is the intended split.
    kf = KFold(n_splits=5)

    task_X = taskFC[idx]
    rest_X = restFC[idx]
    X = np.concatenate((task_X, rest_X))
    Y = np.concatenate((np.ones(len(task_X), dtype=int),
                        np.zeros(len(rest_X), dtype=int)))

    # The test-subject folds do not depend on the fitted model, so build them
    # once instead of once per outer fold.
    test_folds = []
    for _, te_index in kf.split(test_taskFC):
        X_te = np.concatenate((test_taskFC[te_index], test_restFC[te_index]))
        y_te = np.concatenate((np.ones(len(te_index), dtype=int),
                               np.zeros(len(te_index), dtype=int)))
        test_folds.append((X_te, y_te))

    cv_scores = []
    between_scores = []
    for train_index, val_index in kf.split(X):
        clf.fit(X[train_index], Y[train_index])
        # score() already predicts internally; no separate predict() call.
        cv_scores.append(clf.score(X[val_index], Y[val_index]))
        fold_scores = [clf.score(X_te, y_te) for X_te, y_te in test_folds]
        between_scores.append(np.mean(fold_scores))

    same_sub_score = np.mean(cv_scores)
    diff_sub_score = np.mean(between_scores)
    return diff_sub_score, same_sub_score


In [None]:
# Repeat the full days analysis 1000 times and save all results to one CSV.
N_ITERATIONS = 1000
out_path = outDir + 'results/ridge/acc/ALL/days/thousand_acc.csv'

# Create the output folder before the long computation so a missing
# directory cannot make to_csv fail after every iteration has already run.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

iteration_frames = []
for i in range(N_ITERATIONS):
    df = classifyAll()
    iteration_frames.append(df)

# Concatenate once; growing a DataFrame with concat inside the loop re-copies
# all previous rows on every iteration.
thousand_iter = pd.concat(iteration_frames)
thousand_iter.to_csv(out_path, index=False)