In [21]:
#!/usr/bin/env python
# coding: utf-8

# In[ ]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeClassifier
import numpy as np
import os
import sys
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import itertools
#import other python scripts for further analysis
import classification
#import results
import warnings
warnings.filterwarnings("ignore")
# Directory layout: inputs are read from dataDir, results are written under outDir.
thisDir = os.path.expanduser('~/Desktop/MSC_Alexis/analysis/')
dataDir = thisDir + 'data/mvpa_data/'
outDir = thisDir + 'output/results/'

# Tasks and subjects included in every analysis
taskList = ['glass', 'semantic', 'motor', 'mem']
subList = ['MSC01', 'MSC02', 'MSC03', 'MSC04', 'MSC05', 'MSC06', 'MSC07', 'MSC10']

# Ordered (train, test) pairs of two distinct subjects / two distinct tasks
subsComb = list(itertools.permutations(subList, 2))
tasksComb = list(itertools.permutations(taskList, 2))

# DS: ((train_sub, test_sub), task)
DSvars = [(sub_pair, task) for sub_pair in subsComb for task in taskList]
# SS: (sub, (train_task, test_task))
SSvars = [(sub, task_pair) for sub in subList for task_pair in tasksComb]
# BS: ((train_sub, test_sub), (train_task, test_task))
BSvars = [(sub_pair, task_pair) for sub_pair in subsComb for task_pair in tasksComb]
# CV: (sub, task)
CVvars = [(sub, task) for sub in subList for task in taskList]

"""
Each function below runs one type of analysis. DS--different subject, same task; SS--same subject, different task; BS--different subject, different task; CV--same subject, same task (cross-validated).
Each analysis collects its results across subjects into one dataframe and saves it as a CSV file.
"""
def classifyDS():
    """
    Classifying different subjects (DS) along the same task

    For every entry of ``DSvars`` (an ordered train/test subject pair plus a
    task) the classifier is trained on one subject and tested on the other
    using that same task. One row per combination is written to
    ``outDir + 'acc/DS/acc.csv'``.

    Parameters
    -------------
    None

    Returns
    -------------
    dfDS : DataFrame
        One row per combination with the columns train_sub, test_sub, task,
        acc, spec and sen (the three scores come from ``model``).

    """
    dfDS = pd.DataFrame(
        [(train_sub, test_sub, task) for (train_sub, test_sub), task in DSvars],
        columns=['train_sub', 'test_sub', 'task'])
    # model() returns (accuracy, sensitivity, specificity) for one combination
    results = [
        model('DS', train_sub=train_sub, test_sub=test_sub, train_task=task, test_task=task)
        for train_sub, test_sub, task in dfDS.itertuples(index=False, name=None)
    ]
    dfDS['acc'] = [acc for acc, _, _ in results]
    dfDS['spec'] = [spec for _, _, spec in results]
    dfDS['sen'] = [sen for _, sen, _ in results]
    dfDS.to_csv(outDir+'acc/DS/acc.csv', index=False)
    # Previously nothing was returned although the docstring documented a DataFrame
    return dfDS

def classifySS():
    """
    Classifying the same subject (SS) along a different task

    For every entry of ``SSvars`` (a subject plus an ordered train/test task
    pair) the classifier is trained on one task and tested on the other
    within the same subject. One row per combination is written to
    ``outDir + 'acc/SS/test_acc.csv'``.

    Parameters
    -------------
    None

    Returns
    -------------
    dfSS : DataFrame
        One row per combination with the columns train_task, test_task, sub,
        acc, spec and sen (the three scores come from ``model``).

    """
    dfSS = pd.DataFrame(
        [(train_task, test_task, sub) for sub, (train_task, test_task) in SSvars],
        columns=['train_task', 'test_task', 'sub'])
    # model() returns (accuracy, sensitivity, specificity) for one combination
    results = [
        model('SS', train_sub=sub, test_sub=sub, train_task=train_task, test_task=test_task)
        for train_task, test_task, sub in dfSS.itertuples(index=False, name=None)
    ]
    dfSS['acc'] = [acc for acc, _, _ in results]
    dfSS['spec'] = [spec for _, _, spec in results]
    dfSS['sen'] = [sen for _, sen, _ in results]
    #save accuracy
    dfSS.to_csv(outDir+'acc/SS/test_acc.csv', index=False)
    # Previously nothing was returned although the docstring documented a DataFrame
    return dfSS
def classifyBS():
    """
    Classifying different subjects (BS) along different tasks

    For every entry of ``BSvars`` (an ordered train/test subject pair plus an
    ordered train/test task pair) the classifier is trained on one
    subject/task and tested on the other subject/task. One row per
    combination is written to ``outDir + 'acc/BS/acc.csv'``.

    Parameters
    -------------
    None

    Returns
    -------------
    dfBS : DataFrame
        One row per combination with the columns train_task, test_task,
        train_sub, test_sub, acc, spec and sen (the three scores come from
        ``model``).

    """
    dfBS = pd.DataFrame(
        [(train_task, test_task, train_sub, test_sub)
         for (train_sub, test_sub), (train_task, test_task) in BSvars],
        columns=['train_task', 'test_task', 'train_sub', 'test_sub'])
    # model() returns (accuracy, sensitivity, specificity) for one combination
    results = [
        model('BS', train_sub=train_sub, test_sub=test_sub, train_task=train_task, test_task=test_task)
        for train_task, test_task, train_sub, test_sub in dfBS.itertuples(index=False, name=None)
    ]
    dfBS['acc'] = [acc for acc, _, _ in results]
    dfBS['spec'] = [spec for _, _, spec in results]
    dfBS['sen'] = [sen for _, sen, _ in results]
    #save accuracy
    dfBS.to_csv(outDir+'acc/BS/acc.csv', index=False)
    # Previously nothing was returned although the docstring documented a DataFrame
    return dfBS
def classifyCV():
    """
    Classifying same subjects (CV) along the same task

    For every (sub, task) entry of ``CVvars`` the task and rest matrices of
    that subject are loaded, combined into one labelled data set and scored
    with cross-validation, using as many folds as there are task samples.
    One row per combination is written to ``outDir + 'acc/CV/acc.csv'``.

    Parameters
    -------------
    None

    Returns
    -------------
    dfCV : DataFrame
        One row per combination with the columns sub, task, acc (mean
        cross-validation score), spec and sen.

    """
    dfCV = pd.DataFrame(CVvars, columns=['sub', 'task'])
    clf = RidgeClassifier()
    # The helpers live in classification.reshape; the bare name `reshape` is never
    # imported in this file, so using it raised a NameError on a fresh kernel.
    loadFC = classification.reshape.matFiles
    acc_scores_per_task = []
    CVspec = []
    CVsen = []
    for sub, task in dfCV.itertuples(index=False, name=None):
        taskFC = loadFC(dataDir+task+'/'+sub+'_parcel_corrmat.mat')
        restFC = loadFC(dataDir+'rest/'+sub+'_parcel_corrmat.mat')
        folds = taskFC.shape[0]
        # NOTE(review): concateFC is defined outside this file; presumably it stacks
        # task and rest samples and returns (features, labels) -- confirm.
        x_train, y_train = classification.reshape.concateFC(taskFC, restFC)
        CVscores = cross_val_score(clf, x_train, y_train, cv=folds)
        #Get sensitivity/specificity measures
        y_pred = cross_val_predict(clf, x_train, y_train, cv=folds)
        # ravel() of a 2x2 confusion matrix yields tn, fp, fn, tp (labels sorted ascending)
        tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
        CV_specificity = tn/(tn+fp)
        CV_sensitivity = tp/(tp+fn)
        #get accuracy
        acc_scores_per_task.append(CVscores.mean())
        CVspec.append(CV_specificity)
        CVsen.append(CV_sensitivity)
    #average acc per sub per tasks
    dfCV['acc'] = acc_scores_per_task
    dfCV['spec'] = CVspec
    dfCV['sen'] = CVsen
    dfCV.to_csv(outDir+'acc/CV/acc.csv', index=False)
    # Previously nothing was returned although the docstring documented a DataFrame
    return dfCV
def model(analysis, train_sub, test_sub, train_task, test_task, random_state=None):
    """
    Preparing machine learning model with appropriate data

    Loads the task and rest matrices needed for one train/test combination
    and hands them to ``CV_folds`` together with a fresh ``RidgeClassifier``.

    Parameters
    -------------
    analysis : string
            The type of analysis to be conducted (forwarded to ``CV_folds``)
    train_sub : str
            Subject name for training
    test_sub : str
            Subject name for testing
    train_task : str
            Task name for training
    test_task : str
            Task name for testing
    random_state : int, RandomState instance or None, optional
            Forwarded to ``train_test_split`` when the rest data of a single
            subject is divided into a training and a testing half. The
            default (None) keeps the previous, unseeded behaviour; pass an
            int to make the split reproducible.

    Returns
    -------------
    total_score : float
            Average accuracy of all folds
    total_sen : float
            Average sensitivity of all folds
    total_spec : float
            Average specificity of all folds

    """

    clf = RidgeClassifier()
    loadFC = classification.reshape.matFiles
    taskFC = loadFC(dataDir+train_task+'/'+train_sub+'_parcel_corrmat.mat')
    test_taskFC = loadFC(dataDir+test_task+'/'+test_sub+'_parcel_corrmat.mat')
    if train_sub == test_sub:
        # Same subject on both sides: take the time-split rest file and divide it
        # at random into two disjoint halves, one for training and one for testing.
        restFC = loadFC(dataDir+'rest/corrmats_timesplit/half/'+train_sub+'_parcel_corrmat.mat')
        restFC, test_restFC = train_test_split(restFC, test_size=.5, random_state=random_state)
    else:
        # Different subjects: each side uses its own subject's rest file.
        restFC = loadFC(dataDir+'rest/'+train_sub+'_parcel_corrmat.mat')
        test_restFC = loadFC(dataDir+'rest/'+test_sub+'_parcel_corrmat.mat')
    # CV_folds returns (total_score, total_sen, total_spec)
    return CV_folds(clf, analysis, taskFC, restFC, test_taskFC, test_restFC)

def CV_folds(clf, analysis, taskFC, restFC, test_taskFC, test_restFC):
    """
    Cross validation to train and test using nested loops

    Outer loop: leave-one-out over the rows of ``taskFC``; the classifier is
    fitted on the kept task rows (label 1) plus the rest rows at the same
    indices (label 0). Inner loop: leave-one-out over the rows of
    ``test_taskFC``; every held-out task row is paired with the rest row at
    the same index of ``test_restFC`` and the pair is scored. Scores are
    averaged over the inner folds and then over the outer folds.

    Because rest rows are picked with indices generated from the task
    arrays, ``restFC`` / ``test_restFC`` need at least as many rows as
    ``taskFC`` / ``test_taskFC``; additional rest rows are never used.

    Parameters
    -----------
    clf : obj
        Machine learning algorithm (must provide ``fit`` and ``predict``)
    analysis : str
        Analysis type. Currently not used by this function; kept so existing
        callers keep working.
    taskFC, restFC, test_taskFC, test_restFC : array_like
        Input arrays, training and testing set of task and rest FC
    Returns
    -----------
    total_score : float
        Average accuracy across folds
    total_sen : float
        Average sensitivity across folds
    total_spec : float
        Average specificity across folds
    """

    loo = LeaveOneOut()
    t = np.ones(taskFC.shape[0], dtype=int)
    r = np.zeros(restFC.shape[0], dtype=int)
    # Every inner test fold is one task sample followed by one rest sample
    y_te = np.array([1, 0])
    acc_score = []
    spec_score = []
    sen_score = []
    #fold each training set
    for train_index, _ in loo.split(taskFC):
        X_tr = np.concatenate((taskFC[train_index], restFC[train_index]))
        y_tr = np.concatenate((t[train_index], r[train_index]))
        clf.fit(X_tr, y_tr)
        acc_scores_per_fold = []
        #sensitivity and specificity per fold
        sen_per_fold = []
        spec_per_fold = []
        #fold each testing set
        for _, te_index in loo.split(test_taskFC):
            X_te = np.concatenate((test_taskFC[te_index], test_restFC[te_index]))
            y_pre = clf.predict(X_te)
            # y_te holds both classes, so the matrix is always 2x2: tn, fp, fn, tp
            tn, fp, fn, tp = confusion_matrix(y_te, y_pre).ravel()
            spec_per_fold.append(tn/(tn+fp))
            sen_per_fold.append(tp/(tp+fn))
            # Accuracy from the same counts; avoids a second predict() via clf.score
            acc_scores_per_fold.append((tn+tp)/(tn+fp+fn+tp))
        acc_score.append(np.mean(acc_scores_per_fold))
        spec_score.append(np.mean(spec_per_fold))
        sen_score.append(np.mean(sen_per_fold))
    total_score = np.mean(acc_score)
    total_sen = np.mean(sen_score)
    total_spec = np.mean(spec_score)
    return total_score, total_sen, total_spec

def classifyAll():
    """
    Classifying different subjects along available data rest split into 40 samples to match with task

    Runs ``modelAll`` for every ordered (train_sub, test_sub) pair of
    ``subsComb`` and writes one row per pair to
    ``outDir + 'acc/ALL/precision_acc.csv'``.

    Parameters
    -------------
    None

    Returns
    -------------
    df : DataFrame
        One row per subject pair with the columns train_sub, test_sub,
        cv_acc, cv_sen, cv_spec (within the training subject) and acc,
        ds_sen, ds_spec (on the test subject).

    """
    df = pd.DataFrame(subsComb, columns=['train_sub', 'test_sub'])
    # modelAll returns (diff_score, same_score, CV_sens, CV_spec, DS_sens, DS_spec)
    results = [
        modelAll(train_sub=train_sub, test_sub=test_sub)
        for train_sub, test_sub in df.itertuples(index=False, name=None)
    ]
    # (position in the modelAll result, output column) in the CSV's column order
    layout = ((1, 'cv_acc'), (2, 'cv_sen'), (3, 'cv_spec'),
              (0, 'acc'), (4, 'ds_sen'), (5, 'ds_spec'))
    for position, column in layout:
        df[column] = [result[position] for result in results]
    df.to_csv(outDir+'acc/ALL/precision_acc.csv', index=False)
    # Previously nothing was returned although the docstring documented a DataFrame
    return df
    
def modelAll(train_sub, test_sub):
    """
    Preparing machine learning model with appropriate data

    Loads the mem, semantic, glass and motor matrices of both subjects,
    stacks them (in that order) into one task array per subject, loads the
    'fourths' time-split rest matrices of both subjects and passes
    everything to ``K_folds`` with a fresh ``RidgeClassifier``.

    Parameters
    -------------
    train_sub : str
            Subject name for training
    test_sub : str
            Subject name for testing

    Returns
    -------------
    diff_score : float
            Average accuracy on the test subject
    same_score : float
            Average cross-validation accuracy within the training subject
    CV_sens_score, CV_spec_score : float
            Average sensitivity / specificity within the training subject
    DS_sens_score, DS_spec_score : float
            Average sensitivity / specificity on the test subject

    """

    clf = RidgeClassifier()
    # The loader lives in classification.reshape; the bare name `reshape` is never
    # imported in this file, so using it raised a NameError on a fresh kernel.
    loadFC = classification.reshape.matFiles
    task_folders = ('mem/', 'semantic/', 'glass/', 'motor/')
    rest_folder = 'rest/corrmats_timesplit/fourths/'
    #train sub
    taskFC = np.concatenate(
        [loadFC(dataDir+folder+train_sub+'_parcel_corrmat.mat') for folder in task_folders])
    restFC = loadFC(dataDir+rest_folder+train_sub+'_parcel_corrmat.mat')
    #test sub
    test_taskFC = np.concatenate(
        [loadFC(dataDir+folder+test_sub+'_parcel_corrmat.mat') for folder in task_folders])
    test_restFC = loadFC(dataDir+rest_folder+test_sub+'_parcel_corrmat.mat')
    # K_folds returns the six scores in exactly the order documented above
    return K_folds(train_sub, clf, taskFC, restFC, test_taskFC, test_restFC)

def K_folds(train_sub, clf, taskFC, restFC, test_taskFC, test_restFC):
    """
    Cross validation to train and test using nested loops

    Outer loop: 5-fold split over the rows of ``taskFC``. The classifier is
    fitted on the training rows of task (label 1) and rest (label 0) and
    scored on the held-out rows of the same arrays. Inner loop: 5-fold split
    over the rows of ``test_taskFC``; the fitted classifier is scored on each
    held-out chunk of ``test_taskFC`` / ``test_restFC``. Rest rows are always
    picked with the indices generated from the matching task array.

    Parameters
    -----------
    train_sub : str
        Training subject name (not used inside this function)
    clf : obj
        Machine learning algorithm
    taskFC, restFC, test_taskFC, test_restFC : array_like
        Input arrays, training and testing set of task and rest FC
    Returns
    -----------
    diff_sub_score : float
        Average accuracy on the test arrays
    same_sub_score : float
        Average accuracy on the held-out training rows
    CV_sens_score, CV_spec_score : float
        Average sensitivity / specificity on the held-out training rows
    DS_sens_score, DS_spec_score : float
        Average sensitivity / specificity on the test arrays
    """

    splitter = KFold(n_splits=5)
    task_labels = np.ones(taskFC.shape[0], dtype=int)
    rest_labels = np.zeros(restFC.shape[0], dtype=int)
    test_task_labels = np.ones(test_taskFC.shape[0], dtype=int)
    test_rest_labels = np.zeros(test_restFC.shape[0], dtype=int)

    def _average(values):
        # Average through pandas, like the per-column means used before
        return pd.Series(values).mean()

    def _evaluate(features, labels):
        # Score the currently fitted classifier: (accuracy, sensitivity, specificity)
        predicted = clf.predict(features)
        tn, fp, fn, tp = confusion_matrix(labels, predicted).ravel()
        specificity = tn/(tn+fp)
        sensitivity = tp/(tp+fn)
        accuracy = clf.score(features, labels)
        return accuracy, sensitivity, specificity

    cv_acc, cv_sen, cv_spec = [], [], []
    ds_acc, ds_sen, ds_spec = [], [], []
    #fold each training set
    for fit_idx, val_idx in splitter.split(taskFC):
        X_fit = np.concatenate((taskFC[fit_idx], restFC[fit_idx]))
        y_fit = np.concatenate((task_labels[fit_idx], rest_labels[fit_idx]))
        X_val = np.concatenate((taskFC[val_idx], restFC[val_idx]))
        y_val = np.concatenate((task_labels[val_idx], rest_labels[val_idx]))
        clf.fit(X_fit, y_fit)
        #cross validation on the held-out rows of the training arrays
        val_acc, val_sen, val_spec = _evaluate(X_val, y_val)
        cv_acc.append(val_acc)
        cv_spec.append(val_spec)
        cv_sen.append(val_sen)
        #fold each testing set
        fold_acc, fold_sen, fold_spec = [], [], []
        for _, chunk_idx in splitter.split(test_taskFC):
            X_chunk = np.concatenate((test_taskFC[chunk_idx], test_restFC[chunk_idx]))
            y_chunk = np.concatenate((test_task_labels[chunk_idx], test_rest_labels[chunk_idx]))
            chunk_acc, chunk_sen, chunk_spec = _evaluate(X_chunk, y_chunk)
            fold_acc.append(chunk_acc)
            fold_sen.append(chunk_sen)
            fold_spec.append(chunk_spec)
        ds_acc.append(_average(fold_acc))
        ds_spec.append(_average(fold_spec))
        ds_sen.append(_average(fold_sen))
    same_sub_score = _average(cv_acc)
    diff_sub_score = _average(ds_acc)
    CV_sens_score = _average(cv_sen)
    CV_spec_score = _average(cv_spec)
    DS_sens_score = _average(ds_sen)
    DS_spec_score = _average(ds_spec)
    return diff_sub_score, same_sub_score, CV_sens_score, CV_spec_score, DS_sens_score, DS_spec_score


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeClassifier
import numpy as np
import os
import sys
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import itertools
#import other python scripts for further analysis
import classification
#import results
import warnings
warnings.filterwarnings("ignore")
# Initialization of directory information:
# (same paths as in the first cell; inputs under dataDir, results under outDir)
thisDir = os.path.expanduser('~/Desktop/MSC_Alexis/analysis/')
dataDir = thisDir + 'data/mvpa_data/'
outDir = thisDir + 'output/results/'
# Manual check of the rest split used for same-subject analyses: load MSC01's
# time-split ('half') rest matrices ...
restFC=classification.reshape.matFiles(dataDir+'rest/corrmats_timesplit/half/MSC01_parcel_corrmat.mat')   
# ... and divide them 50/50 into a training and a testing part. No random_state is
# given, so the rows in each part differ from run to run.
restFC, test_restFC=train_test_split(restFC, test_size=.5)

In [18]:
# Inspect the dimensions of the held-out part of the rest data
test_restFC.shape

(10, 55278)

In [22]:
# Run the same-subject / different-task analysis; results are written to
# outDir + 'acc/SS/test_acc.csv'
classifySS()