# Stacked Machine Learning

In [1]:
## IMPORTANT!

# Before launching the notebook, set the number of CPU cores to use for the
# calculations; pick a value that fits the number of cores of this computer.
n_jobs = 50

### Load libraries

In [2]:
#libraries
import pandas as pd
import numpy as np
import os
import sys
import shutil
import glob
import joblib
import warnings
from datetime import date, datetime

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr
import scipy.stats as st

from nilearn import image as nli
from nilearn import plotting

from mne.viz import plot_connectivity_circle

### Load functions

In [3]:
def control_2(z, control, index):    #age+gender+race/ethn == 4
    """Regress the control variables out of a single variable and return the residuals.

    The variable is restricted to ``index``, subjects with a missing value are
    dropped, the remaining values are mean-centred, and an ordinary
    least-squares model on the mean-imputed, standardised control table is
    fit to them. Model and residuals are computed on the same subjects.

    Parameters
    ----------
    z : pandas.Series
        Target (or one feature column), indexed by subject.
    control : pandas.DataFrame
        Table of control variables, indexed by subject. Whatever columns it
        contains are used as regressors.
    index : array-like
        Subject labels to restrict the computation to.

    Returns
    -------
    y_res : numpy.ndarray
        1-D array of residuals (centred value minus fitted value).
    ind_y : numpy.ndarray
        Subject labels of ``y_res``: the entries of ``index`` whose value in
        ``z`` is not missing, in the order of ``index``.

    Raises
    ------
    ValueError
        If no subject in ``index`` has a non-missing value in ``z``.
    """
    # Shrink data to the requested subjects, then drop subjects without a value
    y = z.reindex(index=index).dropna()
    if y.empty:
        raise ValueError('control_2: no non-missing values left for the requested index')

    # Keep exactly the same subjects (same order) in the control table
    X = control.reindex(index=index).loc[y.index, :]
    ind_y = np.array(y.index)

    # Centre the target; the mean is computed once instead of once per element
    y = (y - y.mean()).values

    # Fill NaN in the controls with the column mean, then standardise them
    X = SimpleImputer(strategy='mean').fit_transform(X.values)
    X = StandardScaler().fit_transform(X)

    # Fit on these subjects and predict the same subjects
    y_pred = LinearRegression().fit(X, y).predict(X)

    y_res = y - y_pred

    return y_res, ind_y

In [4]:
def control_mov_feature(z, control, mov, index): #age+gender+race/ethn+each specific task movement == 5
    """Regress the control variables plus movement out of every column of a feature table.

    All inputs are restricted to ``index``. The movement series is appended to
    the control table as column ``'mov'``; the combined table is mean-imputed
    and standardised once and then used as the design matrix of a separate
    ordinary least-squares fit for each (mean-centred) feature column.

    Parameters
    ----------
    z : pandas.DataFrame
        Feature table, indexed by subject. Missing values in ``z`` are not
        imputed or dropped here.
    control : pandas.DataFrame
        Table of control variables, indexed by subject. It is not modified.
    mov : pandas.Series
        Movement values for the modality of ``z``, indexed by subject.
    index : array-like
        Subject labels to restrict the computation to.

    Returns
    -------
    pandas.DataFrame
        Residuals with the same columns as ``z`` and rows in the order of ``index``.
    """
    # Shrink data to the requested subjects
    z = z.reindex(index=index)
    ind = z.index

    # Controls + movement; reindex returns a new frame, so adding the column
    # does not touch the table owned by the caller
    cont = control.reindex(index=index)
    cont['mov'] = mov.reindex(index=index)

    # The design matrix is identical for every feature column, so impute and
    # standardise it once instead of once per column
    X = SimpleImputer(strategy='mean').fit_transform(cont.values)
    X = StandardScaler().fit_transform(X)

    dct = {}
    for col in z.columns:
        y = z[col]

        # Centre the feature; the mean is computed once instead of once per element
        y = (y - y.mean()).values

        # Fit on these subjects and predict the same subjects
        y_pred = LinearRegression().fit(X, y).predict(X)

        dct[col] = y - y_pred

    df_t = pd.DataFrame(dct, index=ind)

    return df_t

### Path to the tables folder

In [7]:
# Folders that hold the input CSV tables (retest sample and main S1200 sample).
# The defaults are the original locations; set the environment variables
# RETEST_TABLES_DIR / MAIN_TABLES_DIR to run the notebook on another machine.
# os.path.join(..., '') guarantees the trailing separator that the
# `path + 'file.csv'` concatenations in the following cells rely on.
path = os.path.join(os.environ.get('RETEST_TABLES_DIR', '/media/DataD800/Alina/retest_set/MLtables/'), '')
path_s1200 = os.path.join(os.environ.get('MAIN_TABLES_DIR', '/media/DataD800/Alina/main_set/MLtables/'), '')

### Load tables

In [8]:
# Silence library warnings while loading (note: this hides *all* warnings)
warnings.filterwarnings('ignore')

# Demography of the retest sample (first CSV column is used as the index)
demo_retest_test2 = pd.read_csv(path + 'demographics_table_new.csv', index_col=0)

# Targets table
targ_retest_test2 = pd.read_csv(path + 'cognition_table.csv', index_col=0)

# Feature tables as a dictionary, one table per modality.
# The insertion order is kept on purpose: later cells iterate over this dict.
features_retest_test2 = {
    key: pd.read_csv(path + file_name, index_col=0)
    for key, file_name in [
        ('emo', 'emo_table.csv'),
        ('gam', 'gam_table.csv'),
        ('lan', 'lan_table.csv'),
        ('mot', 'mot_table.csv'),
        ('rel', 'rel_table.csv'),
        ('soc', 'soc_table.csv'),
        ('wm', 'wm_table.csv'),
        ('cort', 'cort_table.csv'),
        ('subc', 'subc_table.csv'),
        ('surf', 'surf_table.csv'),
        ('rest', 'rest_table_featfiltered.csv'),
        ('VolBrain', 'VolBrain_table.csv'),
    ]
}

# Table with movements (mean relative displacement, Movement_RelativeRMS_mean.txt)
movements_retest_test2 = pd.read_csv(path + 'movement_table.csv', index_col=0)

# Control table with 2 controlling parameters: gender (integer-coded) and age.
# LabelEncoder expects a 1-D input, so the 'Gender' column is passed as a Series
# rather than as a one-column DataFrame.
sex_coded_retest_test2 = pd.Series(
    LabelEncoder().fit_transform(demo_retest_test2['Gender']),
    index=demo_retest_test2.index,
    name='Gender',
)
control_retest_test2 = pd.concat(
    [sex_coded_retest_test2, demo_retest_test2.loc[:, ['Age_in_Yrs']]],
    axis=1,
)

In [9]:
# Silence library warnings while loading (note: this hides *all* warnings)
warnings.filterwarnings('ignore')

# Demography of the main sample (first CSV column is used as the index)
demo_main = pd.read_csv(path_s1200 + 'demographics_table_new.csv', index_col=0)

# Targets table
targ_main = pd.read_csv(path_s1200 + 'cognition_table.csv', index_col=0)

# Feature tables as a dictionary, one table per modality.
# The insertion order is kept on purpose: later cells iterate over this dict.
features_main = {
    key: pd.read_csv(path_s1200 + file_name, index_col=0)
    for key, file_name in [
        ('emo', 'emo_table.csv'),
        ('gam', 'gam_table.csv'),
        ('lan', 'lan_table.csv'),
        ('mot', 'mot_table.csv'),
        ('rel', 'rel_table.csv'),
        ('soc', 'soc_table.csv'),
        ('wm', 'wm_table.csv'),
        ('cort', 'cort_table.csv'),
        ('subc', 'subc_table.csv'),
        ('surf', 'surf_table.csv'),
        ('rest', 'rest_table_featfiltered.csv'),
        ('VolBrain', 'VolBrain_table.csv'),
    ]
}

# Table with movements (mean relative displacement, Movement_RelativeRMS_mean.txt)
movements_main = pd.read_csv(path_s1200 + 'movement_table.csv', index_col=0)

# Control table with 2 controlling parameters: gender (integer-coded) and age.
# LabelEncoder expects a 1-D input, so the 'Gender' column is passed as a Series
# rather than as a one-column DataFrame.
sex_coded_main = pd.Series(
    LabelEncoder().fit_transform(demo_main['Gender']),
    index=demo_main.index,
    name='Gender',
)
control_main = pd.concat(
    [sex_coded_main, demo_main.loc[:, ['Age_in_Yrs']]],
    axis=1,
)

In [10]:
# Keep only the main-set subjects that have a 'CogTotalComp_Unadj' score
yy = targ_main.loc[:, 'CogTotalComp_Unadj'].dropna()

# Align the per-subject tables to exactly those subjects (same rows, same order)
demo_main, movements_main, control_main = (
    table.reindex(index=yy.index)
    for table in (demo_main, movements_main, control_main)
)

# Same alignment for every modality table, updating the dictionary in place
for key in list(features_main):
    features_main[key] = features_main[key].reindex(index=yy.index)

# The targets table is aligned last because `yy` was derived from it
targ_main = targ_main.reindex(index=yy.index)

In [11]:
# Keep only the retest subjects that have a 'CogTotalComp_Unadj' score
yy1 = targ_retest_test2.loc[:, 'CogTotalComp_Unadj'].dropna()

# Align the per-subject tables to exactly those subjects (same rows, same order)
demo_retest_test2, movements_retest_test2, control_retest_test2 = (
    table.reindex(index=yy1.index)
    for table in (demo_retest_test2, movements_retest_test2, control_retest_test2)
)

# Same alignment for every modality table, updating the dictionary in place
for key in list(features_retest_test2):
    features_retest_test2[key] = features_retest_test2[key].reindex(index=yy1.index)

# The targets table is aligned last because `yy1` was derived from it
targ_retest_test2 = targ_retest_test2.reindex(index=yy1.index)

In [12]:
# Split the main set into two parts:
#   * retest_test1 - the main-set rows of the subjects that are also in the retest sample
#   * train        - all remaining main-set subjects
retest_ids = demo_retest_test2.index

# Main-set data of the retest subjects (.loc raises if one of them is missing)
demo_retest_test1 = demo_main.loc[retest_ids, :]
targ_retest_test1 = targ_main.loc[retest_ids, :]
movements_retest_test1 = movements_main.loc[retest_ids, :]
control_retest_test1 = control_main.loc[retest_ids, :]
features_retest_test1 = {
    name: table.loc[retest_ids, :] for name, table in features_main.items()
}

# Everything else becomes the training set
demo_train = demo_main.drop(index=retest_ids)
targ_train = targ_main.drop(index=retest_ids)
movements_train = movements_main.drop(index=retest_ids)
control_train = control_main.drop(index=retest_ids)
features_train = {
    name: table.drop(index=retest_ids) for name, table in features_main.items()
}



In [13]:
# (rows, columns) of the training control table: one row per training subject,
# one column per control variable (Gender, Age_in_Yrs)
control_train.shape

(839, 2)

In [14]:
# (rows, columns) of the control table for the retest subjects taken from the main set
control_retest_test1.shape

(34, 2)

In [15]:
# (rows, columns) of the control table loaded from the retest folder;
# the row count should match control_retest_test1
control_retest_test2.shape

(34, 2)

##### Leave-P-groups out based on 8-Fold CV

In [16]:
# Silence library warnings for this long-running cell (note: this hides *all* warnings)
warnings.filterwarnings('ignore')


# Task modalities are controlled for gender + age + task-specific movement,
# the remaining modalities for gender + age only.
TASK_MODALITIES = ('emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm')
NONTASK_MODALITIES = ('cort', 'surf', 'subc', 'VolBrain', 'rest')


def _zscore_frame(frame):
    """Return a column-wise standardised copy of `frame` (scaler fit on `frame` itself)."""
    return pd.DataFrame(
        StandardScaler().fit_transform(frame),
        index=frame.index,
        columns=frame.columns,
    )


def _residualize_split(target, control, features, movements, index, tag, out_dir, pca=None):
    """Control one data split for the confounds, save all tables and project 'rest' with PCA.

    The confound regressions and the standardisation are fit on the split
    itself; only the PCA can be shared between splits through `pca`.

    Parameters
    ----------
    target : pandas.Series
        Target values of the split.
    control : pandas.DataFrame
        Control table (gender, age) of the split.
    features : dict of pandas.DataFrame
        Modality name -> feature table. Modalities that are in neither
        TASK_MODALITIES nor NONTASK_MODALITIES are skipped.
    movements : pandas.DataFrame
        Movement table with one column per task modality.
    index : array-like
        Subject labels of the split.
    tag : str
        Suffix used in the output file names (e.g. 'train1').
    out_dir : str
        Folder the CSV files are written to.
    pca : sklearn.decomposition.PCA, optional
        Already fitted PCA to apply to the standardised 'rest' residuals.
        If None, a new 75-component PCA is fit on this split.

    Returns
    -------
    tuple
        (y_res, features_res, res_rest, res_rest_st, rest_pca, pca)
    """
    # Control the target for gender + age
    resid, kept = control_2(target, control, index)
    y_res = pd.Series(resid, index=kept)

    # Control the modalities
    features_res = {}
    for key, table in features.items():
        if key in TASK_MODALITIES:
            # gender + age + movement of this task
            features_res[key] = control_mov_feature(table, control, movements[key], y_res.index)
        elif key in NONTASK_MODALITIES:
            # gender + age, one column at a time. Each column keeps the subject
            # labels returned for it, so the columns are aligned by label; a
            # subject missing in a column becomes NaN instead of shifting rows.
            columns = {}
            for name in table.columns:
                resid, kept = control_2(table[name], control, y_res.index)
                columns[name] = pd.Series(resid, index=kept)
            features_res[key] = pd.DataFrame(columns).reindex(index=y_res.index)

    # Save the residual tables, raw and standardised
    y_res.to_csv(out_dir + '/target_y_' + tag + '.csv', header=False)
    for key, table in features_res.items():
        table.to_csv(out_dir + '/' + str(key) + '_' + tag + '.csv')
        _zscore_frame(table).to_csv(out_dir + '/' + str(key) + '_' + tag + '_std.csv')

    # Resting-state residuals: standardise, then reduce with PCA
    res_rest = features_res['rest']
    res_rest_st = _zscore_frame(res_rest)
    if pca is None:
        pca = PCA(n_components=75, random_state=11)
        pca.fit(res_rest_st.values)
    rest_pca = pd.DataFrame(pca.transform(res_rest_st.values), index=res_rest.index)

    # Save the PCA scores, raw and standardised
    rest_pca.to_csv(out_dir + '/rest-pca75_' + tag + '.csv')
    _zscore_frame(rest_pca).to_csv(out_dir + '/rest-pca75_' + tag + '_std.csv')

    return y_res, features_res, res_rest, res_rest_st, rest_pca, pca


# Target to process
col = 'CogTotalComp_Unadj'
y_train = targ_train[col]
y_retest_test1 = targ_retest_test1[col]
y_retest_test2 = targ_retest_test2[col]

print(y_train.name)

# Output folder; exist_ok lets the cell be re-run (existing files are overwritten)
path_out = str(path+'output_'+str(y_train.name))
os.makedirs(path_out, exist_ok=True)

# Split the training subjects into two local index sets
index_train, index_test = train_test_split(demo_train.index, test_size=0.4, random_state=42)

# Local indices
index_train = np.array(sorted(index_train), dtype='int') #for training modalities models
index_test = np.array(sorted(index_test), dtype='int') #for testing modalities and training second level

index_retest_test = np.array(sorted(demo_retest_test2.index), dtype='int')

print(' ')
print('started to calculate ML')
print(datetime.now())
print(' ')


### 1st level: index_train (the PCA is fit here and reused by all later stages) ###
print('start 1st level ', datetime.now())

y_res1, features_res1, res_rest1, res_rest1_st, rest_pca1, pca = _residualize_split(
    y_train, control_train, features_train, movements_train,
    index_train, 'train1', path_out,
)


### 2nd level: index_test ###
print(' ')
print('start 2nd level ', datetime.now())
print('Checking single ML on test1 data ', datetime.now())

y_res2, features_res2, res_rest2, res_rest2_st, rest_pca2, pca = _residualize_split(
    y_train, control_train, features_train, movements_train,
    index_test, 'train2', path_out, pca=pca,
)


### 3rd level: retest1 (retest subjects, main-set data) ###
print(' ')
print('start 3rd level , retest1', datetime.now())
print('Checking single ML on retest1 data ', datetime.now())

(y_res_retest1, features_res_retest1, res_rest_retest1,
 res_rest_retest1_st, rest_pca_retest1, pca) = _residualize_split(
    y_retest_test1, control_retest_test1, features_retest_test1, movements_retest_test1,
    index_retest_test, 'test1', path_out, pca=pca,
)


### 3rd level: retest2 (retest subjects, retest data) ###
print(' ')
print('start 3rd level , retest2', datetime.now())
print('Checking single ML on retest2 data ', datetime.now())

# The "*3" names are kept for the retest2 results, which is what they held
# at the end of the original cell.
y_res3, features_res3, res_rest3, res_rest3_st, rest_pca3, pca = _residualize_split(
    y_retest_test2, control_retest_test2, features_retest_test2, movements_retest_test2,
    index_retest_test, 'test2', path_out, pca=pca,
)


print(' ')
print('finished to calculate')
print(datetime.now())




CogTotalComp_Unadj
 
started to calculate ML
2022-02-04 16:35:50.756374
 
start 1st level  2022-02-04 16:35:50.757056
 
start 2nd level  2022-02-04 16:37:51.379203
Checking single ML on test1 data  2022-02-04 16:37:51.379267
 
start 3rd level , retest1 2022-02-04 16:38:51.299840
Checking single ML on retest1 data  2022-02-04 16:38:51.299904
 
start 3rd level , retest2 2022-02-04 16:39:06.524629
Checking single ML on retest2 data  2022-02-04 16:39:06.525143
 
finished to calculate
2022-02-04 16:39:17.758126
