# Stacked Machine Learning

In [7]:
## IMPORTANT !

# Set the number of parallel workers before running the notebook; choose a
# value that fits the number of CPU cores available on this machine.
# The value is passed to GridSearchCV(n_jobs=...) inside elnet().
n_jobs= 30

### Load libraries

In [8]:
#libraries
import pandas as pd
import numpy as np
import os
import sys
import shutil
import glob
import joblib
import warnings
from datetime import date, datetime

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr
import scipy.stats as st

from nilearn import image as nli
from nilearn import plotting

#from mne.viz import plot_connectivity_circle

### Load functions

In [9]:
def control_features(table_in, control, index):
    """Fit the confound adjustment and return the adjusted (residual) table.

    Every column of ``table_in`` is z-scored and linearly regressed on the
    z-scored ``control`` variables; the residuals of that regression are
    returned together with all fitted models, so the same adjustment can be
    re-applied to other data with ``re_control_features``.

    Parameters
    ----------
    table_in : pd.DataFrame or pd.Series
        Rows are subjects, columns are features. A Series (e.g. the target)
        is treated as a single column stored under the key ``'0'``.
    control : pd.DataFrame or pd.Series
        Variables to regress out (e.g. age, sex and/or movement).
    index : index-like
        Subjects to use; both tables are reindexed to it.

    Returns
    -------
    df_table : pd.DataFrame
        Residuals, one column per input column, indexed by the reindexed
        subjects.
    dct_std_y_models : dict
        Column name -> StandardScaler fitted on that column.
    std_model : StandardScaler
        Scaler fitted on the control variables.
    dct_lin_models : dict
        Column name -> LinearRegression fitted on (scaled control, scaled
        column).
    """
    # shrink data to the requested (local train) index
    table_in = table_in.reindex(index=index)
    control = control.reindex(index=index)

    # a Series is handled as a one-column table named '0'
    if table_in.values.ndim == 1:
        columns = {'0': table_in}
    else:
        columns = {col: table_in[col] for col in table_in.columns}

    # confound matrix, always 2-D
    X = control.values
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # X is the same for every column, so it is standardized once, up front;
    # this also guarantees std_model exists when table_in has no columns
    std_model = StandardScaler()
    X = std_model.fit_transform(X)

    dct_table = {}
    dct_lin_models = {}
    dct_std_y_models = {}

    for col, column_values in columns.items():
        # standardize the column that is being adjusted
        std_model_y = StandardScaler()
        y = std_model_y.fit_transform(column_values.values.reshape(-1, 1)).ravel()

        # regress the confounds out and keep the residuals
        model = LinearRegression()
        model.fit(X, y)

        dct_table[col] = y - model.predict(X)
        dct_lin_models[col] = model
        dct_std_y_models[col] = std_model_y

    df_table = pd.DataFrame(dct_table, index=table_in.index)

    return df_table, dct_std_y_models, std_model, dct_lin_models

In [10]:
def re_control_features(table_in, control, index, dct_std_y_models, std_model, dct_lin_models):
    """Apply an already fitted confound adjustment to new data.

    Counterpart of ``control_features``: nothing is fitted here. Every column
    of ``table_in`` is scaled with its stored scaler, the stored regression
    predicts it from the scaled ``control`` variables, and the residuals are
    returned.

    Parameters
    ----------
    table_in : pd.DataFrame or pd.Series
        Rows are subjects, columns are features. A Series is treated as a
        single column looked up under the key ``'0'``.
    control : pd.DataFrame or pd.Series
        Variables to regress out.
    index : index-like
        Subjects to use; both tables are reindexed to it.
    dct_std_y_models : dict
        Column name -> object with ``transform`` used to scale that column.
    std_model : object with ``transform``
        Scaler applied to the control variables.
    dct_lin_models : dict
        Column name -> object with ``predict`` applied to the scaled controls.

    Returns
    -------
    pd.DataFrame
        Residuals, one column per input column, indexed by the reindexed
        subjects.
    """
    # shrink data to the requested index
    table_in = table_in.reindex(index=index)
    control = control.reindex(index=index)

    # a Series is handled as a one-column table named '0'
    if table_in.values.ndim == 1:
        columns = {'0': table_in}
    else:
        columns = {col: table_in[col] for col in table_in.columns}

    # confound matrix, always 2-D
    X = control.values
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # the scaled confounds do not depend on the column, so transform them once
    X = std_model.transform(X)

    dct_table = {}
    for col, column_values in columns.items():
        # scale with the stored scaler, then subtract the stored model's prediction
        y = dct_std_y_models[col].transform(column_values.values.reshape(-1, 1))
        y = y.reshape(-1, 1).ravel()
        dct_table[col] = y - dct_lin_models[col].predict(X)

    return pd.DataFrame(dct_table, index=table_in.index)

In [11]:
def elnet(X, y):
    """Grid-search an ElasticNet on (X, y) and report how it fits that data.

    Subjects with a missing target are dropped from both ``y`` and ``X``.
    The search uses 5-fold cross-validation over 70 ``alpha`` values and
    25 ``l1_ratio`` values and runs with the module-level ``n_jobs`` workers.

    Parameters
    ----------
    X : pd.DataFrame
        Features, rows are subjects.
    y : pd.Series or pd.DataFrame
        Target, indexed by subject.

    Returns
    -------
    tuple
        ``(best alpha, best l1_ratio, best cross-validated score, mse, corr,
        best estimator, y_pred, mae)``. ``mse``, ``mae``, ``corr`` and
        ``y_pred`` are computed on the same data the search was fitted on.
    """
    # keep only the subjects that have an observed target
    y_real = y.dropna()
    features = X.loc[y_real.index, :].values
    target = y_real.values.reshape(-1, 1).ravel()

    # a one-step pipeline, so the grid keys carry the 'elasticnet__' prefix
    pipeline = Pipeline([('elasticnet', ElasticNet(random_state=42))])

    # hyperparameter space
    parameters = {
        'elasticnet__alpha': np.logspace(-1, 2, 70),
        'elasticnet__l1_ratio': np.linspace(0, 1, 25),
    }

    gm_cv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=n_jobs)
    gm_cv.fit(features, target)

    # in-sample prediction of the refitted best model
    y_pred = gm_cv.predict(features)

    best = gm_cv.best_params_
    mse = mean_squared_error(y_real, y_pred)
    mae = mean_absolute_error(y_real, y_pred)
    corr, _ = pearsonr(np.array(target, dtype=float), np.array(y_pred, dtype=float))

    return (best['elasticnet__alpha'], best['elasticnet__l1_ratio'], gm_cv.best_score_,
            mse, corr, gm_cv.best_estimator_, y_pred, mae)

In [12]:
def reaply_ElNet(X, y, model):
    """Apply a fitted model to (X, y) and score its predictions.

    Subjects with a missing target are dropped; ``X`` is reindexed to the
    remaining subjects before prediction.

    Parameters
    ----------
    X : pd.DataFrame
        Features, rows are subjects.
    y : pd.Series or pd.DataFrame
        Observed target, indexed by subject.
    model : fitted estimator
        Must provide ``predict`` and ``score``.

    Returns
    -------
    tuple
        ``(y_pred, y_real, ind_y, bacc, mse, corr, mae)`` where ``y_real`` is
        the target without missing values, ``ind_y`` its index as an array and
        ``bacc`` the value of ``model.score``.
    """
    # keep only the subjects that have an observed target
    y_real = y.dropna()
    ind_y = np.array(y_real.index)  # subject ids kept as a separate array

    features = X.reindex(index=y_real.index).values
    target = y_real.values.reshape(-1, 1).ravel()

    # predict, then score with the model's own metric
    y_pred = model.predict(features)
    bacc = model.score(features, target)

    mse = mean_squared_error(y_real, y_pred)
    mae = mean_absolute_error(y_real, y_pred)
    corr, _ = pearsonr(np.array(target, dtype=float), np.array(y_pred, dtype=float))

    return y_pred, y_real, ind_y, bacc, mse, corr, mae

### Path to the tables folder

In [13]:
# Folder holding the tables loaded as the *_retest_test2 data; the output
# folder of the run is also created inside it.
path='/media/hcs-psy-narun/Alina/HCP_YA/retest/MLtables_cope/'
# Folder holding the tables loaded as the *_main data.
path_s1200 = '/media/hcs-psy-narun/Alina/HCP_YA/MLtables_cope/'

### Load tables

In [14]:
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

#targets table: one cognition score per subject, subjects without a score are dropped
targ_retest_test2 = pd.read_csv(path+'cognition_table.csv', index_col=0)['CogTotalComp_Unadj'].dropna()

#demography, aligned to the subjects that have a target
demo_retest_test2 = pd.read_csv(path+'demographics_table.csv', index_col=0).reindex(index=targ_retest_test2.index)

#modality key -> csv file name inside `path`
feature_files_retest_test2 = {
    'emo': 'emo_table.csv',
    'gam': 'gam_table.csv',
    'lan': 'lan_table.csv',
    'mot': 'mot_table.csv',
    'rel': 'rel_table.csv',
    'soc': 'soc_table.csv',
    'wm': 'wm_table.csv',

    'gam_FC': 'Task_FC_GAMBLING_group_z.csv',
    'lan_FC': 'Task_FC_LANGUAGE_group_z.csv',
    'mot_FC': 'Task_FC_MOTOR_group_z.csv',
    'rel_FC': 'Task_FC_RELATIONAL_group_z.csv',
    'soc_FC': 'Task_FC_SOCIAL_group_z.csv',
    'wm_FC': 'Task_FC_WM_group_z.csv',

    'cort': 'cort_table.csv',
    'subc': 'subc_table.csv',
    'surf': 'surf_table.csv',
    'rest': 'Rest_FC_retest_group_z_full.csv',
    'VolBrain': 'VolBrain_table.csv',
}

#features tables as dictionary; each table is aligned to the target subjects
#right after loading, so the unaligned tables are not all held at once
features_retest_test2 = {}
for key, file_name in feature_files_retest_test2.items():
    features_retest_test2[key] = pd.read_csv(path+file_name, index_col=0).reindex(index=targ_retest_test2.index)

#table with movements (mean relative displacement Movement_RelativeRMS_mean.txt)
movements_retest_test2 = pd.read_csv(path+'movement_table.csv', index_col=0).reindex(index=targ_retest_test2.index)

#create tables with 2 controlling parameters: gender and age
#LabelEncoder expects 1-D labels, so the Gender column is passed as a Series
#(a one-column DataFrame only works through an implicit, warned-about ravel).
#NOTE(review): this encoder is fitted on this sample alone; the integer codes
#match those of another sample only if the same Gender categories occur in both.
sex_coded_retest_test2 = pd.Series(LabelEncoder().fit_transform(demo_retest_test2['Gender']),
                                   index=demo_retest_test2.index, name='Gender')
control_retest_test2 = pd.concat([sex_coded_retest_test2, demo_retest_test2.loc[:, ['Age_in_Yrs']]],
                                 axis=1).reindex(index=targ_retest_test2.index)


In [15]:
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

#targets table: one cognition score per subject, subjects without a score are dropped
targ_main = pd.read_csv(path_s1200+'cognition_table.csv', index_col=0)['CogTotalComp_Unadj'].dropna()

#demography, aligned to the subjects that have a target
demo_main = pd.read_csv(path_s1200+'demographics_table.csv', index_col=0).reindex(index=targ_main.index)

#modality key -> csv file name inside `path_s1200`
feature_files_main = {
    'emo': 'emo_table.csv',
    'gam': 'gam_table.csv',
    'lan': 'lan_table.csv',
    'mot': 'mot_table.csv',
    'rel': 'rel_table.csv',
    'soc': 'soc_table.csv',
    'wm': 'wm_table.csv',

    'gam_FC': 'Task_FC_GAMBLING_group_z_full.csv',
    'lan_FC': 'Task_FC_LANGUAGE_group_z_full.csv',
    'mot_FC': 'Task_FC_MOTOR_group_z_full.csv',
    'rel_FC': 'Task_FC_RELATIONAL_group_z_full.csv',
    'soc_FC': 'Task_FC_SOCIAL_group_z_full.csv',
    'wm_FC': 'Task_FC_WM_group_z_full.csv',

    'cort': 'cort_table.csv',
    'subc': 'subc_table.csv',
    'surf': 'surf_table.csv',
    'rest': 'Rest_FC_group_z_full.csv',
    'VolBrain': 'VolBrain_table.csv',
}

#features tables as dictionary; each table is aligned to the target subjects
#right after loading, so the unaligned tables are not all held at once
features_main = {}
for key, file_name in feature_files_main.items():
    features_main[key] = pd.read_csv(path_s1200+file_name, index_col=0).reindex(index=targ_main.index)

#table with movements (mean relative displacement Movement_RelativeRMS_mean.txt)
#NOTE(review): unlike the other tables this one is not reindexed to
#targ_main.index here - confirm that this is intended.
movements_main = pd.read_csv(path_s1200+'movement_table.csv', index_col=0)

#create tables with 2 controlling parameters: gender and age
#LabelEncoder expects 1-D labels, so the Gender column is passed as a Series
#(a one-column DataFrame only works through an implicit, warned-about ravel).
sex_coded_main = pd.Series(LabelEncoder().fit_transform(demo_main['Gender']),
                           index=demo_main.index, name='Gender')
control_main = pd.concat([sex_coded_main, demo_main.loc[:, ['Age_in_Yrs']]],
                         axis=1).reindex(index=targ_main.index)

In [16]:
#split the main sample: subjects that also appear in the retest tables form
#retest_test1, all remaining subjects form the training set
retest_subjects = demo_retest_test2.index

#retest_test1: rows of the main tables for the retest subjects
demo_retest_test1 = demo_main.loc[retest_subjects, :]
targ_retest_test1 = targ_main.loc[retest_subjects]
movements_retest_test1 = movements_main.loc[retest_subjects, :]
control_retest_test1 = control_main.loc[retest_subjects, :]
features_retest_test1 = {key: table.loc[retest_subjects, :] for key, table in features_main.items()}

#train: the main tables without the retest subjects
demo_train = demo_main.drop(retest_subjects, axis=0)
targ_train = targ_main.drop(retest_subjects, axis=0)
movements_train = movements_main.drop(retest_subjects, axis=0)
control_train = control_main.drop(retest_subjects, axis=0)
features_train = {key: table.drop(retest_subjects, axis=0) for key, table in features_main.items()}

In [17]:
targ_train.shape

(839,)

In [18]:
targ_retest_test1.shape

(34,)

In [19]:
targ_retest_test2.shape

(34,)

##### Leave-P-groups out based on N-Fold CV

In [None]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

#silence convergence warnings raised during the grid search
simplefilter("ignore", category=ConvergenceWarning)

#when no warning options were given on the command line, silence everything,
#including in processes that inherit the environment
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

#targets of the three samples (each is already a single dropna'd column)
col = 'CogTotalComp_Unadj'
y_train = targ_train
y_retest_test1 = targ_retest_test1
y_retest_test2 = targ_retest_test2

print(y_train.name)

#output folder of this run; os.mkdir raises if it already exists
path_out = str(path+'output_retest_new_enh_newADJ_new_sets_newRest_'+str(y_train.name))
os.mkdir(path_out)

#all training subjects are used (no inner train/test split of the training sample)
index_train = y_train.index

#retest subjects that have a target at both visits, sorted
subjects_visit1 = set(y_retest_test1.dropna().index)
subjects_visit2 = set(y_retest_test2.dropna().index)
index_retest_test = np.array(sorted(subjects_visit1.intersection(subjects_visit2)))

print(' ')
print('started to calculate ML')
print(datetime.now())
print(' ')


### 1st level ################################################################################

#### Fit the confound adjustment on index_train ##############################################

print('start 1st level ', datetime.now())

#adjust y (target) for the control variables
y_res1, std_targ_y, std_targ_X, linreg_targ = control_features(y_train, control_train, index_train)

#adjust every modality; the task tables additionally get their own movement column as a confound
task_keys = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm']
features_res1, std_feat_y_dct, std_feat_X_dct, linreg_feat_dct = {}, {}, {}, {}
for key in features_train.keys():
    print('controlling ', key, datetime.now())

    if key in task_keys:
        confounds = pd.concat([control_train, movements_train[key]], axis=1)
    else:
        confounds = control_train

    (features_res1[key],
     std_feat_y_dct[key],
     std_feat_X_dct[key],
     linreg_feat_dct[key]) = control_features(features_train[key], confounds, y_res1.index)

#save the adjustment models (target models first, then the per-modality dictionaries)
adjustment_dir = path_out+'/adjustment_models'
os.mkdir(adjustment_dir)
for fitted, file_name in [(std_targ_y, '/target_std_model_y.sav'),
                          (std_targ_X, '/target_std_model_X.sav'),
                          (linreg_targ, '/target_linreg.sav'),
                          (std_feat_y_dct, '/features_std_model_y.sav'),
                          (std_feat_X_dct, '/features_std_model_X.sav'),
                          (linreg_feat_dct, '/features_linreg.sav')]:
    joblib.dump(fitted, adjustment_dir+file_name)


###standardize the adjusted tables before modelling and keep the fitted scalers
#features
std_models_features = {}
for key in features_res1.keys():
    print('standartize ', key, datetime.now())
    adjusted = features_res1[key]
    scaler = StandardScaler()
    features_res1[key] = pd.DataFrame(scaler.fit_transform(adjusted.values),
                                      index=adjusted.index,
                                      columns=adjusted.columns)
    std_models_features[key] = scaler

#target
std_model_target = StandardScaler()
y_res1 = pd.DataFrame(std_model_target.fit_transform(y_res1.values.reshape(-1, 1)),
                      index=y_res1.index)

#save the scalers
standardization_dir = path_out+'/standartization_models'
os.mkdir(standardization_dir)
joblib.dump(std_model_target, standardization_dir+'/target_std_model.sav')
joblib.dump(std_models_features, standardization_dir+'/features_std_model.sav')

#save the target and the feature tables as they are before PCA
y_res1.to_csv(path_out+'/target_y_train1.csv')
for key, standardized in features_res1.items():
    standardized.to_csv(path_out+'/'+str(key)+'_train1.csv')


#modalities that are reduced with PCA (rest and task FC)
fc_keys = ['rest', 'gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC']

#fit one 75-component PCA per FC modality and replace the table by its component scores
PCA_models = {}
for key in fc_keys:
    print('reduction ', key, datetime.now())
    full_table = features_res1[key]
    model_PCA = PCA(n_components=75, random_state=11)
    #fit and transform stay two separate calls, exactly as the scores are produced for later data
    model_PCA.fit(full_table.values)
    features_res1[key] = pd.DataFrame(model_PCA.transform(full_table.values),
                                      index=full_table.index)
    PCA_models[key] = model_PCA

#save PCA models
pca_dir = path_out+'/PCA_models'
os.mkdir(pca_dir)
joblib.dump(PCA_models, pca_dir+'/PCA_model.sav')

#standardize the component scores again and keep those scalers as well
std_PC_feature_models = {}
for key in fc_keys:
    print('standartize PC table ', key, datetime.now())
    scores = features_res1[key]
    std_PC_model = StandardScaler()
    features_res1[key] = pd.DataFrame(std_PC_model.fit_transform(scores.values),
                                      index=scores.index,
                                      columns=scores.columns)
    std_PC_feature_models[key] = std_PC_model
    #save the standardized component table
    features_res1[key].to_csv(path_out+'/'+key+'_PCA75_train1.csv')

#save the scalers of the component scores
pca_std_dir = path_out+'/PCA_standardization_models'
os.mkdir(pca_std_dir)
joblib.dump(std_PC_feature_models, pca_std_dir+'/std_PCA_model.sav')








#Fit one ElasticNet per modality on index_train (1st level)

dict_tasks = {}
dict_elnet_model = {}
dict_ypred1 = {}

for key, modality_table in list(features_res1.items()):

    print('start ', str(key), datetime.now())   #start time of this modality

    bpar1, bpar2, acc, mse, corr, model, y_pred1, mae = elnet(modality_table, y_res1)
    dict_tasks[key] = (acc, mse, mae, corr, bpar1, bpar2)
    dict_elnet_model[key] = model
    dict_ypred1[key] = y_pred1

performance_rows = ['best score r2', 'mse', 'mae','corr', 'best alpha', 'best l1_ratio']
df_tasks = pd.DataFrame(dict_tasks, index=performance_rows)
df_y_pred1 = pd.DataFrame(dict_ypred1, index=y_res1.index)


###Save outputs of this step

#fitted models, one file per modality
for key, fitted_model in dict_elnet_model.items():
    joblib.dump(fitted_model, (path_out+'/'+str(key)+'_elnet_model.sav'))

#performance of every modality
df_tasks.to_csv(path_out+'/1level_train_perf_elnet.csv')

#first-level predictions, one column per modality
df_y_pred1.to_csv(path_out+'/1level_train_y_pred_singleML.csv')






### 2nd level ################################################################################
print(' ')
print('start 2nd level ', datetime.now())

#### L2 Re-applying the fitted preprocessing to index_train ##################################

print('Checking single ML on train data ', datetime.now())

#adjust y (target) with the stored target models
y_res2 = re_control_features(y_train, control_train, index_train,
                             std_targ_y, std_targ_X, linreg_targ)

#adjust the modalities with the stored per-modality models
task_keys = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm']
features_res2 = {}
for key in features_train.keys():
    print('controlling ', key, datetime.now())
    if key in task_keys:
        #task tables also have their movement column regressed out
        confounds = pd.concat([control_train, movements_train[key]], axis=1)
    else:
        confounds = control_train
    features_res2[key] = re_control_features(features_train[key], confounds, y_res2.index,
                                             std_feat_y_dct[key], std_feat_X_dct[key], linreg_feat_dct[key])

###standardize with the scalers fitted at the 1st level
#features
for key in features_res2.keys():
    print('standartize ', key, datetime.now())
    adjusted = features_res2[key]
    features_res2[key] = pd.DataFrame(std_models_features[key].transform(adjusted.values),
                                      index=adjusted.index,
                                      columns=adjusted.columns)
#target
y_res2 = pd.DataFrame(std_model_target.transform(y_res2.values.reshape(-1, 1)),
                      index=y_res2.index)

#save the target and the feature tables as they are before PCA
y_res2.to_csv(path_out+'/target_y_train2.csv')
for key, standardized in features_res2.items():
    standardized.to_csv(path_out+'/'+str(key)+'_train2.csv')

#modalities that are reduced with PCA (rest and task FC)
fc_keys = ['rest', 'gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC']

#project onto the stored PCA components
for key in fc_keys:
    print('reduction ', key, datetime.now())
    full_table = features_res2[key]
    features_res2[key] = pd.DataFrame(PCA_models[key].transform(full_table.values),
                                      index=full_table.index)

#standardize the component scores with the stored scalers
for key in fc_keys:
    print('standartize PCA ', key, datetime.now())
    scores = features_res2[key]
    features_res2[key] = pd.DataFrame(std_PC_feature_models[key].transform(scores.values),
                                      index=scores.index,
                                      columns=scores.columns)
    #save the standardized component table
    features_res2[key].to_csv(path_out+'/'+key+'_PCA75_train2.csv')








#apply the fitted single-modality ElasticNet models to the 2nd-level tables

dict_y_pred2 = {}
dict_y_pred2_per = {}
for key, modality_table in list(features_res2.items()):
    y_pred, y_real, ind_y, bacc, mse, corr, mae = reaply_ElNet(modality_table, y_res2, dict_elnet_model[key])
    dict_y_pred2[key] = y_pred
    dict_y_pred2_per[key] = (bacc, mse, mae, corr)

#ind_y comes from the last call; every call receives the same y_res2
df_y_pred2 = pd.DataFrame(dict_y_pred2, index=ind_y)
df_y_pred2_per = pd.DataFrame(dict_y_pred2_per, index=['best score r2', 'mse', 'mae','corr'])


###Save outputs of this step

#performance of every modality
df_y_pred2_per.to_csv(path_out+'/2level_test1_perf_elnet.csv')

#predictions, one column per modality
df_y_pred2.to_csv(path_out+'/2level_test1_y_pred_singleML.csv')



#### L2 Fitting the stacked ML models on the single-model predictions ########################

print('Calculating stacked ML on train data ', datetime.now())    


#modality sets, one stacked model is fitted per set
set2 = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm']
set3 = ['cort', 'subc', 'surf', 'rest', 'VolBrain']

set4 = ['gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC']
set5 = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm', 'gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC']
set6 = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm', 'cort', 'subc', 'surf', 'rest', 'VolBrain']
set7 = ['gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC', 'cort', 'subc', 'surf', 'rest', 'VolBrain']
set8 = ['gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC', 'rest']

set1 = list(df_y_pred2.columns) #all existing modalities

#results per set, keyed 'set1' ... 'set8'
dict_st_perf1={}
dict_st_models={}
dict_st_ypred1={}
dct_std_mod_for_stack = {}         #scaler of the stacked features
dct_std_tab_for_stack = {}         #stacked features after standardization
dct_std_tab_before_for_stack = {}  #stacked features before standardization

for s, set_n in enumerate([set1, set2, set3, set4, set5, set6, set7, set8], start=1):
    set_name = 'set'+str(s)
    print('set '+str(s), datetime.now())

    #stacked features = single-model predictions of the modalities in this set
    st_features = df_y_pred2.loc[:,set_n]
    dct_std_tab_before_for_stack[set_name] = st_features

    stack_std_model = StandardScaler().fit(st_features.values)
    dct_std_mod_for_stack[set_name] = stack_std_model

    std_st_features = pd.DataFrame(stack_std_model.transform(st_features.values),
                                   index=st_features.index, columns=st_features.columns)
    dct_std_tab_for_stack[set_name] = std_st_features

    bpar1, bpar2, acc, mse, corr, model, y_pred3, mae = elnet(std_st_features, y_res2) #ML

    dict_st_perf1[set_name] = acc, mse, mae, corr, bpar1, bpar2
    dict_st_models[set_name] = model
    dict_st_ypred1[set_name] = y_pred3

df_st_perf1 = pd.DataFrame(dict_st_perf1, index=['best score r2', 'mse', 'mae','corr', 'best alpha', 'best l1_ratio'])
df_st_ypred1 = pd.DataFrame(dict_st_ypred1, index=y_res2.index)        

###Save outputs of this step

#models
for key in dict_st_models.keys():
    joblib.dump(dict_st_models[key], (path_out+'/'+str(key)+'_stacked_model.sav'))
for key in dct_std_mod_for_stack.keys():
    joblib.dump(dct_std_mod_for_stack[key], (path_out+'/'+str(key)+'_stacked_STD_model.sav'))

#performance and prediction
df_st_perf1.to_csv(path_out+'/2level_test1_perf_stacked.csv')
df_st_ypred1.to_csv(path_out+'/2level_test1_y_pred_stacked.csv')

#stacked feature tables: the set name is part of the file name, otherwise every
#set would be written to the same file and only the last set would remain
for key in dct_std_tab_for_stack.keys():
    dct_std_tab_for_stack[key].to_csv(path_out+'/'+str(key)+'_2level_stack_y_feature_tab_STD.csv')
    dct_std_tab_before_for_stack[key].to_csv(path_out+'/'+str(key)+'_2level_stack_y_feature_tab_beforeSTD.csv')





### 3rd level ################################################################################
print(' ')
print('start 3rd level , retest1', datetime.now())


#### L3 Re-applying the fitted preprocessing to retest1 ######################################

print('Checking single ML on retest1 data ', datetime.now())

#adjust y (target) with the stored target models, restricted to index_retest_test
y_res3 = re_control_features(y_retest_test1, control_retest_test1, index_retest_test,
                             std_targ_y, std_targ_X, linreg_targ)

#adjust the modalities with the stored per-modality models
task_keys = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm']
features_res3 = {}
for key in features_train.keys():
    print('controlling ', key, datetime.now())
    if key in task_keys:
        #task tables also have their movement column regressed out
        confounds = pd.concat([control_retest_test1, movements_retest_test1[key]], axis=1)
    else:
        confounds = control_retest_test1
    features_res3[key] = re_control_features(features_retest_test1[key], confounds, y_res3.index,
                                             std_feat_y_dct[key], std_feat_X_dct[key], linreg_feat_dct[key])

###standardize with the scalers fitted at the 1st level
#features
for key in features_res3.keys():
    print('standartize ', key, datetime.now())
    adjusted = features_res3[key]
    features_res3[key] = pd.DataFrame(std_models_features[key].transform(adjusted.values),
                                      index=adjusted.index,
                                      columns=adjusted.columns)
#target
y_res3 = pd.DataFrame(std_model_target.transform(y_res3.values.reshape(-1, 1)),
                      index=y_res3.index)

#save the target and the feature tables as they are before PCA
y_res3.to_csv(path_out+'/target_y_retest1.csv')
for key, standardized in features_res3.items():
    standardized.to_csv(path_out+'/'+str(key)+'_retest1.csv')

#modalities that are reduced with PCA (rest and task FC)
fc_keys = ['rest', 'gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC']

#project onto the stored PCA components
for key in fc_keys:
    print('reduction ', key, datetime.now())
    full_table = features_res3[key]
    features_res3[key] = pd.DataFrame(PCA_models[key].transform(full_table.values),
                                      index=full_table.index)

#standardize the component scores with the stored scalers
for key in fc_keys:
    print('standartize PCA ', key, datetime.now())
    scores = features_res3[key]
    features_res3[key] = pd.DataFrame(std_PC_feature_models[key].transform(scores.values),
                                      index=scores.index,
                                      columns=scores.columns)
    #save the standardized component table
    features_res3[key].to_csv(path_out+'/'+key+'_PCA75_retest1.csv')





#apply the fitted single-modality ElasticNet models to the retest1 tables

dict_y_pred3 = {}
dict_y_pred3_per = {}
for key, modality_table in list(features_res3.items()):
    y_pred, y_real, ind_y, bacc, mse, corr, mae = reaply_ElNet(modality_table, y_res3, dict_elnet_model[key])
    dict_y_pred3[key] = y_pred
    dict_y_pred3_per[key] = (bacc, mse, mae, corr)

#ind_y comes from the last call; every call receives the same y_res3
df_y_pred3 = pd.DataFrame(dict_y_pred3, index=ind_y)
df_y_pred3_per = pd.DataFrame(dict_y_pred3_per, index=['best score r2', 'mse', 'mae','corr'])


###Save outputs of this step

#performance of every modality
df_y_pred3_per.to_csv(path_out+'/3level_retest1_perf_elnet.csv')

#predictions, one column per modality
df_y_pred3.to_csv(path_out+'/3level_retest1_y_pred_singleML.csv')


#### L3 Testing the stacked ML models on retest1 #############################################

print('Calculating stacked ML on retest1 data ', datetime.now()) 

#apply the fitted stacked ElasticNet models to the retest1 single-model predictions

#results per set, keyed 'set1' ... 'set8'
dict_st_perf2={}
dict_st_ypred2={}

dct_std3_tab_for_stack = {}         #stacked features after standardization
dct_std3_tab_before_for_stack = {}  #stacked features before standardization

for s, set_n in enumerate([set1, set2, set3, set4, set5, set6, set7, set8], start=1):
    set_name = 'set'+str(s)

    #stacked features = single-model predictions of the modalities in this set
    ftrs = df_y_pred3.loc[:, set_n]
    dct_std3_tab_before_for_stack[set_name] = ftrs

    #scale with the scaler that was fitted for this set on the training data
    std_ftrs = pd.DataFrame(dct_std_mod_for_stack[set_name].transform(ftrs.values),
                            index=ftrs.index,columns=ftrs.columns)
    dct_std3_tab_for_stack[set_name] = std_ftrs

    y_pred, y_real, ind_y, bacc, mse, corr, mae = reaply_ElNet(std_ftrs, y_res3, dict_st_models[set_name]) #ML
    dict_st_ypred2[set_name] = y_pred
    dict_st_perf2[set_name] = bacc, mse, mae, corr

df_st_ypred2 = pd.DataFrame(dict_st_ypred2, index=ind_y)
df_st_perf2 = pd.DataFrame(dict_st_perf2, index=['best score r2', 'mse', 'mae','corr'])        

###Save outputs of this step

#performance and prediction
df_st_perf2.to_csv(path_out+'/3level_retest1_perf_stacked.csv')
df_st_ypred2.to_csv(path_out+'/3level_retest1_y_pred_stacked.csv') 

#stacked feature tables: the set name is part of the file name, otherwise every
#set would be written to the same file and only the last set would remain
for key in dct_std3_tab_for_stack.keys():
    dct_std3_tab_for_stack[key].to_csv(path_out+'/'+str(key)+'_3level_retest1_stack_y_feature_tab_STD.csv')
    dct_std3_tab_before_for_stack[key].to_csv(path_out+'/'+str(key)+'_3level_retest1_stack_y_feature_tab_beforeSTD.csv')











### 3rd level ################################################################################
print(' ')
print('start 3rd level , retest2', datetime.now())


#### L3 Testing single ML models on retest2 #############################################

print('Checking single ML on retest2 data ', datetime.now())

# Control the retest2 target with the control table, restricted to
# index_retest_test, reusing std_targ_y / std_targ_X / linreg_targ.
y_res3 = re_control_features(
    y_retest_test2, control_retest_test2, index_retest_test,
    std_targ_y, std_targ_X, linreg_targ,
)

# Control every modality. For the keys listed below the matching movement
# table is appended column-wise to the control table first.
keys_with_movements = ('emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm')
features_res3 = {}
for key in features_train:
    print('controlling ', key, datetime.now())
    if key not in keys_with_movements:
        control_table = control_retest_test2
    else:
        control_table = pd.concat(
            [control_retest_test2, movements_retest_test2[key]], axis=1
        )
    features_res3[key] = re_control_features(
        features_retest_test2[key], control_table, y_res3.index,
        std_feat_y_dct[key], std_feat_X_dct[key], linreg_feat_dct[key],
    )

### Standardize with the stored scalers (transform only)
# features
for key, table in features_res3.items():
    print('standartize ', key, datetime.now())
    scaled_values = std_models_features[key].transform(table.values)
    features_res3[key] = pd.DataFrame(scaled_values,
                                      index=table.index,
                                      columns=table.columns)
# target: reshaped to a single column before being passed to the scaler
scaled_target = std_model_target.transform(y_res3.values.reshape(-1, 1))
y_res3 = pd.DataFrame(scaled_target, index=y_res3.index)

# Save the target and the feature tables as they are before PCA
y_res3.to_csv(path_out+'/target_y_retest2.csv')
for key, table in features_res3.items():
    table.to_csv(path_out+'/'+str(key)+'_retest2.csv')


# Modalities that go through the stored PCA models (rest and task FC).
# Not included: 'stroop_FC', 'faces_FC', 'facename_FC', 'mid_FC'
pca_keys = ['rest', 'gam_FC', 'lan_FC', 'mot_FC', 'rel_FC', 'soc_FC', 'wm_FC']

# Project each table with its stored PCA model; the subject index is kept and
# the resulting columns get pandas' default labels.
for key in pca_keys:
    print('reduction ', key, datetime.now())
    components = PCA_models[key].transform(features_res3[key].values)
    features_res3[key] = pd.DataFrame(components, index=features_res3[key].index)


# Standardize the component tables with the stored scalers and save them
for key in pca_keys:
    print('standartize PCA ', key, datetime.now())
    pc_table = features_res3[key]
    scaled_pcs = std_PC_feature_models[key].transform(pc_table.values)
    features_res3[key] = pd.DataFrame(scaled_pcs,
                                      index=pc_table.index,
                                      columns=pc_table.columns)
    # save std pc table
    features_res3[key].to_csv(path_out+'/'+key+'_PCA75_retest2.csv')





# Run every per-modality ElasticNet (dict_elnet_model) on the retest2
# feature tables and collect predictions plus the performance values
# returned by reaply_ElNet.

dict_y_pred3 = {}
dict_y_pred3_per = {}
for key, feature_table in features_res3.items():
    y_pred, y_real, ind_y, bacc, mse, corr, mae = reaply_ElNet(
        feature_table, y_res3, dict_elnet_model[key]
    )  # ML
    dict_y_pred3[key] = y_pred
    # stored in the same order as the row labels used for the table below
    dict_y_pred3_per[key] = (bacc, mse, mae, corr)

# ind_y is the index returned by the last reaply_ElNet call of the loop
perf_row_labels = ['best score r2', 'mse', 'mae','corr']
df_y_pred3 = pd.DataFrame(dict_y_pred3, index=ind_y)
df_y_pred3_per = pd.DataFrame(dict_y_pred3_per, index=perf_row_labels)


### Save outputs from this step

# performance of every single-modality model
df_y_pred3_per.to_csv(path_out+'/3level_retest2_perf_elnet.csv')

# predictions of every single-modality model
df_y_pred3.to_csv(path_out+'/3level_retest2_y_pred_singleML.csv')


#### L3 Testing stacked ML models on retest2 #############################################

print('Calculating stacked ML on retest2 data ', datetime.now()) 

# Apply the stacked ElasticNet models (dict_st_models) to the retest2
# single-model predictions held in df_y_pred3, one model per predefined set.

dict_st_perf2 = {}
dict_st_ypred2 = {}

dct_std3_tab_for_stack = {}         # per-set input table after the stored scaler
dct_std3_tab_before_for_stack = {}  # per-set input table before scaling

# Sets are numbered from 1 to match the 'setN' keys of the stored models.
for s, set_n in enumerate([set1, set2, set3, set4, set5, set6, set7, set8], start=1):
    set_name = 'set' + str(s)

    # columns of the single-model predictions that belong to this set
    ftrs = df_y_pred3.loc[:, set_n]
    dct_std3_tab_before_for_stack[set_name] = ftrs

    # transform only: the scaler comes from dct_std_mod_for_stack, nothing is refitted here
    std_ftrs = pd.DataFrame(dct_std_mod_for_stack[set_name].transform(ftrs.values),
                            index=ftrs.index, columns=ftrs.columns)
    dct_std3_tab_for_stack[set_name] = std_ftrs

    y_pred, y_real, ind_y, bacc, mse, corr, mae = reaply_ElNet(std_ftrs, y_res3, dict_st_models[set_name]) #ML
    dict_st_ypred2[set_name] = y_pred
    dict_st_perf2[set_name] = bacc, mse, mae, corr

# ind_y is the index returned by the last reaply_ElNet call of the loop
df_st_ypred2 = pd.DataFrame(dict_st_ypred2, index=ind_y)
df_st_perf2 = pd.DataFrame(dict_st_perf2, index=['best score r2', 'mse', 'mae','corr'])

### Save outputs from this step (all model performance and predictions)

#performance and prediction
df_st_perf2.to_csv(path_out+'/3level_retest2_perf_stacked.csv')
df_st_ypred2.to_csv(path_out+'/3level_retest2_y_pred_stacked.csv')

# One file per set. The set name is part of the file name because a single
# fixed name made every iteration overwrite the previous one, leaving only
# the last set's table on disk.
for key in dct_std3_tab_for_stack:
    dct_std3_tab_for_stack[key].to_csv(
        path_out+'/3level_retest2_stack_y_feature_tab_STD_'+key+'.csv')
    dct_std3_tab_before_for_stack[key].to_csv(
        path_out+'/3level_retest2_stack_y_feature_tab_beforeSTD_'+key+'.csv')




# Closing progress message: spacer line, status text, then the current time
for closing_line in (' ', 'finished to calculate'):
    print(closing_line)
print(datetime.now())


