# Stacked Machine Learning

In [1]:
## IMPORTANT!

# Set the number of CPU workers before launching anything else.
# Pick a value that fits the number of cores of the machine you run on.
# NOTE(review): presumably consumed by later cells as the `n_jobs` argument
# of parallel estimators -- confirm against the rest of the notebook.
n_jobs = 50

### Load libraries

In [2]:
#libraries
import pandas as pd
import numpy as np
import os
import sys
import shutil
import glob
import joblib
import warnings
from datetime import date, datetime

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr
import scipy.stats as st

from nilearn import image as nli
from nilearn import plotting

from mne.viz import plot_connectivity_circle

### Load functions

In [3]:
def control_2(z, control, index):
    """Regress the control covariates out of one series and return the residuals.

    The series is restricted to ``index``, rows with a missing value are
    dropped, the remaining values are mean-centred and an ordinary
    least-squares model is fitted on the mean-imputed, standardised
    covariates. The in-sample residuals are returned.

    Parameters
    ----------
    z : pandas.Series
        Values to be residualised (a target or a single feature column).
    control : pandas.DataFrame
        Covariate table; its columns are the confounds that are regressed out.
    index : array-like
        Row labels to use; ``z`` and ``control`` are both reindexed to it.

    Returns
    -------
    y_res : numpy.ndarray
        1-D array of residuals (centred value minus fitted value).
    ind_y : numpy.ndarray
        Row labels of the rows that were kept (non-missing ``z``), in the
        same order as ``y_res``.
    """
    # Shrink the data to the requested rows and drop rows without a value.
    y = z.reindex(index=index).dropna()
    X = control.reindex(index=index).loc[y.index, :]
    ind_y = np.array(y.index)

    # Centre the values. The mean is computed once (vectorised) instead of
    # once per element, which made the original centring quadratic.
    y = (y - y.mean()).to_numpy()

    # Fill missing covariates with the column mean, then standardise them.
    X = SimpleImputer(strategy='mean').fit_transform(X.values)
    X = StandardScaler().fit_transform(X)

    # Fit on the same rows that are predicted: residuals are in-sample.
    y_pred = LinearRegression().fit(X, y).predict(X)
    y_res = y - y_pred

    return y_res, ind_y

In [4]:
def control_mov_feature(z, control, mov, index):
    """Regress the control covariates plus a movement measure out of every feature column.

    Parameters
    ----------
    z : pandas.DataFrame
        Feature table; every column is residualised independently.
    control : pandas.DataFrame
        Covariate table (confounds). It is not modified.
    mov : pandas.Series
        Movement values for the modality of ``z``; added to the covariates
        as an extra column named ``'mov'``.
    index : array-like
        Row labels to use; all three inputs are reindexed to it.

    Returns
    -------
    pandas.DataFrame
        Residuals with the same columns as ``z`` and ``index`` as row labels.

    Notes
    -----
    Missing values in the covariates are mean-imputed. Missing values in
    ``z`` are NOT dropped here (unlike ``control_2``).
    NOTE(review): a NaN inside a feature column is passed to the regression
    unchanged -- confirm the task tables are complete.
    """
    # Shrink all inputs to the requested rows.
    z = z.reindex(index=index)
    # `assign` builds a new frame, so neither the caller's table nor the
    # reindexed copy is modified in place.
    cont = control.reindex(index=index).assign(mov=mov.reindex(index=index))

    # The design matrix is identical for every feature column, so it is
    # imputed and standardised once instead of once per column.
    X = SimpleImputer(strategy='mean').fit_transform(cont.values)
    X = StandardScaler().fit_transform(X)

    residuals = {}
    for col in z.columns:
        # Centre the column; the mean is computed once, not once per element.
        y = z[col]
        y = (y - y.mean()).to_numpy()

        # In-sample residuals of the least-squares fit.
        y_pred = LinearRegression().fit(X, y).predict(X)
        residuals[col] = y - y_pred

    return pd.DataFrame(residuals, index=z.index)

### Path to the tables folder

In [7]:
# Folder that holds the input tables. The default is the original location;
# set the ML_TABLES_PATH environment variable to run on another machine
# without editing the notebook.
# os.path.join(..., '') guarantees a trailing separator, which the cells below
# rely on because they build file names as `path + 'name.csv'`.
path = os.path.join(
    os.environ.get('ML_TABLES_PATH', '/media/DataD800/Alina/main_set/MLtables/'),
    '',
)

### Load tables

In [8]:
# Silence all warnings for the rest of the session. One 'ignore' filter is
# enough: filterwarnings('ignore') and simplefilter('ignore') install the
# same catch-all rule.
warnings.filterwarnings('ignore')

#demography
demo = pd.read_csv(path+'demographics_table_new.csv', index_col=0)

#targets table
targ = pd.read_csv(path+'cognition_table.csv', index_col=0)

#features tables as dictionary: one '<name>_table.csv' per modality,
#loaded in this order (the order is the processing order of later loops)
modalities = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm',
              'cort', 'subc', 'surf', 'rest', 'VolBrain']
features = {
    name: pd.read_csv(path+name+'_table.csv', index_col=0)
    for name in modalities
}

#table with movements (mean relative displacement Movement_RelativeRMS_mean.txt)
movements = pd.read_csv(path+'movement_table.csv', index_col=0)

#create the table of controlling parameters: gender (label-encoded) and age.
#Race and ethnicity are not included in this version.
#LabelEncoder expects a 1-D input, so the Gender column is passed as a
#Series rather than as a one-column DataFrame.
sex_coded = pd.Series(LabelEncoder().fit_transform(demo['Gender']), index=demo.index, name='Gender')

control = pd.concat([sex_coded, demo.loc[:, ['Age_in_Yrs']]], axis=1) #, race_coded, ethnic_coded

In [9]:
# Show the control table built above (label-encoded Gender and Age_in_Yrs).
control

Unnamed: 0_level_0,Gender,Age_in_Yrs
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1
100206,1,27
100307,0,27
100408,1,33
100610,1,27
101006,0,35
...,...,...
991267,1,30
992673,0,33
992774,1,35
993675,0,29


In [10]:
# Restrict every table to the subjects that have a CogTotalComp_Unadj score
yy = targ['CogTotalComp_Unadj'].dropna()
subjects = yy.index

demo = demo.reindex(index=subjects)
movements = movements.reindex(index=subjects)
control = control.reindex(index=subjects)
features = {key: table.reindex(index=subjects) for key, table in features.items()}

In [11]:
# Show the control table again, now restricted to the subjects kept in yy.
control

Unnamed: 0,Gender,Age_in_Yrs
100206,1,27
100307,0,27
100408,1,33
100610,1,27
101006,0,35
...,...,...
991267,1,30
992673,0,33
992774,1,35
993675,0,29


##### Grouped 8-fold CV (leave-groups-out style: subjects of one family are never split across folds)

##### Creates only the train parts. The test tables can be taken from the main-set script.

In [12]:
warnings.filterwarnings('ignore')

#for col in targ.columns:
col = 'CogTotalComp_Unadj'  #the script is adapted to run over a table of target variables: to do so, uncomment the for loop above and comment out this line
y = yy#targ[col]

print(y.name)

###make folder for outputs
nmf=path+'output_'+'CogTotalComp_Unadj_norace_pca75_Flat'
# makedirs(exist_ok=True) lets the cell be re-run; os.mkdir raised
# FileExistsError on the second run. Existing CSV files are overwritten.
os.makedirs(nmf, exist_ok=True)

# Task modalities: residualised for the control covariates plus the
# task-specific movement. ('0' is kept from the original list; it matches
# none of the keys loaded into `features`.)
task_modalities = ['emo', 'gam', 'lan', 'mot', 'rel', 'soc', 'wm', '0']
# Remaining modalities: residualised for the control covariates only.
other_modalities = ['cort', 'surf', 'subc', 'VolBrain', 'rest']


def _standardized(table):
    """Return ``table`` z-scored column-wise, keeping its index and column labels."""
    return pd.DataFrame(StandardScaler().fit_transform(table), index=table.index, columns=table.columns)


def _residualize_columns(table, control, index):
    """Apply ``control_2`` to every column of ``table`` and collect the residuals.

    ``control_2`` drops the rows in which a column is missing, so every column
    must keep the same rows for the residuals to share one index. A mismatch
    raises ValueError instead of labelling all columns with the index of
    the last column.
    """
    residuals = {}
    kept_index = None
    # `feat_col` (not `col`) so the target name set at the top of the cell
    # is not overwritten.
    for feat_col in table.columns:
        res, idx = control_2(table[feat_col], control, index)
        if kept_index is None:
            kept_index = idx
        elif not np.array_equal(kept_index, idx):
            raise ValueError(
                'column %r keeps different rows than the first column '
                '(missing values in the feature table?)' % (feat_col,)
            )
        residuals[feat_col] = res
    return pd.DataFrame(residuals, index=kept_index)


group_kfold = GroupKFold(n_splits=8)
for i, (train_index, test_index) in enumerate(group_kfold.split(demo, groups=demo['Family_ID'])):

    print(' ')
    print('started to calculate the Fold #', i)
    print(datetime.now())
    print(' ')

    ###create directory for specific Fold
    path_out = str(nmf+'/Fold_'+str(i))
    os.makedirs(path_out, exist_ok=True)

    ###Global indices (positions from the splitter -> subject labels)
    train_index = np.array(demo.iloc[train_index].index) #for training all models
    test_index = np.array(demo.iloc[test_index].index) #for final test

    ###Split global train index to local indices
    # NOTE: index_train / index_test are not used further down in this cell;
    # every table written below is residualised on the full train_index.
    index_train, index_test = train_test_split(train_index, test_size=0.4, random_state=42)

    ###Local indices
    index_train = np.array(sorted(index_train))
    index_test = np.array(sorted(index_test))


    ### 1st level ################################################################################

    print('start 1st level ', datetime.now())

    #control y (target) for the control covariates on the whole training fold
    y_res_values, y_res_index = control_2(y, control, train_index)
    y_res1 = pd.Series(y_res_values, index = y_res_index)

    #control modalities on the subjects kept for the target
    features_res1 = {}
    for key in features.keys():
        print('controlling ', key, datetime.now())

        if key in task_modalities:
            features_res1[key] = control_mov_feature(features[key], control, movements[key], y_res1.index)
        elif key in other_modalities:
            features_res1[key] = _residualize_columns(features[key], control, y_res1.index)
        else:
            # Previously such a key was skipped without any message.
            print('skipping ', key, ' (no control rule defined for this modality)')


    #save tables
    y_res1.to_csv(path_out+'/target_y_trainFlat.csv', header=False)

    for key, table in features_res1.items():
        table.to_csv(path_out+'/'+str(key)+'_trainFlat.csv')
        _standardized(table).to_csv(path_out+'/'+str(key)+'_trainFlat_std.csv')


    #keep rest residuals as a separate var
    res_rest1 = features_res1['rest']
    res_rest1_st = _standardized(res_rest1)

    #apply PCA to resting state (fit and transform kept as two calls, as in the original)
    pca = PCA(n_components=75, random_state=11)
    pca.fit(res_rest1_st.values)
    rest_pca1 = pd.DataFrame(pca.transform(res_rest1_st.values), index=res_rest1.index)


    #save rest pca table
    rest_pca1.to_csv(path_out+'/rest-pca75_trainFlat.csv')
    _standardized(rest_pca1).to_csv(path_out+'/rest-pca75_trainFlat_std.csv')

print(' ')
print('finished the MODEL ')
print(datetime.now())

CogTotalComp_Unadj
 
started to calculate the Fold # 0
2022-02-01 16:55:50.805028
 
start 1st level  2022-02-01 16:55:50.807412
controlling  emo 2022-02-01 16:55:50.875224
controlling  gam 2022-02-01 16:55:56.319437
controlling  lan 2022-02-01 16:56:01.731566
controlling  mot 2022-02-01 16:56:07.170788
controlling  rel 2022-02-01 16:56:12.564454
controlling  soc 2022-02-01 16:56:18.054910
controlling  wm 2022-02-01 16:56:23.527105
controlling  cort 2022-02-01 16:56:29.027800
controlling  subc 2022-02-01 16:56:31.244359
controlling  surf 2022-02-01 16:56:31.535029
controlling  rest 2022-02-01 16:56:33.738863
controlling  VolBrain 2022-02-01 17:18:17.981148
 
started to calculate the Fold # 1
2022-02-01 17:23:15.361533
 
start 1st level  2022-02-01 17:23:15.362608
controlling  emo 2022-02-01 17:23:15.387249
controlling  gam 2022-02-01 17:23:23.389477
controlling  lan 2022-02-01 17:23:31.056952
controlling  mot 2022-02-01 17:23:38.750844
controlling  rel 2022-02-01 17:23:46.592869
control