In [1]:
## IMPORTANT!

# Before running the notebook, set the number of CPU workers to use for the
# calculations; pick a value that fits the number of cores of this machine.
# NOTE(review): n_jobs is not referenced by the cells visible in this section -
# confirm it is still needed.
n_jobs = 50


#libraries
import pandas as pd
import numpy as np
import os
import sys
import shutil
import glob
import joblib
import warnings
from datetime import date, datetime

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr
import scipy.stats as st

from nilearn import image as nli
from nilearn import plotting

from mne.viz import plot_connectivity_circle

In [2]:
def control_features(table_in, control, index):
    """Regress the control variable(s) out of every column of ``table_in``.

    Each column is z-scored, a linear regression of the z-scored column on the
    z-scored control variable(s) is fitted, and the residuals are kept. The
    fitted scalers and regressions are returned so that the very same
    transformation can later be applied to other subjects with
    ``re_control_features``.

    Parameters
    ----------
    table_in : pd.DataFrame or pd.Series
        Rows are subjects, columns are features. A Series is treated as a
        single column stored under the key ``'0'``.
    control : pd.Series or pd.DataFrame
        Variable(s) to control for, indexed by subject.
    index : index-like
        Subjects to use; both ``table_in`` and ``control`` are reindexed to it.

    Returns
    -------
    df_table : pd.DataFrame
        Residuals, one column per input column, indexed by ``index``.
    dct_std_y_models : dict
        Column name -> StandardScaler fitted on that column.
    std_model : StandardScaler
        Scaler fitted on the control variable(s).
    dct_lin_models : dict
        Column name -> LinearRegression fitted on (scaled control, scaled column).

    Notes
    -----
    The control scaler is fitted once, before the column loop: the control
    matrix is identical for every column, and fitting it up front also keeps
    ``std_model`` defined when ``table_in`` has no columns.
    """
    # Shrink both inputs to the requested (local train) index.
    table_in = table_in.reindex(index=index)
    control = control.reindex(index=index)

    # A 1-D input (pd.Series) is handled as a one-column table keyed '0'.
    if table_in.values.ndim == 1:
        targets = [('0', table_in)]
    else:
        targets = [(col, table_in[col]) for col in table_in.columns]

    # Control variables as a 2-D (n_subjects, n_controls) array.
    X = control.values
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # Standardise the control variables once; they do not depend on the column.
    std_model = StandardScaler()
    std_model.fit(X)
    X = std_model.transform(X)

    dct_table = {}
    dct_lin_models = {}
    dct_std_y_models = {}

    for col, series in targets:
        # Standardise the target column.
        y_raw = series.values.reshape(-1, 1)
        std_model_y = StandardScaler()
        std_model_y.fit(y_raw)
        y = std_model_y.transform(y_raw).ravel()

        # Fit on this set and keep what the control variables cannot explain.
        model = LinearRegression()
        model.fit(X, y)

        dct_table[col] = y - model.predict(X)
        dct_lin_models[col] = model
        dct_std_y_models[col] = std_model_y

    df_table = pd.DataFrame(dct_table, index=table_in.index)

    return df_table, dct_std_y_models, std_model, dct_lin_models

In [3]:
def re_control_features(table_in, control, index, dct_std_y_models, std_model, dct_lin_models):
    """Apply previously fitted control models to new subjects.

    Counterpart of ``control_features``: nothing is fitted here. Every column
    is scaled with its stored scaler, the control variable(s) are scaled with
    ``std_model``, and the prediction of the stored regression is subtracted.

    Parameters
    ----------
    table_in : pd.DataFrame or pd.Series
        Rows are subjects, columns are features. A Series is treated as a
        single column looked up under the key ``'0'``.
    control : pd.Series or pd.DataFrame
        Variable(s) to control for, indexed by subject.
    index : index-like
        Subjects to use; both ``table_in`` and ``control`` are reindexed to it.
    dct_std_y_models : dict
        Column name -> fitted scaler exposing ``transform``.
    std_model : object
        Fitted scaler for the control variable(s), exposing ``transform``.
    dct_lin_models : dict
        Column name -> fitted regression exposing ``predict``.

    Returns
    -------
    pd.DataFrame
        Residuals, one column per input column, indexed by ``index``.

    Raises
    ------
    KeyError
        If a column of ``table_in`` has no entry in the model dictionaries.
    """
    # Shrink both inputs to the requested index.
    table_in = table_in.reindex(index=index)
    control = control.reindex(index=index)

    # A 1-D input (pd.Series) is handled as a one-column table keyed '0'.
    if table_in.values.ndim == 1:
        targets = [('0', table_in)]
    else:
        targets = [(col, table_in[col]) for col in table_in.columns]

    # Control variables as a 2-D array, scaled once with the stored scaler:
    # the result is the same for every column, so it is not redone in the loop.
    X = control.values
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    X = std_model.transform(X)

    dct_table = {}
    for col, series in targets:
        # Scale the column with the scaler fitted on the training data ...
        y = dct_std_y_models[col].transform(series.values.reshape(-1, 1)).ravel()
        # ... and remove what the stored regression predicts from the controls.
        dct_table[col] = y - dct_lin_models[col].predict(X)

    return pd.DataFrame(dct_table, index=table_in.index)

##### Path 

In [4]:
# Root folder of the fold-wise tables, plus the two input sub-folders that are
# read fold by fold further down (adult IQ and childhood IQ targets).
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere.
path = '/media/data/Dunedin_Study_Data_Narun_P_Jan2022/New_MLTabs_OneTrain/main_set/'
path_adult = path+'IQ45_raw/'
path_child = path+'IQch_raw/'

# Output folders for the residualised targets. exist_ok=True keeps this cell
# re-runnable: a plain os.mkdir raises FileExistsError on the second run.
for out_dir in ('IQres_raw', 'IQres_adj'):
    os.makedirs(os.path.join(path, out_dir), exist_ok=True)

##### Load data

In [5]:
# One sub-folder per fold. Only directories are kept, so a stray file in
# path_adult (e.g. a hidden system file) cannot be mistaken for a fold and
# break the per-fold loops below.
folds = sorted(
    name for name in os.listdir(path_adult)
    if os.path.isdir(os.path.join(path_adult, name))
)
print(folds)

['Fold_0', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Fold_6']


In [6]:
# Subject information table: read once, then split into targets and demography
# (the file was previously parsed twice to obtain the same two slices).
dirs='/media/data/Dunedin_Study_Data_Narun_P_Jan2022/MLTabs/'
info = pd.read_csv(dirs+'info.csv', index_col=0)

# targets table: every column after the first one
targ = info.iloc[:,1:].copy()

# demography: the first column
demo = info.iloc[:,0].copy()

# Controlling variable: biological sex, label-encoded as integers and indexed
# by subject so it can be reindexed to each fold's subjects.
sex_coded = pd.Series(LabelEncoder().fit_transform(demo.values), index=demo.index, name='sex')

control = sex_coded

##### Do fold by fold calculation for residuals

In [18]:
# Adjust adult IQ for childhood IQ, fold by fold.
#
# For every fold the regression of adult IQ on childhood IQ is fitted on the
# training subjects only; the fitted scalers/regression are then re-applied to
# the test subjects. The residuals are z-scored with a scaler that is likewise
# fitted on the training residuals, and written to IQres_raw/<fold>/.
for fold in folds:

    # exist_ok=True keeps the cell re-runnable (os.mkdir fails on a second run)
    out_dir = os.path.join(path, 'IQres_raw', fold)
    os.makedirs(out_dir, exist_ok=True)

    # load tables

    # adult IQ
    train_ad = pd.read_csv(os.path.join(path_adult, fold, 'target_y_train1.csv'), index_col=0)
    test_ad = pd.read_csv(os.path.join(path_adult, fold, 'target_y_test.csv'), index_col=0)
    # childhood IQ
    train_ch = pd.read_csv(os.path.join(path_child, fold, 'target_y_train1.csv'), index_col=0)
    test_ch = pd.read_csv(os.path.join(path_child, fold, 'target_y_test.csv'), index_col=0)

    # The models fitted on the train table are looked up by column name when
    # they are applied to the test table, so both must share their columns.
    assert list(train_ad.columns) == list(test_ad.columns), \
        'train and test adult IQ tables must have the same columns'

    ### train set

    # control y (target, adult IQ) for childhood IQ
    y_res1, std_targ_y, std_targ_X, linreg_targ = control_features(train_ad, train_ch, train_ad.index)

    # standardise the residuals and keep the scaler for the test set
    std_model_target = StandardScaler()
    std_model_target.fit(y_res1.values.reshape(-1, 1))

    y_res1 = pd.DataFrame(std_model_target.transform(y_res1.values.reshape(-1, 1)),
                          index=y_res1.index, columns=['IQ_res'])
    # save y_res1
    y_res1.to_csv(os.path.join(out_dir, 'target_y_train1.csv'))

    ### test set

    # control y (target, adult IQ) for childhood IQ with the train-set models
    y_res2 = re_control_features(test_ad, test_ch, test_ad.index,
                                 std_targ_y, std_targ_X, linreg_targ)

    # standardise with the scaler fitted on the training residuals
    y_res2 = pd.DataFrame(std_model_target.transform(y_res2.values.reshape(-1, 1)),
                          index=y_res2.index, columns=['IQ_res'])
    # save y_res2
    y_res2.to_csv(os.path.join(out_dir, 'target_y_test.csv'))

    
    

In [19]:
# Adjust the IQ residuals for the control variable (biological sex), fold by fold.
#
# Reads the tables written to IQres_raw/<fold>/, fits the control regression on
# the training subjects, re-applies it to the test subjects, z-scores both with
# a scaler fitted on the training residuals and writes them to IQres_adj/<fold>/.
for fold in folds:

    in_dir = os.path.join(path, 'IQres_raw', fold)
    # exist_ok=True keeps the cell re-runnable (os.mkdir fails on a second run)
    out_dir = os.path.join(path, 'IQres_adj', fold)
    os.makedirs(out_dir, exist_ok=True)

    train1 = pd.read_csv(os.path.join(in_dir, 'target_y_train1.csv'), index_col=0)
    test = pd.read_csv(os.path.join(in_dir, 'target_y_test.csv'), index_col=0)

    ### train set

    # control y (target) for the control variable
    y_res1, std_targ_y, std_targ_X, linreg_targ = control_features(train1, control, train1.index)

    # standardise the residuals and keep the scaler for the test set
    std_model_target = StandardScaler()
    std_model_target.fit(y_res1.values.reshape(-1, 1))
    y_res1 = pd.DataFrame(std_model_target.transform(y_res1.values.reshape(-1, 1)),
                          index=y_res1.index)

    # save y_res1
    y_res1.to_csv(os.path.join(out_dir, 'target_y_train1.csv'))

    ### test set

    # control y (target) for the control variable with the train-set models
    y_res2 = re_control_features(test, control, test.index,
                                 std_targ_y, std_targ_X, linreg_targ)

    # standardise with the scaler fitted on the training residuals
    y_res2 = pd.DataFrame(std_model_target.transform(y_res2.values.reshape(-1, 1)),
                          index=y_res2.index)

    # save y_res2
    y_res2.to_csv(os.path.join(out_dir, 'target_y_test.csv'))

    
    