In [None]:
pip install import-ipynb

In [None]:
import pandas as pd
import import_ipynb
import numpy as np
import random
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors

from scipy import stats, interpolate
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

import os
import tensorflow as tf
from tensorflow.keras import Model, models, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D,Dense,MaxPooling1D,Flatten,Dropout,concatenate, Concatenate,GRU, concatenate, Input, LSTM, Bidirectional, LeakyReLU, BatchNormalization, ReLU
from tensorflow.keras.utils import plot_model
from tensorflow.python.client import device_lib
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import *
from tensorflow.keras.losses import MeanAbsoluteError, MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.regularizers import l2, l1_l2
import tensorflow.keras as keras
##########################
from models import get_model
from kerashypetune import KerasGridSearch
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Keras loss objects, instantiated once and reused as callables when scoring
# predictions in the cross-validation loops below.
mae, mse = MeanAbsoluteError(), MeanSquaredError()

# Data import

In [3]:
# Merged data (meal + CGM)
PPGR_meal_merged = pd.read_csv('../../data/PPGR_meal_merged_cont2.csv', index_col=0)

# 16s microbiome data (taxonomic mapped read count profile)
OTU_data = pd.read_csv('../../data/T2D_KBSMC_otu_norm.csv', index_col=0)

# Medication usage
med_data = pd.read_excel('../../data/CGM Nutrition_CRF_Medication.xlsx')

# Clinicodemographic data
clinical_data = pd.read_excel('../../data/CGM Nutrition_CRF_20230808_추가Lab.xlsx')

In [7]:
# Calculating PPGR #
# Spacing between consecutive p_XXX columns (the column names advance in steps of 5;
# presumably minutes after the meal — TODO confirm against the data dictionary).
CGM_INTERVAL_MIN = 5

CGM_post60_col =  [f"p_{t:03d}" for t in range(0, 61, CGM_INTERVAL_MIN)]
CGM_post120_col = [f"p_{t:03d}" for t in range(0, 121, CGM_INTERVAL_MIN)]
CGM_post240_col = [f"p_{t:03d}" for t in range(0, 241, CGM_INTERVAL_MIN)]


def _incremental_auc(frame, cgm_cols, step=CGM_INTERVAL_MIN):
    """Trapezoidal area of the CGM trace above its first reading, per row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding one CGM trace per row.
    cgm_cols : list of str
        Consecutive CGM columns; the first one is used as the baseline.
    step : number
        Width of one interval between consecutive columns.

    Returns
    -------
    pandas.Series
        Sum over all consecutive column pairs of
        ((G_i - G_0) + (G_{i-1} - G_0)) / 2 * step, indexed like ``frame``.
        Rows with a missing value in any of the columns come out as NaN.
    """
    baseline = frame[cgm_cols[0]]
    # Start from an explicit Series: the previous list accumulator only worked
    # because `list += Series` falls back to the Series' reflected addition.
    area = pd.Series(0.0, index=frame.index)
    # Pair every column with its predecessor instead of hard-coding the number
    # of intervals (24 for 2 h, 48 for 4 h).
    for previous_col, current_col in zip(cgm_cols[:-1], cgm_cols[1:]):
        segment = ((frame[current_col] - baseline) + (frame[previous_col] - baseline)) / 2
        area = area + segment * step
    return area


# Fill the NaN with interpolated values (row-wise, along the time axis)
PPGR_meal_merged[CGM_post240_col] = PPGR_meal_merged.loc[:, CGM_post240_col].interpolate(axis=1)

G0 = PPGR_meal_merged['p_000']
# 2h PPGR #
iAUC_2h = _incremental_auc(PPGR_meal_merged, CGM_post120_col)

# 4h PPGR #
iAUC_4h = _incremental_auc(PPGR_meal_merged, CGM_post240_col)

# Divide by 60 to rescale the area (presumably from per-minute to per-hour units).
PPGR_meal_merged['PPGR_u2'] = iAUC_2h/60
PPGR_meal_merged['PPGR_u4'] = iAUC_4h/60

In [16]:
# Keep meals that have a 'meal_m1' value and are below 2000 kcal / 250 g carbohydrate.
# The mask is built from the already-filtered frame so its index matches the frame it
# selects from, and .copy() gives an independent frame for the derived columns below.
PPGR_meal_merged_filt = PPGR_meal_merged.dropna(subset=['meal_m1'], axis=0)
within_limits = (PPGR_meal_merged_filt['Energy(kcal)'] < 2000) & (PPGR_meal_merged_filt['Carb(g)'] < 250)
PPGR_meal_merged_filt = PPGR_meal_merged_filt[within_limits].copy()

# Derived meal features: square-root carbohydrate and the share (%) of the
# macronutrient energy (4 kcal/g carbohydrate and protein, 9 kcal/g fat).
carb_kcal = PPGR_meal_merged_filt['Carb(g)']*4
protein_kcal = PPGR_meal_merged_filt['Protein(g)']*4
fat_kcal = PPGR_meal_merged_filt['Fat(g)']*9
macro_kcal = carb_kcal + protein_kcal + fat_kcal
PPGR_meal_merged_filt['Carb_root'] = PPGR_meal_merged_filt['Carb(g)']**(1/2)
PPGR_meal_merged_filt['Carb_pro'] = carb_kcal / macro_kcal * 100
PPGR_meal_merged_filt['Protein_pro'] = protein_kcal / macro_kcal * 100
PPGR_meal_merged_filt['Fat_pro'] = fat_kcal / macro_kcal * 100

# Keep OTUs that are non-zero in more than 30% of the rows of OTU_data, then apply a
# Box-Cox transform with a fixed lambda of 0.25.
otu_is_prevalent = (OTU_data > 0).sum(axis=0) > len(OTU_data)*0.3
OTU_data = OTU_data.loc[:, otu_is_prevalent]
microbiome_col = list(OTU_data.columns)
OTU_data_bc = pd.DataFrame(stats.boxcox(OTU_data,lmbda=0.25),columns=OTU_data.columns,index=OTU_data.index)

# Features
CGM_pre30_col = [f"m_{t:03d}" for t in range(30,0,-5)]+['p_000']
futureCGM_120 = [f"p_{t:03d}" for t in range(5,121,5)]
meal_composition = ['Carb_root','Protein(g)','Fat(g)','Energy(kcal)','Fiber(g)','Carb_pro','Protein_pro','Fat_pro']
meal_context = ['meal_m1','prot_b6h','Time']
cli_col = ['Age','DM_Duration','BMI','HbA1c','SBP','DBP','HDL','LDL','AST(IU/L)','ALT(IU/L)'] 
med_col = ['Basal_Ins_Dose_Unit','MFM_Dose_Total','SU_Dose_Total','DPP4i_Dose_Total','SGLT2i_Dose_Total'] 

# Final dataframe
PPGR_data =  PPGR_meal_merged_filt.dropna(subset=['PPGR_u2','PPGR_u4']+CGM_pre30_col,axis=0)
# The two target columns are named explicitly; the previous bare names PPGR / PPGR_4
# are not defined at notebook level and raise NameError on a fresh kernel.
meal_level_cols = (['ID','Dname','PPGR_u2','PPGR_u4','Carb(g)','Meal']
                   + CGM_pre30_col + CGM_post240_col + meal_composition + meal_context)
full_data = pd.merge(PPGR_data[meal_level_cols],OTU_data_bc[microbiome_col],left_on='ID',right_index=True)
# 'p_000' is selected twice (it closes CGM_pre30_col and opens CGM_post240_col).
# Dropping duplicate rows of the transpose removes the repeated column; note that it
# compares values, so any other column identical to an earlier one is dropped too.
full_data = full_data.T.drop_duplicates().T
full_data = pd.merge(full_data,med_data,how='left',on='ID')
full_data = pd.merge(full_data,clinical_data,how='left',on='ID')
# Restore a float dtype for the targets after the transpose round trip.
full_data['PPGR_u2'] = full_data['PPGR_u2'].astype(float)
full_data['PPGR_u4'] = full_data['PPGR_u4'].astype(float)

samples = list(full_data['ID'].unique())
full_data

Unnamed: 0,ID,Dname,PPGR_u2,PPGR_u4,Carb(g),Meal,m_030,m_025,m_020,m_015,...,TG,HDL,LDL,MDRD_eGFR,C_pep,C-peptide_index,HOMA IR (C-peptide),HR(bpm),AST(IU/L),ALT(IU/L)
0,R01,사과,114.625000,244.666667,5.75,5,131.0,135.0,135.0,131.0,...,315,32,44,86.1,1.7,4.612802,0.192700,91.5,24,39
1,R01,과일샐러드귀리흰우유,26.666667,-24.791667,60.295,1,130.0,136.0,141.0,144.0,...,315,32,44,86.1,1.7,4.612802,0.192700,91.5,24,39
2,R01,고사리나물더덕구이도토리묵무침두부된장국땅콩조림무말랭이무침배추김치숙주나물시금치나물쌀밥오...,141.750000,217.750000,136.5433,2,119.0,118.0,117.0,116.0,...,315,32,44,86.1,1.7,4.612802,0.192700,91.5,24,39
3,R01,라면사과회냉면,208.458333,414.458333,175.816,3,85.0,85.0,87.0,90.0,...,315,32,44,86.1,1.7,4.612802,0.192700,91.5,24,39
4,R01,계란:전란:삶은것국밥깍두기,168.083333,193.666667,84.41,1,104.0,101.0,99.0,98.0,...,315,32,44,86.1,1.7,4.612802,0.192700,91.5,24,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906,R50,쇠고기덮밥,60.583333,169.875000,78.797,2,190.0,186.0,181.0,175.0,...,155,45,68,107.7,1.2,5.432546,0.163623,94.0,19,21
907,R50,상추소고기 구이(등심)쌀밥,24.458333,116.250000,54.26,3,188.0,187.0,186.0,185.0,...,155,45,68,107.7,1.2,5.432546,0.163623,94.0,19,21
908,R50,감자채볶음고추멸치볶음쌀밥열무김치,-4.875000,217.458333,73.6332,1,162.0,162.0,163.0,164.0,...,155,45,68,107.7,1.2,5.432546,0.163623,94.0,19,21
909,R50,장어매운탕,-40.250000,-61.000000,38.04943,2,226.0,224.0,222.0,222.0,...,155,45,68,107.7,1.2,5.432546,0.163623,94.0,19,21


In [9]:
# External validation table; rows missing any of the pre-meal CGM readings
# (CGM_pre30_col) are discarded.
evaluation_data = (
    pd.read_csv('../..//data/validation/PPGR_meal_merged_validation.csv', index_col=0)
    .dropna(subset=CGM_pre30_col, axis=0)
)
evaluation_data

Unnamed: 0,ID,PPGR_u2,PPGR_u4,Carb(g),Meal,m_240,m_235,m_230,m_225,m_220,...,Bacteria;Fusobacteria;Fusobacteria_c;Fusobacteriales;Fusobacteriaceae;Fusobacterium;Fusobacterium_necrogenes_group,Bacteria;Actinobacteria;Coriobacteriia;Coriobacteriales;Coriobacteriaceae;Slackia;Slackia_isoflavoniconvertens,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;NHOC_g;PAC001240_s,Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Anaerotignum;PAC002392_s,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Sporobacter;EU779114_s,Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;PAC000195_g;DQ801499_s,Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Acetitomaculum;PAC001448_s,Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Coprococcus_g2;EF640143_s,Bacteria;Firmicutes;Clostridia;Clostridiales;Christensenellaceae;PAC001207_g;PAC002323_s,Bacteria;Firmicutes;Clostridia;Natranaerobiales;Dethiobacter_f;EF585983_g;EF585983_g_uc
0,C0001,-37.500000,10.458333,10.00,5.0,130.0,128.0,129.0,126.0,123.0,...,-4.0,-4.0,-4.00000,-4.0,-4.0,-4.000000,-4.0,-4.0,-4.0,-4.0
1,C0001,-38.583333,-135.500000,77.00,3.0,179.0,175.0,167.0,160.0,159.0,...,-4.0,-4.0,-4.00000,-4.0,-4.0,-4.000000,-4.0,-4.0,-4.0,-4.0
2,C0001,121.250000,177.416667,50.45,1.0,84.0,82.0,78.0,75.0,73.0,...,-4.0,-4.0,-4.00000,-4.0,-4.0,-4.000000,-4.0,-4.0,-4.0,-4.0
3,C0001,103.333333,368.458333,97.08,2.0,99.0,103.0,102.0,99.0,98.0,...,-4.0,-4.0,-4.00000,-4.0,-4.0,-4.000000,-4.0,-4.0,-4.0,-4.0
4,C0001,64.458333,195.875000,3.66,3.0,246.0,253.0,253.0,255.0,257.0,...,-4.0,-4.0,-4.00000,-4.0,-4.0,-4.000000,-4.0,-4.0,-4.0,-4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,C0051,35.291667,49.250000,83.15,1.0,122.0,125.0,128.0,124.0,124.0,...,-4.0,-4.0,-2.85875,-4.0,-4.0,-3.645982,-4.0,-4.0,-4.0,-4.0
1244,C0051,80.750000,154.000000,72.79,2.0,130.0,130.0,140.0,148.0,151.0,...,-4.0,-4.0,-2.85875,-4.0,-4.0,-3.645982,-4.0,-4.0,-4.0,-4.0
1245,C0051,-30.875000,-93.125000,14.10,3.0,201.0,198.0,197.0,194.0,191.0,...,-4.0,-4.0,-2.85875,-4.0,-4.0,-3.645982,-4.0,-4.0,-4.0,-4.0
1246,C0051,54.125000,112.833333,21.94,5.0,169.0,175.0,174.0,169.0,166.0,...,-4.0,-4.0,-2.85875,-4.0,-4.0,-3.645982,-4.0,-4.0,-4.0,-4.0


# LOOCV function

In [12]:
def feature_split(data):
    """Split a modelling table into its six feature groups, all cast to float.

    The column lists are read from the notebook-level names CGM_pre30_col,
    meal_composition, meal_context, cli_col, med_col and microbiome_col.

    Returns a 6-tuple in this order: CGM window (NumPy array), meal composition,
    meal context, clinical, medication and microbiome features (DataFrames).
    """
    # The CGM window is the only group handed back as a bare array.
    cgm_window = np.array(data[CGM_pre30_col]).astype(float)

    tabular_groups = (meal_composition, meal_context, cli_col, med_col, microbiome_col)
    tabular_features = [data[columns].astype(float) for columns in tabular_groups]

    return (cgm_window, *tabular_features)

In [19]:
def _model_inputs(frame):
    """Return the feature groups of ``frame`` in the order the network is fed.

    ``feature_split`` yields (CGM, meal composition, meal context, clinical,
    medication, microbiome); the model receives the microbiome block ahead of the
    clinical and medication blocks, so the reordering is done in one place.
    """
    x_cgm, x_meal_composition, x_meal_context, x_cli, x_med, x_microbiome = feature_split(frame)
    return [x_cgm, x_meal_composition, x_meal_context, x_microbiome, x_cli, x_med]


def LOOCV(reg,data,task):
    """Leave-one-subject-out evaluation of a Keras model with a grid search per fold.

    For every distinct value of ``data['ID']`` the rows of that subject form the test
    set. The remaining rows are split 80/20 (fixed random_state) into a fitting part
    and a validation part that drives early stopping and the grid search. The best
    model of each fold is saved under ./LOOCV_models and scored on the held-out
    subject.

    Parameters
    ----------
    reg : str
        Model key forwarded to ``get_model``.
    data : pandas.DataFrame
        Modelling table containing 'ID', the target column and all feature columns
        used by ``feature_split``.
    task : {'2h', '4h'}
        Selects the target column: 'PPGR_u2' or 'PPGR_u4'.

    Returns
    -------
    tuple of lists
        (R_list, rmse_list, mae_list, hyperparameter_bestscore,
        hyperparameter_bestparam), one entry per subject in order of first appearance
        in ``data['ID']``.

    Raises
    ------
    ValueError
        If ``task`` is neither '2h' nor '4h'.

    Notes
    -----
    The search space is read from the notebook-level ``param_grid``, which has to be
    defined before this function is called.
    """
    target_by_task = {'2h': 'PPGR_u2', '4h': 'PPGR_u4'}
    if task not in target_by_task:
        # Previously an unknown task left PPGR unbound and failed later with an
        # UnboundLocalError inside the loop.
        raise ValueError(f"task must be one of {sorted(target_by_task)}, got {task!r}")
    PPGR = target_by_task[task]

    hyperparameter_bestscore = []
    hyperparameter_bestparam = []
    R_list = []
    rmse_list = []
    mae_list = []

    # tuned_model.save() does not create missing parent directories.
    os.makedirs('./LOOCV_models', exist_ok=True)

    # Take the subjects from the table that was passed in rather than from the
    # notebook-level `samples` list, so the folds always match `data`.
    subject_ids = list(data['ID'].unique())

    # LOOCV
    for sample in subject_ids:
        train = data[data['ID']!=sample]
        test = data[data['ID']==sample]

        # Inner data split for gridsearch
        inner_train, inner_val = train_test_split(train, test_size=0.2, random_state=42)

        # Data preparation
        train_inputs = _model_inputs(inner_train)
        val_inputs = _model_inputs(inner_val)
        test_inputs = _model_inputs(test)
        train_ppgr = np.array(inner_train[PPGR])
        val_ppgr = np.array(inner_val[PPGR])
        test_ppgr = np.array(test[PPGR])

        # Load model
        model = get_model(reg)

        # Grid Search
        es = EarlyStopping(patience=5, verbose=1, min_delta=0.001, monitor='val_loss', mode='auto', restore_best_weights=True)
        kgs = KerasGridSearch(model, param_grid, monitor='val_loss', greater_is_better=False, tuner_verbose=1)
        kgs.search(train_inputs, train_ppgr,
                   validation_data=(val_inputs, val_ppgr),
                   callbacks=[es])
        hyperparameter_bestscore.append(kgs.best_score)
        hyperparameter_bestparam.append(kgs.best_params)

        tuned_model = kgs.best_model
        tuned_model.save(f'./LOOCV_models/tuned_{task}_multistep_model_sample_{sample}.h5')
        predicted = tuned_model.predict(test_inputs)

        # Score the held-out subject: Pearson r, RMSE (square root of the Keras MSE)
        # and MAE.
        r, p = stats.pearsonr(test_ppgr, predicted.flatten())
        RMSE = mse(test_ppgr, predicted.flatten()).numpy()**(1/2)
        MAE = mae(test_ppgr, predicted.flatten()).numpy()
        R_list.append(r)
        rmse_list.append(RMSE)
        mae_list.append(MAE)

        print(f"Leave {sample} out validation is done. The PCC is {np.round(r,3)}")

    return R_list,rmse_list,mae_list,hyperparameter_bestscore,hyperparameter_bestparam

In [45]:
# Set target feature (task): '2h' selects the 'PPGR_u2' target and '4h' selects
# 'PPGR_u4' inside LOOCV / LOOCV_forML.
task='4h'

# For DNN

In [25]:
# Search space for the DNN. The four list-valued entries give 2*2*2*2 = 16 trials;
# 'epochs' is a single value and therefore the same in every trial.
param_grid = dict(
    microbiome_units=[60, 80],
    n_steps=[4, 6],
    dropout_rate=[0.3, 0.5],
    learning_rate=[0.005, 0.01],
    epochs=100,
)

# LOOCV reads `param_grid` from the notebook namespace.
(R_list,
 rmse_list,
 mae_list,
 hyperparameter_bestscore,
 hyperparameter_bestparam) = LOOCV(reg='DNN', data=full_data, task=task)


16 trials detected for ('microbiome_units', 'n_steps', 'dropout_rate', 'learning_rate', 'epochs')

***** (1/16) *****
Search({'microbiome_units': 80, 'n_steps': 4, 'dropout_rate': 0.3, 'learning_rate': 0.01, 'epochs': 100})
Restoring model weights from the end of the best epoch: 19.
Epoch 24: early stopping
SCORE: 15500.39258 at epoch 19

***** (2/16) *****
Search({'microbiome_units': 80, 'n_steps': 4, 'dropout_rate': 0.3, 'learning_rate': 0.005, 'epochs': 100})
Restoring model weights from the end of the best epoch: 18.
Epoch 23: early stopping
SCORE: 15935.2627 at epoch 18

***** (3/16) *****
Search({'microbiome_units': 80, 'n_steps': 4, 'dropout_rate': 0.5, 'learning_rate': 0.01, 'epochs': 100})
Restoring model weights from the end of the best epoch: 17.
Epoch 22: early stopping
SCORE: 16141.99316 at epoch 17

***** (4/16) *****
Search({'microbiome_units': 80, 'n_steps': 4, 'dropout_rate': 0.5, 'learning_rate': 0.005, 'epochs': 100})
Restoring model weights from the end of the best

In [22]:
# Mean and standard deviation of the per-subject MAE from the most recent run.
print(np.array(mae_list).mean(), np.array(mae_list).std(), sep='\n')

54.176857
15.947008


In [26]:
# One row per left-out subject: scores plus the winning grid-search configuration.
DNN_LOOCV_summary = pd.DataFrame(dict(
    ID=samples,
    R=R_list,
    RMSE=rmse_list,
    MAE=mae_list,
    hyperparameter_best_loss=hyperparameter_bestscore,
    hyperparameter_bestparams=hyperparameter_bestparam,
))

DNN_LOOCV_summary.to_csv(f'./DNN_{task}_LOOCV_multistep_summary.csv')

In [27]:
# How often each hyperparameter combination won across the folds.
# The last grid key ('epochs') is left out of the grouping.
params_list = list(param_grid)[:-1]

df = pd.DataFrame(hyperparameter_bestparam)
counts = (
    df.groupby(params_list)
      .size()
      .sort_values(ascending=False)
)
print(counts)

microbiome_units  n_steps  dropout_rate  learning_rate
80                6        0.3           0.010            10
60                4        0.3           0.010             7
                  6        0.3           0.005             5
80                4        0.3           0.005             5
60                4        0.5           0.005             3
                  6        0.3           0.010             3
80                4        0.3           0.010             3
                           0.5           0.005             3
60                6        0.5           0.005             2
                                         0.010             2
80                4        0.5           0.010             2
                  6        0.5           0.010             2
60                4        0.5           0.010             1
80                6        0.5           0.005             1
dtype: int64


# For XGBoost

In [40]:
def LOOCV_forML(reg,data,param_grid,task):
    """Leave-one-subject-out evaluation for the scikit-learn style models.

    For every distinct value of ``data['ID']`` the rows of that subject form the test
    set and all other rows the training set.

    * ``reg == 'carbohydrate'``: the model from ``get_model`` is fitted on the single
      feature 'Carb_root' without any hyperparameter search.
    * any other ``reg``: a GridSearchCV over ``param_grid`` is run with one predefined
      validation fold (a fixed 20% of the training rows), scored by negative MSE;
      the resulting best estimator predicts the held-out subject. For 'SVM' the
      features are standardised with a scaler fitted on the training rows.

    Parameters
    ----------
    reg : str
        Model key forwarded to ``get_model``.
    data : pandas.DataFrame
        Modelling table containing 'ID', the target column and the feature columns.
    param_grid : dict or None
        Grid passed to GridSearchCV; not used for 'carbohydrate'.
    task : {'2h', '4h'}
        Selects the target column: 'PPGR_u2' or 'PPGR_u4'.

    Returns
    -------
    tuple of lists
        (R_list, rmse_list, mae_list, hyperparameter_bestscore,
        hyperparameter_bestparam). The first three have one entry per subject in
        order of first appearance in ``data['ID']``; the last two stay empty for
        'carbohydrate'.

    Raises
    ------
    ValueError
        If ``task`` is neither '2h' nor '4h'.
    """
    target_by_task = {'2h': 'PPGR_u2', '4h': 'PPGR_u4'}
    if task not in target_by_task:
        # Previously an unknown task left PPGR unbound and failed later with an
        # UnboundLocalError inside the loop.
        raise ValueError(f"task must be one of {sorted(target_by_task)}, got {task!r}")
    PPGR = target_by_task[task]

    hyperparameter_bestscore = []
    hyperparameter_bestparam = []
    R_list = []
    rmse_list = []
    mae_list = []

    # Take the subjects from the table that was passed in rather than from the
    # notebook-level `samples` list, so the folds always match `data`.
    subject_ids = list(data['ID'].unique())

    # LOOCV
    for sample in subject_ids:
        # The outer split and the targets are identical for both kinds of model.
        train = data[data['ID']!=sample]
        test = data[data['ID']==sample]
        train_ppgr = np.array(train[PPGR])
        test_ppgr = np.array(test[PPGR])

        if reg == 'carbohydrate':
            x_train = train[['Carb_root']].astype(float)
            x_test = test[['Carb_root']].astype(float)

            model = get_model(reg)
            model.fit(x_train,train_ppgr)
            predicted = model.predict(x_test)

        else:
            # Inner data split for gridsearch: rows marked -1 are always used for
            # fitting, rows marked 0 form the single validation fold.
            inner_train, inner_val = train_test_split(train, test_size=0.2, random_state=42)
            split_index = np.where(train.index.isin(inner_train.index), -1, 0)
            pds = PredefinedSplit(test_fold = split_index)

            # Data preparation
            feature_cols = meal_composition+meal_context+CGM_pre30_col+microbiome_col+cli_col+med_col
            x_train = train[feature_cols].astype(float)
            x_test = test[feature_cols].astype(float)

            # SVM require normalized features
            if reg == 'SVM':
                scaler = StandardScaler()
                x_train = scaler.fit_transform(x_train)
                x_test = scaler.transform(x_test)

            # Load model
            model = get_model(reg)

            # Grid Search
            gs = GridSearchCV(estimator = model,param_grid = param_grid, cv=pds,scoring='neg_mean_squared_error',verbose=2)
            gs.fit(x_train,train_ppgr)
            best_params, best_score = gs.best_params_,gs.best_score_

            hyperparameter_bestscore.append(best_score)
            hyperparameter_bestparam.append(best_params)
            tuned_model = gs.best_estimator_
            predicted = tuned_model.predict(x_test)

        # Score the held-out subject: Pearson r, RMSE (square root of the Keras MSE)
        # and MAE.
        r, p = stats.pearsonr(test_ppgr, predicted.flatten())
        RMSE = mse(test_ppgr, predicted.flatten()).numpy()**(1/2)
        MAE = mae(test_ppgr, predicted.flatten()).numpy()
        R_list.append(r)
        rmse_list.append(RMSE)
        mae_list.append(MAE)

        print(f"Leave {sample} out validation is done. The PCC is {np.round(r,3)}")

    return R_list,rmse_list,mae_list,hyperparameter_bestscore,hyperparameter_bestparam

In [22]:
# XGBoost search space: 2 * 3 * 3 * 1 = 18 candidates per fold.
param_grid = dict(
    max_depth=[3, 5],
    n_estimators=[500, 700, 900],
    learning_rate=[0.001, 0.005, 0.01],
    subsample=[0.5],
)

(R_list,
 rmse_list,
 mae_list,
 hyperparameter_bestscore,
 hyperparameter_bestparam) = LOOCV_forML(reg='XGB', data=full_data, param_grid=param_grid, task=task)

Fitting 1 folds for each of 18 candidates, totalling 18 fits
[CV] END learning_rate=0.001, max_depth=3, n_estimators=500, subsample=0.5; total time=   1.5s
[CV] END learning_rate=0.001, max_depth=3, n_estimators=700, subsample=0.5; total time=   2.1s
[CV] END learning_rate=0.001, max_depth=3, n_estimators=900, subsample=0.5; total time=   2.8s
[CV] END learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.5; total time=   2.2s
[CV] END learning_rate=0.001, max_depth=5, n_estimators=700, subsample=0.5; total time=   3.1s
[CV] END learning_rate=0.001, max_depth=5, n_estimators=900, subsample=0.5; total time=   4.0s
[CV] END learning_rate=0.005, max_depth=3, n_estimators=500, subsample=0.5; total time=   1.5s
[CV] END learning_rate=0.005, max_depth=3, n_estimators=700, subsample=0.5; total time=   2.1s
[CV] END learning_rate=0.005, max_depth=3, n_estimators=900, subsample=0.5; total time=   2.7s
[CV] END learning_rate=0.005, max_depth=5, n_estimators=500, subsample=0.5; total ti

In [23]:
# One row per left-out subject: scores plus the winning grid-search configuration.
# The '4h' in the variable name is fixed; the output file name follows `task`.
XGB_LOOCV_4h_summary = pd.DataFrame(dict(
    ID=samples,
    R=R_list,
    RMSE=rmse_list,
    MAE=mae_list,
    hyperparameter_best_loss=hyperparameter_bestscore,
    hyperparameter_bestparams=hyperparameter_bestparam,
))

XGB_LOOCV_4h_summary.to_csv(f'./XGB_{task}_LOOCV_summary.csv')

In [24]:
# How often each hyperparameter combination won across the folds.
# The last grid key ('subsample', a single value) is left out of the grouping.
params_list = list(param_grid)[:-1]

df = pd.DataFrame(hyperparameter_bestparam)
counts = (
    df.groupby(params_list)
      .size()
      .sort_values(ascending=False)
)
print(counts)

max_depth  n_estimators  learning_rate
3          900           0.010            15
5          700           0.010            14
           900           0.010             9
                         0.005             6
3          700           0.010             4
5          500           0.010             1
dtype: int64


# For RandomForest

In [28]:
# Random forest search space: 3 * 3 = 9 candidates per fold.
param_grid = dict(
    max_depth=[3, 5, 7],
    n_estimators=[500, 700, 900],
)

(R_list,
 rmse_list,
 mae_list,
 hyperparameter_bestscore,
 hyperparameter_bestparam) = LOOCV_forML(reg='RF', data=full_data, param_grid=param_grid, task=task)

Fitting 1 folds for each of 9 candidates, totalling 9 fits
[CV] END ......................max_depth=3, n_estimators=500; total time=   4.9s
[CV] END ......................max_depth=3, n_estimators=700; total time=   6.9s
[CV] END ......................max_depth=3, n_estimators=900; total time=   8.8s
[CV] END ......................max_depth=5, n_estimators=500; total time=   8.8s
[CV] END ......................max_depth=5, n_estimators=700; total time=  12.1s
[CV] END ......................max_depth=5, n_estimators=900; total time=  15.6s
[CV] END ......................max_depth=7, n_estimators=500; total time=  12.9s
[CV] END ......................max_depth=7, n_estimators=700; total time=  18.0s
[CV] END ......................max_depth=7, n_estimators=900; total time=  23.1s
Leave R01 out validation is done. The PCC is 0.622
Fitting 1 folds for each of 9 candidates, totalling 9 fits
[CV] END ......................max_depth=3, n_estimators=500; total time=   4.7s
[CV] END ............

In [29]:
# One row per left-out subject: scores plus the winning grid-search configuration.
# NOTE(review): the '2h' in the variable name is fixed, while the output file name
# follows the current `task`.
RF_LOOCV_2h_summary = pd.DataFrame(dict(
    ID=samples,
    R=R_list,
    RMSE=rmse_list,
    MAE=mae_list,
    hyperparameter_best_loss=hyperparameter_bestscore,
    hyperparameter_bestparams=hyperparameter_bestparam,
))

RF_LOOCV_2h_summary.to_csv(f'./RF_{task}_LOOCV_summary.csv')

In [30]:
# How often each hyperparameter combination won across the folds
# (all grid keys are used for the grouping).
params_list = list(param_grid)[:]

df = pd.DataFrame(hyperparameter_bestparam)
counts = (
    df.groupby(params_list)
      .size()
      .sort_values(ascending=False)
)
print(counts)

max_depth  n_estimators
7          500             17
           900             17
           700             15
dtype: int64


In [31]:
# SVM search space (RBF kernel only): 5 * 5 * 1 = 25 candidates per fold.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    gamma=[0.01, 0.1, 1, 10, 100],
    kernel=['rbf'],
)

(R_list,
 rmse_list,
 mae_list,
 hyperparameter_bestscore,
 hyperparameter_bestparam) = LOOCV_forML(reg='SVM', data=full_data, param_grid=param_grid, task=task)

Fitting 1 folds for each of 25 candidates, totalling 25 fits
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.01, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.01, gamma=10, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.01, gamma=100, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time=   0.0s
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .........................C=1, gamma=0.1

In [32]:
# One row per left-out subject: scores plus the winning grid-search configuration.
# NOTE(review): the '2h' in the variable name is fixed, while the output file name
# follows the current `task`.
SVM_LOOCV_2h_summary = pd.DataFrame(dict(
    ID=samples,
    R=R_list,
    RMSE=rmse_list,
    MAE=mae_list,
    hyperparameter_best_loss=hyperparameter_bestscore,
    hyperparameter_bestparams=hyperparameter_bestparam,
))

SVM_LOOCV_2h_summary.to_csv(f'./SVM_{task}_LOOCV_summary.csv')

In [33]:
# How often each hyperparameter combination won across the folds
# (all grid keys are used for the grouping).
params_list = list(param_grid)[:]

df = pd.DataFrame(hyperparameter_bestparam)
counts = (
    df.groupby(params_list)
      .size()
      .sort_values(ascending=False)
)
print(counts)

C    gamma  kernel
100  0.01   rbf       43
     0.10   rbf        6
dtype: int64


In [17]:
# Mean and standard deviation of the per-subject MAE from the most recent run.
print(np.array(mae_list).mean(), np.array(mae_list).std(), sep='\n')

63.0620543132718
21.777033251685097


# For the carbohydrate-only (single predictor) baseline

In [46]:
# Baseline that uses 'Carb_root' as its only feature; no grid is searched, so
# param_grid is None and the two hyperparameter lists come back empty.
(R_list,
 rmse_list,
 mae_list,
 hyperparameter_bestscore,
 hyperparameter_bestparam) = LOOCV_forML(reg='carbohydrate', data=full_data, param_grid=None, task=task)

Leave R01 out validation is done. The PCC is 0.601
Leave R02 out validation is done. The PCC is 0.567
Leave R03 out validation is done. The PCC is 0.389
Leave R04 out validation is done. The PCC is 0.272
Leave R05 out validation is done. The PCC is 0.48
Leave R06 out validation is done. The PCC is 0.599
Leave R07 out validation is done. The PCC is 0.267
Leave R08 out validation is done. The PCC is 0.42
Leave R09 out validation is done. The PCC is 0.258
Leave R10 out validation is done. The PCC is 0.557
Leave R11 out validation is done. The PCC is 0.2
Leave R12 out validation is done. The PCC is 0.633
Leave R13 out validation is done. The PCC is 0.129
Leave R14 out validation is done. The PCC is 0.036
Leave R15 out validation is done. The PCC is 0.502
Leave R16 out validation is done. The PCC is 0.532
Leave R17 out validation is done. The PCC is 0.305
Leave R18 out validation is done. The PCC is 0.019
Leave R19 out validation is done. The PCC is 0.214
Leave R20 out validation is done. T

In [49]:
# Mean and standard deviation of the per-subject MAE from the most recent run.
print(np.array(mae_list).mean(), np.array(mae_list).std(), sep='\n')

145.14270076527848
51.091079262089444
