In [5]:
''' 
Notebook that takes the split data from "Split_data.ipynb" and models it with a feed-forward neural network on every descriptor set that includes DS3.
Thus DS3, DS13, DS23 and DS123 are processed in this notebook.
Output: RMSE on the testsets exported for visualization in "Figures.ipynb"

'''

import sys, os
sys.path.append('../src/insulin_pk/utils/') 
import pickle  
import torch
import optuna
import random
import numpy as np
import math
import pandas as pd
import warnings
from torch import nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
# Import own modules:
from utils import *
# Suppress optuna outputs and torch userwarnings
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)

# Set seed for torch, numpy and the stdlib random module
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


## Load data in folds and select only relevant descriptorset:
# The file is read once inside a context manager so the handle is closed
# (the previous pickle.load(open(...)) calls left four handles open).
# Each pickle.loads call below yields an independent object, which is needed
# because every copy is overwritten in place with a different descriptor set.
# NOTE: unpickling can execute arbitrary code - only load files produced by this project.
with open('../data/processed/Data_folds.pkl','rb') as f:
    folds_pickle = f.read()
DS12_folds = pickle.loads(folds_pickle)
DS1_folds = pickle.loads(folds_pickle)
DS2_folds = pickle.loads(folds_pickle)
DS3_folds = pickle.loads(folds_pickle)
del folds_pickle  # raw bytes are no longer needed


# Positions 0-2 of every fold are the train/validation/test feature frames
# (positions 3-5, the targets, are left untouched). Each is passed through
# contruct_descriptor_sets with the descriptor-set label as second argument.
for i in range(len(DS3_folds)):
    for j in range(3):
        DS12_folds[i][j] = contruct_descriptor_sets(DS12_folds[i][j],pd.Series("12"))
        DS1_folds[i][j] = contruct_descriptor_sets(DS1_folds[i][j],pd.Series("1"))
        DS2_folds[i][j] = contruct_descriptor_sets(DS2_folds[i][j],pd.Series("2"))
        DS3_folds[i][j] = contruct_descriptor_sets(DS3_folds[i][j],pd.Series("3"))


PK_names = ['CL[ml/min/kg]', 'T1/2[h]', 'MRT[h]']

# Set training/validation epochs and number of bayesian optimization rounds
EPOCH = 200
N_TRIALS = 30

# ANN on DS123

In [6]:
# Loop over test folds DS123
# For every fold: tune hyperparameters with optuna on train/validation data,
# retrain a model with the best hyperparameters, evaluate it on the test split
# and collect the per-fold test results in CV_folds_test_vivo.
X_LENGTH = 1280            # passed as X_length to both the objective and the model builder
MAX_POOL_KERNEL_SIZE = 4   # passed as max_pool_kernel_size to both the objective and the model builder

CV_folds_test_vivo = {}
for i in range(len(DS12_folds)):
    print("=================BEGINNING FOLD {0} ==============".format(i+1))

    # Positions 0-2 hold the train/val/test descriptors, 3-5 the matching targets
    X_train, Y_train = DS12_folds[i][0],DS12_folds[i][3]
    X_val,Y_val = DS12_folds[i][1],DS12_folds[i][4]
    X_test,Y_test = DS12_folds[i][2],DS12_folds[i][5]
    with open('../data/processed/Scaler_Y_{0}.pkl'.format(i),'rb') as f:
        scaler_Y = pickle.load(f)

    # Give the DS3 frames the same index as the descriptor frames
    X_train_seq = DS3_folds[i][0].astype(np.float64).set_index(X_train.index)
    X_val_seq = DS3_folds[i][1].astype(np.float64).set_index(X_val.index)
    X_test_seq = DS3_folds[i][2].astype(np.float64).set_index(X_test.index)

    dataset_train = Dataset_seq_embeddings(X_train,X_train_seq,Y_train)
    dataset_val = Dataset_seq_embeddings(X_val,X_val_seq,Y_val)
    dataset_test = Dataset_seq_embeddings(X_test,X_test_seq,Y_test)

    # Seed the sampler explicitly: optuna draws hyperparameters from its own
    # random state, so the global torch/numpy/random seeds alone do not make
    # the search reproducible.
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=seed))
    # NOTE(review): this save_path is 1-based while the one used for the final
    # model below is 0-based; save_model=False is passed here, so it is
    # presumably never written - confirm in objective_DS123.
    study.optimize(lambda trial: objective_DS123(trial,Data_train = dataset_train,Data_Val = dataset_val,
                                                 Scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = False,
                                                 save_path = "../models/VIVO_FFNN_DS123_fold{0}.pt".format(i+1),
                                                 X_length = X_LENGTH,
                                                 max_pool_kernel_size = MAX_POOL_KERNEL_SIZE),
                   n_trials = N_TRIALS)

    # Persist the best hyperparameters and keep using the in-memory dict
    # (no need to read the file back right after writing it)
    trial_ = study.best_trial
    best_params = trial_.params
    with open('../models/Optuna_DS123_fold{0}.pkl'.format(i),'wb') as f:
        pickle.dump(best_params,f)
    print("Best hyperparameters for fold {0} is saved".format(i+1))
    print(f" best parameters for this fold {trial_.params}")

    # Build model using best hyperparameters; the descriptor input width is
    # taken from this fold's own training frame
    model = model_DS123_build(best_params,input_dim_desc = X_train.shape[1],X_length = X_LENGTH,stride_CNN = 1,
                              conv_dilation1 = 1,padding1 = 0,max_pool_kernel_size = MAX_POOL_KERNEL_SIZE)

    # Train model (again) on best hyperparameters for train/val diagnostic plots:
    model_path = "../models/VIVO_FFNN_DS123_fold{0}.pt".format(i)
    Vivo_train_results = train_and_validate_1CNN(Params = best_params,Model = model,Data_train = dataset_train,Data_Val = dataset_val,
                                                 scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = True,save_path = model_path)

    # Make test set into dataloader (one batch containing the whole test split):
    test_loader = DataLoader(dataset = dataset_test,batch_size = X_test.shape[0],shuffle = False,drop_last = True)

    # Use best model to evaluate on unseen test data:
    Vivo_test_results = test_1CNN(model,best_params,test_loader,scaler_Y = scaler_Y,save_path = model_path)

    CV_folds_test_vivo[i] = Vivo_test_results

# Export the per-fold test results
with open("../data/processed/ANN_outer_5_test_DS123.pkl",'wb') as f:
    pickle.dump(CV_folds_test_vivo,f)
    

Best hyperparameters for fold 1 is saved
 best parameters for this fold {'lr': 1.1264552770934214e-06, 'Batch_Size': 26, 'wd': 0.09820714364457325, 'conv1_filters': 13, 'conv2_filters': 31, 'Kernel_size1': 24, 'dropout_FFNN': 0.1475908244557354, 'dropout_CNN': 0.006251469497655004, 'FC_after_CNN': 297, 'FC_After_DS12': 25, 'FC_Concatenation': 108}
Best hyperparameters for fold 2 is saved
 best parameters for this fold {'lr': 9.53032421691342e-06, 'Batch_Size': 38, 'wd': 0.01997255814814223, 'conv1_filters': 7, 'conv2_filters': 24, 'Kernel_size1': 6, 'dropout_FFNN': 0.49528384117119545, 'dropout_CNN': 0.531051882936986, 'FC_after_CNN': 77, 'FC_After_DS12': 10, 'FC_Concatenation': 179}
Best hyperparameters for fold 3 is saved
 best parameters for this fold {'lr': 2.956081389686663e-06, 'Batch_Size': 59, 'wd': 0.02693182529914402, 'conv1_filters': 24, 'conv2_filters': 7, 'Kernel_size1': 9, 'dropout_FFNN': 0.5477415819886495, 'dropout_CNN': 0.19609063154457387, 'FC_after_CNN': 204, 'FC_Aft

# ANN on DS13

In [7]:
# Loop over test folds DS13
# For every fold: tune hyperparameters with optuna on train/validation data,
# retrain a model with the best hyperparameters, evaluate it on the test split
# and collect the per-fold test results in CV_folds_test_vivo.
X_LENGTH = 1280            # passed as X_length to both the objective and the model builder
MAX_POOL_KERNEL_SIZE = 4   # passed as max_pool_kernel_size to both the objective and the model builder

CV_folds_test_vivo = {}
# Iterate over the fold list this cell actually reads (was len(DS12_folds))
for i in range(len(DS1_folds)):
    print("=================BEGINNING FOLD {0} ==============".format(i+1))

    # Positions 0-2 hold the train/val/test descriptors, 3-5 the matching targets
    X_train, Y_train = DS1_folds[i][0],DS1_folds[i][3]
    X_val,Y_val = DS1_folds[i][1],DS1_folds[i][4]
    X_test,Y_test = DS1_folds[i][2],DS1_folds[i][5]
    with open('../data/processed/Scaler_Y_{0}.pkl'.format(i),'rb') as f:
        scaler_Y = pickle.load(f)

    # Give the DS3 frames the same index as the descriptor frames
    X_train_seq = DS3_folds[i][0].astype(np.float64).set_index(X_train.index)
    X_val_seq = DS3_folds[i][1].astype(np.float64).set_index(X_val.index)
    X_test_seq = DS3_folds[i][2].astype(np.float64).set_index(X_test.index)

    dataset_train = Dataset_seq_embeddings(X_train,X_train_seq,Y_train)
    dataset_val = Dataset_seq_embeddings(X_val,X_val_seq,Y_val)
    dataset_test = Dataset_seq_embeddings(X_test,X_test_seq,Y_test)

    # Seed the sampler explicitly: optuna draws hyperparameters from its own
    # random state, so the global torch/numpy/random seeds alone do not make
    # the search reproducible.
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=seed))
    # NOTE(review): this save_path is 1-based while the one used for the final
    # model below is 0-based; save_model=False is passed here, so it is
    # presumably never written - confirm in objective_DS123.
    study.optimize(lambda trial: objective_DS123(trial,Data_train = dataset_train,Data_Val = dataset_val,
                                                 Scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = False,
                                                 save_path = "../models/VIVO_FFNN_DS13_fold{0}.pt".format(i+1),
                                                 X_length = X_LENGTH,
                                                 max_pool_kernel_size = MAX_POOL_KERNEL_SIZE),
                   n_trials = N_TRIALS)

    # Persist the best hyperparameters and keep using the in-memory dict
    # (no need to read the file back right after writing it)
    trial_ = study.best_trial
    best_params = trial_.params
    with open('../models/Optuna_DS13_fold{0}.pkl'.format(i),'wb') as f:
        pickle.dump(best_params,f)
    print("Best hyperparameters for fold {0} is saved".format(i+1))
    print(f" best parameters for this fold {trial_.params}")

    # Build model using best hyperparameters; the descriptor input width is
    # taken from this fold's own training frame
    model = model_DS123_build(best_params,input_dim_desc = X_train.shape[1],X_length = X_LENGTH,stride_CNN = 1,
                              conv_dilation1 = 1,padding1 = 0,max_pool_kernel_size = MAX_POOL_KERNEL_SIZE)

    # Train model (again) on best hyperparameters for train/val diagnostic plots:
    model_path = "../models/VIVO_FFNN_DS13_fold{0}.pt".format(i)
    Vivo_train_results = train_and_validate_1CNN(Params = best_params,Model = model,Data_train = dataset_train,Data_Val = dataset_val,
                                                 scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = True,save_path = model_path)

    # Make test set into dataloader (one batch containing the whole test split):
    test_loader = DataLoader(dataset = dataset_test,batch_size = X_test.shape[0],shuffle = False,drop_last = True)

    # Use best model to evaluate on unseen test data:
    Vivo_test_results = test_1CNN(model,best_params,test_loader,scaler_Y = scaler_Y,save_path = model_path)

    CV_folds_test_vivo[i] = Vivo_test_results

# Export the per-fold test results
with open("../data/processed/ANN_outer_5_test_DS13.pkl",'wb') as f:
    pickle.dump(CV_folds_test_vivo,f)

Best hyperparameters for fold 1 is saved
 best parameters for this fold {'lr': 0.00031669286821867933, 'Batch_Size': 43, 'wd': 0.05356013707206362, 'conv1_filters': 11, 'conv2_filters': 17, 'Kernel_size1': 17, 'dropout_FFNN': 0.36038155980729325, 'dropout_CNN': 0.11831898707823915, 'FC_after_CNN': 58, 'FC_After_DS12': 7, 'FC_Concatenation': 173}
Best hyperparameters for fold 2 is saved
 best parameters for this fold {'lr': 0.000806034805267031, 'Batch_Size': 32, 'wd': 0.008923520988573453, 'conv1_filters': 8, 'conv2_filters': 8, 'Kernel_size1': 22, 'dropout_FFNN': 0.20244355793914698, 'dropout_CNN': 0.505830198012171, 'FC_after_CNN': 261, 'FC_After_DS12': 29, 'FC_Concatenation': 155}
Best hyperparameters for fold 3 is saved
 best parameters for this fold {'lr': 4.826451234666785e-05, 'Batch_Size': 29, 'wd': 0.017180568205168823, 'conv1_filters': 6, 'conv2_filters': 20, 'Kernel_size1': 28, 'dropout_FFNN': 0.4969463969892908, 'dropout_CNN': 0.29261803195952346, 'FC_after_CNN': 230, 'FC_A

# ANN on DS23

In [8]:
# Loop over test folds DS23
# For every fold: tune hyperparameters with optuna on train/validation data,
# retrain a model with the best hyperparameters, evaluate it on the test split
# and collect the per-fold test results in CV_folds_test_vivo.
X_LENGTH = 1280            # passed as X_length to both the objective and the model builder
MAX_POOL_KERNEL_SIZE = 4   # passed as max_pool_kernel_size to both the objective and the model builder

CV_folds_test_vivo = {}
for i in range(len(DS2_folds)):
    print("=================BEGINNING FOLD {0} ==============".format(i+1))

    # Positions 0-2 hold the train/val/test descriptors, 3-5 the matching targets
    X_train, Y_train = DS2_folds[i][0],DS2_folds[i][3]
    X_val,Y_val = DS2_folds[i][1],DS2_folds[i][4]
    X_test,Y_test = DS2_folds[i][2],DS2_folds[i][5]
    with open('../data/processed/Scaler_Y_{0}.pkl'.format(i),'rb') as f:
        scaler_Y = pickle.load(f)

    # Give the DS3 frames the same index as the descriptor frames
    X_train_seq = DS3_folds[i][0].astype(np.float64).set_index(X_train.index)
    X_val_seq = DS3_folds[i][1].astype(np.float64).set_index(X_val.index)
    X_test_seq = DS3_folds[i][2].astype(np.float64).set_index(X_test.index)

    dataset_train = Dataset_seq_embeddings(X_train,X_train_seq,Y_train)
    dataset_val = Dataset_seq_embeddings(X_val,X_val_seq,Y_val)
    dataset_test = Dataset_seq_embeddings(X_test,X_test_seq,Y_test)

    # Seed the sampler explicitly: optuna draws hyperparameters from its own
    # random state, so the global torch/numpy/random seeds alone do not make
    # the search reproducible.
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=seed))
    # NOTE(review): this save_path is 1-based while the one used for the final
    # model below is 0-based; save_model=False is passed here, so it is
    # presumably never written - confirm in objective_DS123.
    study.optimize(lambda trial: objective_DS123(trial,Data_train = dataset_train,Data_Val = dataset_val,
                                                 Scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = False,
                                                 save_path = "../models/VIVO_FFNN_DS23_fold{0}.pt".format(i+1),
                                                 X_length = X_LENGTH,
                                                 max_pool_kernel_size = MAX_POOL_KERNEL_SIZE),
                   n_trials = N_TRIALS)

    # Persist the best hyperparameters and keep using the in-memory dict
    # (no need to read the file back right after writing it)
    trial_ = study.best_trial
    best_params = trial_.params
    with open('../models/Optuna_DS23_fold{0}.pkl'.format(i),'wb') as f:
        pickle.dump(best_params,f)
    print("Best hyperparameters for fold {0} is saved".format(i+1))
    print(f" best parameters for this fold {trial_.params}")

    # Build model using best hyperparameters; the descriptor input width is
    # taken from this fold's own training frame
    model = model_DS123_build(best_params,input_dim_desc = X_train.shape[1],X_length = X_LENGTH,stride_CNN = 1,
                              conv_dilation1 = 1,padding1 = 0,max_pool_kernel_size = MAX_POOL_KERNEL_SIZE)

    # Train model (again) on best hyperparameters for train/val diagnostic plots:
    model_path = "../models/VIVO_FFNN_DS23_fold{0}.pt".format(i)
    Vivo_train_results = train_and_validate_1CNN(Params = best_params,Model = model,Data_train = dataset_train,Data_Val = dataset_val,
                                                 scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = True,save_path = model_path)

    # Make test set into dataloader (one batch containing the whole test split):
    test_loader = DataLoader(dataset = dataset_test,batch_size = X_test.shape[0],shuffle = False,drop_last = True)

    # Use best model to evaluate on unseen test data:
    Vivo_test_results = test_1CNN(model,best_params,test_loader,scaler_Y = scaler_Y,save_path = model_path)

    CV_folds_test_vivo[i] = Vivo_test_results

# Export the per-fold test results
with open("../data/processed/ANN_outer_5_test_DS23.pkl",'wb') as f:
    pickle.dump(CV_folds_test_vivo,f)

Best hyperparameters for fold 1 is saved
 best parameters for this fold {'lr': 0.0007729051472160169, 'Batch_Size': 57, 'wd': 0.0775716293027236, 'conv1_filters': 18, 'conv2_filters': 15, 'Kernel_size1': 29, 'dropout_FFNN': 0.1397012527511596, 'dropout_CNN': 0.15833781475096448, 'FC_after_CNN': 152, 'FC_After_DS12': 9, 'FC_Concatenation': 89}
Best hyperparameters for fold 2 is saved
 best parameters for this fold {'lr': 1.0540231308127487e-06, 'Batch_Size': 30, 'wd': 0.032030188691277876, 'conv1_filters': 14, 'conv2_filters': 5, 'Kernel_size1': 6, 'dropout_FFNN': 0.4560926634618624, 'dropout_CNN': 0.1300447318733211, 'FC_after_CNN': 225, 'FC_After_DS12': 12, 'FC_Concatenation': 159}
Best hyperparameters for fold 3 is saved
 best parameters for this fold {'lr': 1.1556067568352983e-05, 'Batch_Size': 38, 'wd': 0.007757898143108455, 'conv1_filters': 14, 'conv2_filters': 20, 'Kernel_size1': 22, 'dropout_FFNN': 0.03563629578045857, 'dropout_CNN': 0.4766277766303989, 'FC_after_CNN': 270, 'FC_

# ANN on DS3

In [9]:
# Loop over test folds DS3
# For every fold: tune hyperparameters with optuna on train/validation data,
# retrain a model with the best hyperparameters, evaluate it on the test split
# and collect the per-fold test results in CV_folds_test_vivo.
X_LENGTH = 1280            # passed as X_length to both the objective and the model builder
MAX_POOL_KERNEL_SIZE = 4   # passed as max_pool_kernel_size to both the objective and the model builder

CV_folds_test_vivo = {}
# Iterate over the fold list that carries the DS3 data (was len(DS12_folds))
for i in range(len(DS3_folds)):
    print("=================BEGINNING FOLD {0} ==============".format(i+1))

    # The DS2 frames are only used as a template for index/columns and as the
    # source of the targets (positions 3-5); their values are zeroed below.
    X_train, Y_train = DS2_folds[i][0],DS2_folds[i][3]
    X_val,Y_val = DS2_folds[i][1],DS2_folds[i][4]
    X_test,Y_test = DS2_folds[i][2],DS2_folds[i][5]

    ## Set all numeric entries to 0 (as we only have sequential data in this case). Inefficient but valid if we want to keep using the current framework and functions:
    X_train = pd.DataFrame(0.0, index=X_train.index, columns=X_train.columns)
    X_val = pd.DataFrame(0.0, index=X_val.index, columns=X_val.columns)
    X_test = pd.DataFrame(0.0, index=X_test.index, columns=X_test.columns)

    with open('../data/processed/Scaler_Y_{0}.pkl'.format(i),'rb') as f:
        scaler_Y = pickle.load(f)

    # Give the DS3 frames the same index as the (zeroed) descriptor frames
    X_train_seq = DS3_folds[i][0].astype(np.float64).set_index(X_train.index)
    X_val_seq = DS3_folds[i][1].astype(np.float64).set_index(X_val.index)
    X_test_seq = DS3_folds[i][2].astype(np.float64).set_index(X_test.index)

    dataset_train = Dataset_seq_embeddings(X_train,X_train_seq,Y_train)
    dataset_val = Dataset_seq_embeddings(X_val,X_val_seq,Y_val)
    dataset_test = Dataset_seq_embeddings(X_test,X_test_seq,Y_test)

    # Seed the sampler explicitly: optuna draws hyperparameters from its own
    # random state, so the global torch/numpy/random seeds alone do not make
    # the search reproducible.
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=seed))
    # NOTE(review): this save_path is 1-based while the one used for the final
    # model below is 0-based; save_model=False is passed here, so it is
    # presumably never written - confirm in objective_DS123.
    study.optimize(lambda trial: objective_DS123(trial,Data_train = dataset_train,Data_Val = dataset_val,
                                                 Scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = False,
                                                 save_path = "../models/VIVO_FFNN_DS3_fold{0}.pt".format(i+1),
                                                 X_length = X_LENGTH,
                                                 max_pool_kernel_size = MAX_POOL_KERNEL_SIZE),
                   n_trials = N_TRIALS)

    # Persist the best hyperparameters and keep using the in-memory dict
    # (no need to read the file back right after writing it)
    trial_ = study.best_trial
    best_params = trial_.params
    with open('../models/Optuna_DS3_fold{0}.pkl'.format(i),'wb') as f:
        pickle.dump(best_params,f)
    print("Best hyperparameters for fold {0} is saved".format(i+1))
    print(f" best parameters for this fold {trial_.params}")

    # Build model using best hyperparameters; the descriptor input width is
    # the column count of the zeroed template frame
    model = model_DS123_build(best_params,input_dim_desc = X_train.shape[1],X_length = X_LENGTH,stride_CNN = 1,
                              conv_dilation1 = 1,padding1 = 0,max_pool_kernel_size = MAX_POOL_KERNEL_SIZE)

    # Train model (again) on best hyperparameters for train/val diagnostic plots:
    model_path = "../models/VIVO_FFNN_DS3_fold{0}.pt".format(i)
    Vivo_train_results = train_and_validate_1CNN(Params = best_params,Model = model,Data_train = dataset_train,Data_Val = dataset_val,
                                                 scaler_Y = scaler_Y,EPOCH = EPOCH,save_model = True,save_path = model_path)

    # Make test set into dataloader (one batch containing the whole test split):
    test_loader = DataLoader(dataset = dataset_test,batch_size = X_test.shape[0],shuffle = False,drop_last = True)

    # Use best model to evaluate on unseen test data:
    Vivo_test_results = test_1CNN(model,best_params,test_loader,scaler_Y = scaler_Y,save_path = model_path)

    CV_folds_test_vivo[i] = Vivo_test_results

# Export the per-fold test results
with open("../data/processed/ANN_outer_5_test_DS3.pkl",'wb') as f:
    pickle.dump(CV_folds_test_vivo,f)

Best hyperparameters for fold 1 is saved
 best parameters for this fold {'lr': 0.006097975885013286, 'Batch_Size': 46, 'wd': 0.08790131044322197, 'conv1_filters': 18, 'conv2_filters': 7, 'Kernel_size1': 15, 'dropout_FFNN': 0.5100887358822496, 'dropout_CNN': 0.0058940720826566205, 'FC_after_CNN': 66, 'FC_After_DS12': 27, 'FC_Concatenation': 91}
Best hyperparameters for fold 2 is saved
 best parameters for this fold {'lr': 2.233635207418317e-06, 'Batch_Size': 50, 'wd': 0.09962402941452857, 'conv1_filters': 7, 'conv2_filters': 28, 'Kernel_size1': 26, 'dropout_FFNN': 0.5329785899636839, 'dropout_CNN': 0.13651865270840752, 'FC_after_CNN': 64, 'FC_After_DS12': 17, 'FC_Concatenation': 175}
Best hyperparameters for fold 3 is saved
 best parameters for this fold {'lr': 0.0022418643070009677, 'Batch_Size': 15, 'wd': 0.04313223896361876, 'conv1_filters': 2, 'conv2_filters': 31, 'Kernel_size1': 19, 'dropout_FFNN': 0.39062543256405874, 'dropout_CNN': 0.003860266075165741, 'FC_after_CNN': 64, 'FC_Af