## JCP Surrodash K-Fold Cross Validation Example Notebook
The following two cells load the required files and dependencies from the GitHub repo so that the ML modelling code can run.

In [1]:
#### ALL NOTEBOOKS SHOULD HAVE SOME VERSION OF THIS ####################################
########################################################################################
%load_ext autoreload
%autoreload 2
import os
import sys

# Work out which directory to put on the import path so the project modules resolve.
currentdir = os.getcwd()
# Candidate root: one level above the working directory. Add more os.path.dirname
# calls here if the notebook sits deeper in the tree.
one_level_up = os.path.dirname(currentdir)
# Check where we ended up: if the candidate path no longer contains the repository
# name we went too far up the tree, so fall back to the working directory.
parentdir = (
    one_level_up
    if 'Protein-Purification-Model-Public' in one_level_up
    else currentdir
)
# Register the chosen directory at the front of sys.path, but only once.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)
########################################################################################

In [2]:
# third-party
import tensorflow as tf

# import py_files

import utils
import visualization.simple_data_vis as vis
import surrogate_models.nn_defs as engine

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [3]:
# Read the scan results through the project loader, which is given the
# directory resolved in the first cell and the CSV file name.
filename = 'mol_res_scan_results_7.csv'
data = utils.load_data(parentdir, filename)

# `data` is currently one big dataframe. Columns 2 and 3 are the targets
# (purity, yield); the first two columns plus everything from column 4 onward
# are the model inputs.
y = data.columns[2:4]
x = list(data.columns[:2]) + list(data.columns[4:])

In [7]:
# Number of cross-validation folds.
CV = 5

# First split off a separate `validation` portion (test_size=0.20), then let
# data_pipeline set up CV separate train/test folds from the remaining data.
data2split, validation = utils.chroma_train_test_split(data, test_size=0.20)
trains, tests = utils.data_pipeline(
    [data2split],
    x_data=x,
    y_data=y,
    cross_val=CV,
)

In [10]:
# Build one [deterministic linear regressor, probabilistic NN] pair per fold.
# Each model name carries the fold index and the file name minus its last four
# characters (the '.csv' suffix).
# NOTE: `dlr` and `pnn` stay bound to the last fold's models after the loop;
# the summary cell below reads `dlr`.
models = []
for i in range(CV):
    name_suffix = str(i) + '_' + filename[:-4]

    dlr = engine.create_deterministic_linear_regressor(
        feature_names=x,
        target_names=y,
        name='DLR_' + name_suffix,
    )
    pnn = engine.create_probabilistic_nn(
        feature_names=x,
        target_names=y,
        hidden_units=[16, 8, 4],
        name='PNN_' + name_suffix,
    )

    models.append([dlr, pnn])

In [11]:
# Show the architecture of the last fold's linear regressor (the first entry of
# the last [dlr, pnn] pair appended to `models`).
models[-1][0].summary()

Model: "DLR_4_mol_res_scan_results_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 18)]         0                                            
__________________________________________________________________________________________________
dense_20 (Dense)                (None, 1)            19          input[0][0]                      
__________________________________________________________________________________________________
yield (Dense)                   (None, 1)            2           dense_20[0][0]                   
__________________________________________________________________________________________________
purity (Dense)                  (None, 1)            2           dense_20[0][0]                   
Total params: 23
Trainable params: 23
Non-trainable params: 0
_________

In [17]:
# Train all the models under the same conditions.
# NOTE: `tf` must be imported in the notebook's import cell; without it this cell
# raises NameError on a fresh kernel.
learning_rate = 0.01
epochs = 100
# Optimizer class handed to engine.run_experiment. The name stored in `settings`
# is derived from it so the recorded value always matches what was actually used.
optimizer_class = tf.keras.optimizers.Adam
optimizer = optimizer_class.__name__
# Losses are paired with the models of a fold in order ([dlr, pnn]); zip stops at
# the shorter sequence, so only as many entries as there are models are consumed.
losses = ['mean_squared_error', engine.negative_loglikelihood]*2
# Loss weights: inverse of each column mean divided by the largest column mean,
# rounded to 2 decimals. Computed once from trains[0][0][1] and reused for every fold.
# NOTE(review): presumably trains[0][0][1] is the target frame of fold 0 - confirm
# against utils.data_pipeline.
loss_weights = (1/trains[0][0][1].mean().div(trains[0][0][1].mean().max())).round(2).to_dict()
histories = {}

# Train and evaluate every model on every cross-validation fold; the value
# returned by run_experiment is stored under the model's name.

for i in range(CV):
    print('CV round '+str(i))
    for m,l in zip(models[i], losses):
        histories[utils.get_model_name(m,filename)] = engine.run_experiment(
            model = m, 
            loss = {y[0]:l,y[1]:l},  # same loss for both targets
            loss_weights = loss_weights,
            optimizer = optimizer_class,
            learning_rate = learning_rate,
            num_epochs = epochs,
            train_dataset = trains[0][i], 
            test_dataset = tests[0][i],
            verbose = 0,
            log = 0
            )

# Record the training configuration alongside the results.
settings = {'learning_rate' : learning_rate,
            'epochs' : epochs,
            'optimizer': optimizer,
            'loss_weights': loss_weights,
            'dataset' : filename}

CV round 0
Start training the model DLR_0_mol_res_scan_results_7 ...
Evaluating model performance...
Train MSE: 0.738
Test MSE: 0.739
Start training the model PNN_0_mol_res_scan_results_7 ...
Evaluating model performance...
Train MSE: 0.082
Test MSE: 0.086
CV round 1
Start training the model DLR_1_mol_res_scan_results_7 ...
Evaluating model performance...
Train MSE: 0.747
Test MSE: 0.755
Start training the model PNN_1_mol_res_scan_results_7 ...
Evaluating model performance...
Train MSE: 0.023
Test MSE: 0.023
CV round 2
Start training the model DLR_2_mol_res_scan_results_7 ...
Evaluating model performance...
Train MSE: 0.734
Test MSE: 0.729
Start training the model PNN_2_mol_res_scan_results_7 ...
Evaluating model performance...
Train MSE: 0.041
Test MSE: 0.04
CV round 3
Start training the model DLR_3_mol_res_scan_results_7 ...
Evaluating model performance...
Train MSE: 0.721
Test MSE: 0.71
Start training the model PNN_3_mol_res_scan_results_7 ...
Evaluating model performance...
Train M