In [1]:
## Imports

# import python libraries  
import numpy as np
import pandas as pd
import sys
import os
import time
import pickle
import tensorflow as tf
import time
from importlib import reload
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import pearsonr
from tensorflow.keras.callbacks import LearningRateScheduler

from models.one_hot_drugs_models import build_CNN_Phos_FS_2 
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from scripts.data_wrangling import import_phos
from scripts.feature_selection import fs_functional_score, true_phosphosite_filter
from scripts.data_preparation import data_prep
from scripts.data_selection import data_indexing
from scripts.model_training import train_model, train_model_cv, train_model_ML, scheduler, split_all_scores

# Global random seed for this run (values listed in the original note: 524, 270, 345).
SEED = 524
tf.keras.utils.set_random_seed(SEED)
print('Imports: Done')

## Data wrangling, feature selection and preparation

# Tunable inputs for this stage, kept together so they are easy to find and change.
# Percentile handed to fs_functional_score (the recorded run output reports it as
# "cutoff percentile: 94").
FS_CUTOFF_PERCENTILE = 94
# Pickled collection of train/test pair splits consumed by the training loop.
TRAIN_TEST_PAIRS_PATH = 'train_test_pairs/_train_test_pairs_20.ob'

# Import and wrangle phosphoproteomics and GDSC1 drug data
phos_df, drug_df, drug_matrix, _all_cls, _all_drugs, common_ind = import_phos()
# Create list of phosphosites filtered by feature selection method
phosphosites = fs_functional_score(phos_df, FS_CUTOFF_PERCENTILE)
# Keep only the columns named in the selected phosphosite list
phos_df = phos_df.filter(phosphosites, axis=1)
# Create dataframes for omics, drugs and target values
x_drug, x_all, y_series = data_prep(drug_df, phos_df, common_ind)
# Load the stored training and testing pairs.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pair files that were produced by this project.
with open(TRAIN_TEST_PAIRS_PATH, 'rb') as fp:
    train_test_list = pickle.load(fp)

## Model training and testing

all_scores = []  # one entry of scores per trial, in trial order
trials = len(train_test_list)  # number of train/test splits tested
epochs = 50  # max training epochs (50)

for i in range(trials):
    print(f"\nTrial {i+1}\n")

    # each stored split is indexed as [0] = training pairs, [1] = testing pairs
    split = train_test_list[i]
    train_pairs, test_pairs = split[0], split[1]

    # index the omics inputs, drug inputs and targets for this split
    (xo_train, xd_train, y_train,
     xo_test, xd_test, y_test) = data_indexing(train_pairs, test_pairs, x_all, x_drug, y_series)

    # a new model and a new learning-rate scheduler callback are created for every trial
    m_func = build_CNN_Phos_FS_2(xo_train, xd_train)
    lr_scheduler = LearningRateScheduler(scheduler)

    scores = train_model(
        m_func,
        lr_scheduler,
        x_train=[xo_train, xd_train],
        x_test=[xo_test, xd_test],
        y_train=y_train,
        y_test=y_test,
        epochs=epochs,
    )
    all_scores.append(scores)

    
# split the collected per-trial scores into individual lists, one per metric
r2_scores, mse_scores, pearson_scores = split_all_scores(all_scores)

# For every metric print its mean followed by the individual values, one per line.
# The header above each list is derived from the metric name so it always
# matches the list being printed (the MSE and Pearson lists were previously
# both printed under an "r2 scores:" header).
for metric_name, metric_scores in (
    ('r2', r2_scores),
    ('mse', mse_scores),
    ('pearson', pearson_scores),
):
    print(f'\nmean {metric_name}: ', np.mean(metric_scores))
    print(f'\n{metric_name} scores:')
    print(*metric_scores, sep='\n')

2023-08-22 23:07:24.608806: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-22 23:07:24.611210: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-22 23:07:24.658256: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-22 23:07:24.659276: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Imports: Done
Data Imports and Wrangling: Done
cutoff percentile:  94
functional score cutoff:  0.5561424180240202

Trial 1

230.819 seconds
r2  score:  0.6854094339073435
mse score:  0.2857070693786897
-----

Trial 2

230.769 seconds
r2  score:  0.7885169800194393
mse score:  0.1940936300603443
-----

Trial 3

232.661 seconds
r2  score:  0.7456386936378459
mse score:  0.2625741486166483
-----

Trial 4

228.832 seconds
r2  score:  0.7327504930582178
mse score:  0.2700828238770411
-----

Trial 5

222.327 seconds
r2  score:  0.769898684465138
mse score:  0.23483494359092966
-----

Trial 6

224.892 seconds
r2  score:  0.7266795493119481
mse score:  0.2781314290908234
-----

Trial 7

227.448 seconds
r2  score:  0.7360291240936706
mse score:  0.26655594742927785
-----

Trial 8

225.784 seconds
r2  score:  0.7092538952033927
mse score:  0.28219493191102696
-----

Trial 9

229.430 seconds
r2  score:  0.7337417003970701
mse score:  0.2624120103688247
-----

Trial 10

223.230 seconds
r2  score: