# In [None]:
## Imports

# import python libraries  
import numpy as np
import pandas as pd
import sys
import os
import time
import pickle
import tensorflow as tf
import time
from importlib import reload
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import pearsonr
from tensorflow.keras.callbacks import LearningRateScheduler

from models.one_hot_drugs_models import build_Phos_Prot_CNN 
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from scripts.data_wrangling import import_phos
from scripts.feature_selection import fs_landmark, fs_atlas_landmark, true_phosphosite_filter
from scripts.data_preparation import data_prep
from scripts.data_selection import data_indexing
from scripts.model_training import train_model_multi, prediction_metrics, scheduler, split_all_scores

print('Imports: Done')
# Fix the random seed up front so model initialisation and training are repeatable.
# NOTE(review): 270 and 345 look like alternative seeds used for other runs - confirm.
tf.keras.utils.set_random_seed(524) # 524, 270, 345

## Data Wrangling, feature selection and preparation

# NOTE(review): import_all was called here without ever being imported (only
# import_phos is imported at the top of the file), which raises NameError.
# It is presumably defined next to import_phos in scripts.data_wrangling -
# confirm the module before relying on this import.
from scripts.data_wrangling import import_all

# Import and wrangle phosphoproteomics, proteomics and GDSC1 drug data
(
    phos_df,
    prot_df,
    drug_df,
    drug_matrix,
    _all_cls,
    _all_drugs,
    common_ind,
) = import_all()

# Phosphoproteomics features: select candidate phosphosites, remove false
# positives, then keep only the surviving columns.
phosphosites = fs_atlas_landmark(phos_df, 80)
filtered_phosphosites = true_phosphosite_filter(phosphosites)
phos_df = phos_df.filter(filtered_phosphosites, axis=1)

# Proteomics features: keep only the landmark gene proteins.
landmark_proteins = fs_landmark(prot_df)
prot_df = prot_df.filter(landmark_proteins, axis=1)

## Data Preparation & Selection
# data_prep is run once per omics table; the second call rebinds x_drug and
# y_series, so the values used downstream are the ones from the proteomics call.
x_drug, x_all_phos, y_series = data_prep(drug_df, phos_df, common_ind)
x_drug, x_all_prot, y_series = data_prep(drug_df, prot_df, common_ind)

# import stored training and testing pairs
# NOTE: pickle.load can execute arbitrary code - only load split files that
# were produced by this project.
with open('train_test_pairs/_train_test_pairs_1.ob', 'rb') as fp:
    train_test_list = pickle.load(fp)

## Training model and testing

all_scores = []  # collects the result of train_model_multi for every split
trials = len(train_test_list) # number of train/test splits tested 
epochs = 60 # max training epochs (60)

for i, split in enumerate(train_test_list):
    print(f"\nTrial {i+1}\n")

    # each stored split keeps the training pairs first and the test pairs second
    train_pairs, test_pairs = split[0], split[1]

    # Index every omics table with the same pairs. Both calls also return the
    # drug inputs and targets; the values from the second call are the ones kept.
    xo_train_phos, xd_train, y_train, xo_test_phos, xd_test, y_test = data_indexing(
        train_pairs, test_pairs, x_all_phos, x_drug, y_series
    )
    xo_train_prot, xd_train, y_train, xo_test_prot, xd_test, y_test = data_indexing(
        train_pairs, test_pairs, x_all_prot, x_drug, y_series
    )

    # build a fresh model and learning-rate callback for this split
    m_func = build_Phos_Prot_CNN(x_all_phos, x_all_prot, x_drug)
    lr_scheduler = LearningRateScheduler(scheduler)

    # inputs are passed in the order: phospho features, protein features, drug features
    scores = train_model_multi(
        m_func,
        lr_scheduler,
        x_train=[xo_train_phos, xo_train_prot, xd_train],
        x_test=[xo_test_phos, xo_test_prot, xd_test],
        y_train=y_train,
        y_test=y_test,
        epochs=epochs,
    )

    all_scores.append(scores)
    
# split scores into individual lists
r2_scores, mse_scores, pearson_scores = split_all_scores(all_scores)

# For every metric print the mean across trials followed by the per-trial
# values, one per line. Each list is labelled with its own metric name (the
# MSE and Pearson lists used to be printed under an "r2 scores:" header).
# Loop names are chosen so the module-level `scores` variable is not overwritten.
for metric_name, metric_scores in (
    ('r2', r2_scores),
    ('mse', mse_scores),
    ('pearson', pearson_scores),
):
    print(f'\nmean {metric_name}: ', np.mean(metric_scores))
    print(f'\n{metric_name} scores:')
    print(*metric_scores, sep='\n')