In [1]:
# These imports are the ones already used in the train script
import os
import json
import sys
import warnings
from pathlib import Path
from pprint import pformat
from typing import Dict, Union
import tensorflow as tf

import pickle
import pandas as pd
import numpy as np
from tensorflow.keras import backend as K
# generator related imports
from New_data_generator_with_tf import DataGenerator, BootstrapGenerator, batch_predict
from tensorflow.keras.utils import Sequence

# [Req] IMPROVE imports
# notice that the improvelibs are in the folder that is a level above, but in the same parent directory
sys.path.append(os.path.abspath(os.path.join('..', 'IMPROVE')))
from improvelib.applications.drug_response_prediction.config import DRPTrainConfig
from improvelib.utils import str2bool
import improvelib.utils as frm
from improvelib.metrics import compute_metrics

# Model-specific imports
from model_params_def import train_params # [Req]

2025-04-21 12:39:06.947421: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-21 12:39:07.967066: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Module-level defaults picked up by the signature of deepcdrgcn below.
training = False
dropout1 = 0.10
dropout2 = 0.20


## get the model architecture
def deepcdrgcn(dict_features, dict_adj_mat, samp_drug, samp_ach, cancer_dna_methy_model, cancer_gen_expr_model, cancer_gen_mut_model, training = training, dropout1 = dropout1, dropout2 = dropout2):
    """Build the (uncompiled) Keras model: a drug GCN branch plus three omics branches.

    Parameters
    ----------
    dict_features, dict_adj_mat
        Mappings indexed by drug id. Only the entry for ``samp_drug`` is read,
        and only its ``shape[0]`` is used, to size the drug feature input
        ``(n, 75)`` and the square adjacency input ``(n, n)``.
    samp_drug
        Drug id used solely to look up the shapes above.
    samp_ach
        Sample passed eagerly through ``cancer_gen_mut_model``; the length of
        the returned vector sizes the reshape in the mutation branch.
    cancer_dna_methy_model, cancer_gen_expr_model, cancer_gen_mut_model
        Keras models applied to string inputs of shape ``(1,)``. They are
        frozen here (``trainable = False`` is set on the model objects).
    training
        Value forwarded as ``training=`` to the Dropout calls (see the
        NOTE(review) in the mutation branch for the one exception).
    dropout1, dropout2
        Dropout rates; ``dropout2`` is only used right before the output layer.

    Returns
    -------
    tf.keras.models.Model
        Inputs, in order: drug features, normalised adjacency matrix,
        expression ids, methylation ids, mutation ids. Output: one linear unit.
    """
    # ---- Drug branch (GCN): three rounds of adjacency mixing -> Dense -> BatchNorm -> Dropout
    input_gcn_features = tf.keras.layers.Input(shape = (dict_features[samp_drug].shape[0], 75))
    input_norm_adj_mat = tf.keras.layers.Input(shape = (dict_adj_mat[samp_drug].shape[0], dict_adj_mat[samp_drug].shape[0]))
    # Dot(1) contracts axis 1 of both operands (adjacency matrix with node features).
    mult_1 = tf.keras.layers.Dot(1)([input_norm_adj_mat, input_gcn_features])
    dense_layer_gcn = tf.keras.layers.Dense(256, activation = "relu")
    dense_out = dense_layer_gcn(mult_1)
    dense_out = tf.keras.layers.BatchNormalization()(dense_out)
    dense_out = tf.keras.layers.Dropout(dropout1)(dense_out, training = training)
    mult_2 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
    dense_layer_gcn = tf.keras.layers.Dense(256, activation = "relu")
    dense_out = dense_layer_gcn(mult_2)
    dense_out = tf.keras.layers.BatchNormalization()(dense_out)
    dense_out = tf.keras.layers.Dropout(dropout1)(dense_out, training = training)

    dense_layer_gcn = tf.keras.layers.Dense(100, activation = "relu")
    mult_3 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
    dense_out = dense_layer_gcn(mult_3)
    dense_out = tf.keras.layers.BatchNormalization()(dense_out)
    dense_out = tf.keras.layers.Dropout(dropout1)(dense_out, training = training)

    # Average over the node axis -> one 100-d vector per drug.
    dense_out = tf.keras.layers.GlobalAvgPool1D()(dense_out)
    # All above code is for GCN for drugs

    # ---- Methylation branch
    input_gen_methy1 = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
    # Freeze the model itself. The previous code set `.trainable` on the model's
    # output tensor, which does not freeze any weights.
    cancer_dna_methy_model.trainable = False
    input_gen_methy = cancer_dna_methy_model(input_gen_methy1)
    gen_methy_layer = tf.keras.layers.Dense(256, activation = "tanh")

    gen_methy_emb = gen_methy_layer(input_gen_methy)
    gen_methy_emb = tf.keras.layers.BatchNormalization()(gen_methy_emb)
    gen_methy_emb = tf.keras.layers.Dropout(dropout1)(gen_methy_emb, training = training)
    gen_methy_layer = tf.keras.layers.Dense(100, activation = "relu")
    gen_methy_emb = gen_methy_layer(gen_methy_emb)

    # ---- Gene expression branch (same layout as the methylation branch)
    input_gen_expr1 = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
    cancer_gen_expr_model.trainable = False
    input_gen_expr = cancer_gen_expr_model(input_gen_expr1)
    gen_expr_layer = tf.keras.layers.Dense(256, activation = "tanh")

    gen_expr_emb = gen_expr_layer(input_gen_expr)
    gen_expr_emb = tf.keras.layers.BatchNormalization()(gen_expr_emb)
    gen_expr_emb = tf.keras.layers.Dropout(dropout1)(gen_expr_emb, training = training)
    gen_expr_layer = tf.keras.layers.Dense(100, activation = "relu")
    gen_expr_emb = gen_expr_layer(gen_expr_emb)

    # ---- Mutation branch (convolutional)
    input_gen_mut1 = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
    cancer_gen_mut_model.trainable = False
    input_gen_mut = cancer_gen_mut_model(input_gen_mut1)

    # Run the model eagerly on one sample to learn the vector length, then view
    # it as a (1, length, 1) image so Conv2D can slide along it.
    reshape_gen_mut = tf.keras.layers.Reshape((1, cancer_gen_mut_model(samp_ach).numpy().shape[0], 1))
    reshape_gen_mut = reshape_gen_mut(input_gen_mut)
    gen_mut_layer = tf.keras.layers.Conv2D(50, (1, 700), strides=5, activation = "tanh")
    gen_mut_emb = gen_mut_layer(reshape_gen_mut)
    pool_layer = tf.keras.layers.MaxPooling2D((1,5))
    pool_out = pool_layer(gen_mut_emb)
    gen_mut_layer = tf.keras.layers.Conv2D(30, (1, 5), strides=2, activation = "relu")
    gen_mut_emb = gen_mut_layer(pool_out)
    pool_layer = tf.keras.layers.MaxPooling2D((1,10))
    pool_out = pool_layer(gen_mut_emb)
    flatten_layer = tf.keras.layers.Flatten()
    flatten_out = flatten_layer(pool_out)
    x_mut = tf.keras.layers.Dense(100,activation = 'relu')(flatten_out)
    # NOTE(review): this is the only Dropout called without `training=`, so it
    # follows Keras' own train/inference phase instead of the `training` flag.
    # Left unchanged on purpose; confirm whether that asymmetry is intended.
    x_mut = tf.keras.layers.Dropout(dropout1)(x_mut)

    # ---- Fusion head: concatenate the four 100-d embeddings, then a 1-D conv stack
    all_omics = tf.keras.layers.Concatenate()([dense_out, gen_methy_emb, gen_expr_emb, x_mut])
    x = tf.keras.layers.Dense(300,activation = 'tanh')(all_omics)
    x = tf.keras.layers.Dropout(dropout1)(x, training = training)
    # (batch, 300) -> (batch, 300, 1) -> (batch, 1, 300, 1) so Conv2D can be applied.
    x = tf.keras.layers.Lambda(lambda x: K.expand_dims(x,axis=-1))(x)
    x = tf.keras.layers.Lambda(lambda x: K.expand_dims(x,axis=1))(x)
    x = tf.keras.layers.Conv2D(filters=30, kernel_size=(1,150),strides=(1, 1), activation = 'relu',padding='valid')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(1,2))(x)
    x = tf.keras.layers.Conv2D(filters=10, kernel_size=(1,5),strides=(1, 1), activation = 'relu',padding='valid')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(1,3))(x)
    x = tf.keras.layers.Conv2D(filters=5, kernel_size=(1,5),strides=(1, 1), activation = 'relu',padding='valid')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(1,3))(x)
    x = tf.keras.layers.Dropout(dropout1)(x, training = training)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dropout(dropout2)(x, training = training)
    final_out_layer = tf.keras.layers.Dense(1, activation = "linear")
    final_out = final_out_layer(x)

    # Input order matters to callers: expression ids come before methylation ids.
    simplecdr = tf.keras.models.Model([input_gcn_features, input_norm_adj_mat, input_gen_expr1,
                                   input_gen_methy1, input_gen_mut1], final_out)

    return simplecdr

In [3]:
# specify the directory where preprocessed data is stored
# (relative path; the saved models, pickles and y-data CSVs loaded below are all
# resolved against it with os.path.join(data_dir, ...))
data_dir = 'exp_result'

In [4]:
%%time
# Restore the three saved Keras models stored under data_dir
# (loaded in this order: expression, mutation, methylation).
cancer_gen_expr_model, cancer_gen_mut_model, cancer_dna_methy_model = (
    tf.keras.models.load_model(os.path.join(data_dir, saved_name))
    for saved_name in (
        "cancer_gen_expr_model",
        "cancer_gen_mut_model",
        "cancer_dna_methy_model",
    )
)

2025-04-21 12:39:23.829398: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-21 12:39:25.576670: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30960 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:06:00.0, compute capability: 7.0














CPU times: user 1.09 s, sys: 764 ms, total: 1.85 s
Wall time: 3.61 s


In [5]:
# Mark all three loaded models as non-trainable.
for loaded_model in (cancer_gen_expr_model, cancer_gen_mut_model, cancer_dna_methy_model):
    loaded_model.trainable = False

In [6]:
# Per-drug feature arrays, keyed by drug id (see the lookups by Drug_ID below).
# pickle can execute code on load: only read files produced by your own preprocessing.
drug_features_path = os.path.join(data_dir, "drug_features.pickle")
with open(drug_features_path, mode="rb") as f:
    dict_features = pickle.load(f)

In [7]:
# Per-drug adjacency matrices, keyed by drug id (see the lookups by Drug_ID below).
# pickle can execute code on load: only read files produced by your own preprocessing.
norm_adj_mat_path = os.path.join(data_dir, "norm_adj_mat.pickle")
with open(norm_adj_mat_path, mode="rb") as f:
    dict_adj_mat = pickle.load(f)

In [8]:
# Read the train and validation response tables from data_dir.
train_keep, valid_keep = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ("train_y_data.csv", "val_y_data.csv")
)

In [9]:
train_keep.head()

Unnamed: 0,improve_sample_id,improve_chem_id,auc
0,ACH-000956,Drug_749,0.7153
1,ACH-000956,Drug_1326,0.9579
2,ACH-000956,Drug_490,0.413
3,ACH-000956,Drug_558,0.8004
4,ACH-000956,Drug_195,0.5743


In [10]:
valid_keep.head()

Unnamed: 0,improve_sample_id,improve_chem_id,auc
0,ACH-000956,Drug_895,0.5071
1,ACH-000956,Drug_614,0.6525
2,ACH-000956,Drug_1040,0.8944
3,ACH-000323,Drug_1279,0.8691
4,ACH-000323,Drug_685,0.8527


In [11]:
train_keep.shape, valid_keep.shape

((7616, 3), (952, 3))

In [12]:
# Positional rename of the three columns on both frames
# (this overwrites the column labels in place and relies on the column order).
for response_frame in (train_keep, valid_keep):
    response_frame.columns = ["Cell_Line", "Drug_ID", "AUC"]

In [13]:
train_keep.head()

Unnamed: 0,Cell_Line,Drug_ID,AUC
0,ACH-000956,Drug_749,0.7153
1,ACH-000956,Drug_1326,0.9579
2,ACH-000956,Drug_490,0.413
3,ACH-000956,Drug_558,0.8004
4,ACH-000956,Drug_195,0.5743


In [14]:
valid_keep.head()

Unnamed: 0,Cell_Line,Drug_ID,AUC
0,ACH-000956,Drug_895,0.5071
1,ACH-000956,Drug_614,0.6525
2,ACH-000956,Drug_1040,0.8944
3,ACH-000323,Drug_1279,0.8691
4,ACH-000323,Drug_685,0.8527


In [15]:
# Pick one drug id and one cell-line id (the last distinct value of each column
# in the validation table); deepcdrgcn uses them only to size its layers.
distinct_drug_ids = valid_keep["Drug_ID"].unique()
distinct_cell_lines = valid_keep["Cell_Line"].unique()
samp_drug = distinct_drug_ids[-1]
samp_ach = np.array(distinct_cell_lines[-1])

In [16]:
# Show the chosen sample drug and cell line, one per line.
print(samp_drug, samp_ach, sep="\n")

Drug_1326
ACH-000828


In [17]:
# One feature array and one adjacency matrix per training row, looked up by drug id.
train_gcn_feats = [dict_features[drug_id] for drug_id in train_keep["Drug_ID"].values]
train_adj_list = [dict_adj_mat[drug_id] for drug_id in train_keep["Drug_ID"].values]

In [18]:
len(train_gcn_feats), len(train_adj_list)

(7616, 7616)

In [19]:
# One feature array and one adjacency matrix per validation row, looked up by drug id.
valid_gcn_feats = [dict_features[drug_id] for drug_id in valid_keep["Drug_ID"].values]
valid_adj_list = [dict_adj_mat[drug_id] for drug_id in valid_keep["Drug_ID"].values]

In [20]:
len(valid_gcn_feats), len(valid_adj_list)

(952, 952)

In [21]:
len(valid_gcn_feats) + len(train_gcn_feats)

8568

In [22]:
%%time
# Stack each per-row list into a single ndarray stored as float32
# (single precision; the cast is to float32, not float16).
train_gcn_feats, valid_gcn_feats = (
    np.array(per_row_arrays).astype("float32")
    for per_row_arrays in (train_gcn_feats, valid_gcn_feats)
)

train_adj_list, valid_adj_list = (
    np.array(per_row_arrays).astype("float32")
    for per_row_arrays in (train_adj_list, valid_adj_list)
)

CPU times: user 528 ms, sys: 785 ms, total: 1.31 s
Wall time: 1.32 s


In [23]:
train_gcn_feats.shape

(7616, 223, 75)

In [24]:
type(train_gcn_feats)

numpy.ndarray

In [25]:
train_adj_list.shape

(7616, 223, 223)

In [26]:
type(train_adj_list)

numpy.ndarray

In [27]:
train_keep["Cell_Line"].values.reshape(-1,1)

array([['ACH-000956'],
       ['ACH-000956'],
       ['ACH-000956'],
       ...,
       ['ACH-000828'],
       ['ACH-000828'],
       ['ACH-000828']], dtype=object)

In [28]:
train_keep["Cell_Line"].values.reshape(-1,1).shape

(7616, 1)

In [29]:
type(train_keep["Cell_Line"].values.reshape(-1,1))

numpy.ndarray

In [30]:
# valid_keep["Cell_Line"].values.reshape(-1,1)

In [31]:
valid_keep["Cell_Line"].values.reshape(-1,1).shape

(952, 1)

In [32]:
valid_keep["AUC"].shape

(952,)

In [33]:
valid_keep["AUC"].values.reshape(-1,1).shape

(952, 1)

In [34]:
type(valid_keep["AUC"].values.reshape(-1,1))

numpy.ndarray

In [35]:
# Mini-batch size; the data generators in this notebook are built with this same value (32).
batch_size = 32

In [36]:
# Okay, now what needs to happen for the bootstrap training? We have some steps both outside the for loop, and inside the for loop. Let's first define the ones outside the for loop

In [37]:
# What happens outside of the for loop?

In [38]:
# Validation generator: built once here and reused, unshuffled, by every bootstrap iteration.
# The cell-line id column is supplied three times (three separate reshaped arrays),
# presumably one per omics input of the model; confirm against DataGenerator.
val_gen_bootstraps = DataGenerator(
    valid_gcn_feats,
    valid_adj_list,
    *(valid_keep["Cell_Line"].values.reshape(-1, 1) for _ in range(3)),
    valid_keep["AUC"].values.reshape(-1, 1),
    batch_size=32,
    shuffle=False,
)

In [39]:
# Make a folder to store all the results - we should carefully consider where these results need to be stored.
# For the IMPROVE-compliant code, this might need to be inside the 'exp_result' folder itself. For now, we will create a separate folder.

In [40]:
# Top-level folder that collects the output of every bootstrap run.
folder_name = 'bootstrap_results_all'

# Create it on first use; otherwise just report that it is already there.
if os.path.exists(folder_name):
    print(f"Folder: '{folder_name} already exists!'")
else:
    os.makedirs(folder_name)
    print(f"Folder: '{folder_name}' created successfully")

Folder: 'bootstrap_results_all already exists!'


In [41]:
# Bootstrap loop: build a model and a resampled training generator per replicate,
# then score the validation set and store predictions + metrics in bootstrap_<i>/.
B = 10  # number of bootstrap replicates

# Settings that are identical for every replicate (hoisted out of the loop).
training = False
dropout1 = 0.10
dropout2 = 0.20
lr = 0.001
folder_path_bootstraps = 'bootstrap_results_all'

# for now, let's load an already existing model, to get the rest of the code going.
# The load does not depend on the replicate, so it is done once instead of B times.
# TODO: drop this once the commented-out `check.fit(...)` below is enabled and
# use the freshly trained `check` model for the predictions instead.
model = tf.keras.models.load_model(os.path.join(data_dir, 'DeepCDR_model', 'DeepCDR_model'))

# range(1, B + 1) -> replicates 1..B inclusive (range(1, B) ran only B - 1 of them).
for i in range(1, B + 1):

    # define the model
    check = deepcdrgcn(dict_features, dict_adj_mat, samp_drug, samp_ach, cancer_dna_methy_model, cancer_gen_expr_model, cancer_gen_mut_model,  training = training,
                       dropout1 = dropout1, dropout2 = dropout2)
    # check.summary()

    # define the data generator (a new one per replicate; presumably this is
    # where the bootstrap resampling happens — confirm in BootstrapGenerator)
    train_gen_bootstrap = BootstrapGenerator(train_gcn_feats, train_adj_list, train_keep["Cell_Line"].values.reshape(-1,1),
                                             train_keep["Cell_Line"].values.reshape(-1,1), train_keep["Cell_Line"].values.reshape(-1,1),
                                             train_keep["AUC"].values.reshape(-1,1), batch_size=batch_size)
    # compile the model
    check.compile(loss = tf.keras.losses.MeanSquaredError(), optimizer = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, amsgrad=False),
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])

    # # fit the model
    # epoch_num = 150
    # patience_val = 10
    # batch_size = 32
    # generator_batch_size = 32
    # check.fit(train_gen_bootstrap, validation_data = val_gen_bootstraps, epochs = epoch_num, batch_size = batch_size,
    #      callbacks = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = patience_val, restore_best_weights=True,
    #                                                  mode = "min") ,validation_batch_size = generator_batch_size)

    # create a per-replicate output folder (bootstrap_<i>) - or ignore if it already exists
    new_folder = 'bootstrap_' + str(i)
    folder_loc = os.path.join(folder_path_bootstraps, new_folder)
    if not os.path.exists(folder_loc):
        os.makedirs(folder_loc)
        print(f"Folder: '{new_folder}' created successfully")
    else:
        print(f"Folder: '{new_folder} already exists!'")
    # # save the model
    # model.save(os.path.join(folder_loc, "DeepCDR_model"))

    # evaluate the model (uses the loaded placeholder `model`, not `check`)
    y_val_preds, y_val_true = batch_predict(model, val_gen_bootstraps)
    # Train-set predictions are computed but not stored (see the note at the end).
    y_train_preds, y_train_true = batch_predict(model, train_gen_bootstrap)

    # save the validation predictions
    frm.store_predictions_df(
        y_true=y_val_true,
        y_pred=y_val_preds,
        stage="val",
        y_col_name="auc",
        output_dir=folder_loc,
        input_dir=data_dir)

    # compute and store the validation metrics
    val_scores = frm.compute_performance_scores(
        y_true=y_val_true,
        y_pred=y_val_preds,
        stage="val",
        metric_type='regression',
        output_dir= folder_loc)

    # The train predictions cannot be saved here because store_predictions_df hits an
    # assertion for them; get the train-set predictions in the train script instead,
    # otherwise it is too much extra work.

Folder: 'bootstrap_1 already exists!'


2025-04-21 12:39:35.713223: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401


Predictions: 952
True: 952
Predictions: 7616
True: 7616

IMPROVE_RESULT val_loss:	0.005555138690445072

Validation scores:
	{'mse': 0.005555138690445072, 'rmse': 0.07453280278135978, 'pcc': 0.8811951373775181, 'scc': 0.7612795838721879, 'r2': 0.7670492353399421}
Folder: 'bootstrap_2 already exists!'
Predictions: 952
True: 952
Predictions: 7616
True: 7616

IMPROVE_RESULT val_loss:	0.005555138690445072

Validation scores:
	{'mse': 0.005555138690445072, 'rmse': 0.07453280278135978, 'pcc': 0.8811951373775181, 'scc': 0.7612795838721879, 'r2': 0.7670492353399421}
Folder: 'bootstrap_3 already exists!'
Predictions: 952
True: 952
Predictions: 7616
True: 7616

IMPROVE_RESULT val_loss:	0.005555138690445072

Validation scores:
	{'mse': 0.005555138690445072, 'rmse': 0.07453280278135978, 'pcc': 0.8811951373775181, 'scc': 0.7612795838721879, 'r2': 0.7670492353399421}
Folder: 'bootstrap_4 already exists!'
Predictions: 952
True: 952
Predictions: 7616
True: 7616

IMPROVE_RESULT val_loss:	0.0055551386904

Okay, now implement this in a python script with proper training of the model.