In [1]:
# These imports are already present in the train script
import os
import json
import sys
import warnings
from pathlib import Path
from pprint import pformat
from typing import Dict, Union
import tensorflow as tf

import pickle
import pandas as pd
import numpy as np
from tensorflow.keras import backend as K
from New_data_generator_with_tf import DataGenerator, BootstrapGenerator, batch_predict

# [Req] IMPROVE imports
# notice that the improvelibs are in the folder that is a level above, but in the same parent directory
sys.path.append(os.path.abspath(os.path.join('..', 'IMPROVE')))
from improvelib.applications.drug_response_prediction.config import DRPTrainConfig
from improvelib.utils import str2bool
import improvelib.utils as frm
from improvelib.metrics import compute_metrics

# Model-specific imports
from model_params_def import train_params # [Req]

2025-04-21 11:38:59.050925: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-21 11:39:00.173899: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# any new additional packages necessary for the generator goes here
from tensorflow.keras.utils import Sequence
# need numpy here as well, but we already have this

In [3]:
# Notebook-level defaults for the model builder below.
# `training` is forwarded to every Dropout call: False disables dropout
# unconditionally, True keeps it active on every forward pass.
training = False
dropout1 = 0.10
dropout2 = 0.20
## get the model architecture
def deepcdrgcn(dict_features, dict_adj_mat, samp_drug, samp_ach, cancer_dna_methy_model, cancer_gen_expr_model, cancer_gen_mut_model, training = training, dropout1 = dropout1, dropout2 = dropout2):
    """Build the DeepCDR-style drug-response regression model.

    The network has four branches whose outputs are concatenated and passed
    through a small 1-D convolutional head that emits a single linear value:

    * a three-layer graph-convolution branch over the drug graph,
    * a dense branch on the DNA-methylation encoder output,
    * a dense branch on the gene-expression encoder output,
    * a convolutional branch on the gene-mutation encoder output.

    Parameters
    ----------
    dict_features : mapping
        Drug id -> node-feature matrix. Only ``dict_features[samp_drug].shape[0]``
        is read here (number of nodes); the feature width is fixed to 75.
    dict_adj_mat : mapping
        Drug id -> adjacency matrix. Only ``dict_adj_mat[samp_drug].shape[0]``
        is read here; the matrix is taken to be square.
    samp_drug : hashable
        A drug id present in both dictionaries, used only to size the inputs.
    samp_ach : array-like of str
        A sample cell-line id. It is passed eagerly through
        ``cancer_gen_mut_model`` to discover the mutation vector length.
    cancer_dna_methy_model, cancer_gen_expr_model, cancer_gen_mut_model : callable Keras models
        Encoders called on a ``(batch, 1)`` string input. They are frozen
        (``trainable = False``) by this function.
    training : bool
        Value forwarded as ``training=`` to every Dropout layer call.
    dropout1, dropout2 : float
        Dropout rates; ``dropout2`` is used only for the last dropout before
        the output layer.

    Returns
    -------
    tf.keras.Model
        Model with inputs, in order:
        ``[gcn_features, norm_adj_mat, gen_expr_ids, gen_methy_ids, gen_mut_ids]``
        and one linear output unit.
    """

    # ---- Drug branch: three graph-convolution layers -------------------
    input_gcn_features = tf.keras.layers.Input(shape = (dict_features[samp_drug].shape[0], 75))
    input_norm_adj_mat = tf.keras.layers.Input(shape = (dict_adj_mat[samp_drug].shape[0], dict_adj_mat[samp_drug].shape[0]))
    # Dot(1) contracts axis 1 of the adjacency matrix with axis 1 of the features.
    mult_1 = tf.keras.layers.Dot(1)([input_norm_adj_mat, input_gcn_features])
    dense_layer_gcn = tf.keras.layers.Dense(256, activation = "relu")
    dense_out = dense_layer_gcn(mult_1)
    dense_out = tf.keras.layers.BatchNormalization()(dense_out)
    dense_out = tf.keras.layers.Dropout(dropout1)(dense_out, training = training)
    mult_2 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
    dense_layer_gcn = tf.keras.layers.Dense(256, activation = "relu")
    dense_out = dense_layer_gcn(mult_2)
    dense_out = tf.keras.layers.BatchNormalization()(dense_out)
    dense_out = tf.keras.layers.Dropout(dropout1)(dense_out, training = training)

    # NOTE: the Dense layer is created before the Dot layer here; the order is
    # kept as-is so that auto-generated layer names do not change.
    dense_layer_gcn = tf.keras.layers.Dense(100, activation = "relu")
    mult_3 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
    dense_out = dense_layer_gcn(mult_3)
    dense_out = tf.keras.layers.BatchNormalization()(dense_out)
    dense_out = tf.keras.layers.Dropout(dropout1)(dense_out, training = training)

    # Average over the node axis to get one vector per drug.
    dense_out = tf.keras.layers.GlobalAvgPool1D()(dense_out)
    # All above code is for GCN for drugs

    # ---- methylation data ----------------------------------------------
    input_gen_methy1 = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
    # Freeze the encoder itself. Setting `.trainable` on its output tensor
    # (what this code did before) does not freeze any weights.
    cancer_dna_methy_model.trainable = False
    input_gen_methy = cancer_dna_methy_model(input_gen_methy1)
    gen_methy_layer = tf.keras.layers.Dense(256, activation = "tanh")

    gen_methy_emb = gen_methy_layer(input_gen_methy)
    gen_methy_emb = tf.keras.layers.BatchNormalization()(gen_methy_emb)
    gen_methy_emb = tf.keras.layers.Dropout(dropout1)(gen_methy_emb, training = training)
    gen_methy_layer = tf.keras.layers.Dense(100, activation = "relu")
    gen_methy_emb = gen_methy_layer(gen_methy_emb)

    # ---- gene expression data ------------------------------------------
    input_gen_expr1 = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
    cancer_gen_expr_model.trainable = False  # freeze the encoder weights
    input_gen_expr = cancer_gen_expr_model(input_gen_expr1)
    gen_expr_layer = tf.keras.layers.Dense(256, activation = "tanh")

    gen_expr_emb = gen_expr_layer(input_gen_expr)
    gen_expr_emb = tf.keras.layers.BatchNormalization()(gen_expr_emb)
    gen_expr_emb = tf.keras.layers.Dropout(dropout1)(gen_expr_emb, training = training)
    gen_expr_layer = tf.keras.layers.Dense(100, activation = "relu")
    gen_expr_emb = gen_expr_layer(gen_expr_emb)

    # ---- gene mutation data --------------------------------------------
    input_gen_mut1 = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
    cancer_gen_mut_model.trainable = False  # freeze the encoder weights
    input_gen_mut = cancer_gen_mut_model(input_gen_mut1)

    # The mutation vector length is obtained by running the encoder eagerly on
    # one sample id; the vector is then laid out as a 1 x length x 1 "image".
    reshape_gen_mut = tf.keras.layers.Reshape((1, cancer_gen_mut_model(samp_ach).numpy().shape[0], 1))
    reshape_gen_mut = reshape_gen_mut(input_gen_mut)
    gen_mut_layer = tf.keras.layers.Conv2D(50, (1, 700), strides=5, activation = "tanh")
    gen_mut_emb = gen_mut_layer(reshape_gen_mut)
    pool_layer = tf.keras.layers.MaxPooling2D((1,5))
    pool_out = pool_layer(gen_mut_emb)
    gen_mut_layer = tf.keras.layers.Conv2D(30, (1, 5), strides=2, activation = "relu")
    gen_mut_emb = gen_mut_layer(pool_out)
    pool_layer = tf.keras.layers.MaxPooling2D((1,10))
    pool_out = pool_layer(gen_mut_emb)
    flatten_layer = tf.keras.layers.Flatten()
    flatten_out = flatten_layer(pool_out)
    x_mut = tf.keras.layers.Dense(100,activation = 'relu')(flatten_out)
    # Pass `training` like every other dropout in this model; previously this
    # one layer ignored the flag and followed the default Keras phase instead.
    x_mut = tf.keras.layers.Dropout(dropout1)(x_mut, training = training)

    # ---- Fusion head ---------------------------------------------------
    all_omics = tf.keras.layers.Concatenate()([dense_out, gen_methy_emb, gen_expr_emb, x_mut])
    x = tf.keras.layers.Dense(300,activation = 'tanh')(all_omics)
    x = tf.keras.layers.Dropout(dropout1)(x, training = training)
    # (batch, 300) -> (batch, 300, 1) -> (batch, 1, 300, 1) so Conv2D can slide along the feature axis.
    x = tf.keras.layers.Lambda(lambda x: K.expand_dims(x,axis=-1))(x)
    x = tf.keras.layers.Lambda(lambda x: K.expand_dims(x,axis=1))(x)
    x = tf.keras.layers.Conv2D(filters=30, kernel_size=(1,150),strides=(1, 1), activation = 'relu',padding='valid')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(1,2))(x)
    x = tf.keras.layers.Conv2D(filters=10, kernel_size=(1,5),strides=(1, 1), activation = 'relu',padding='valid')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(1,3))(x)
    x = tf.keras.layers.Conv2D(filters=5, kernel_size=(1,5),strides=(1, 1), activation = 'relu',padding='valid')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(1,3))(x)
    x = tf.keras.layers.Dropout(dropout1)(x, training = training)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dropout(dropout2)(x, training = training)
    final_out_layer = tf.keras.layers.Dense(1, activation = "linear")
    final_out = final_out_layer(x)
    # Input order matters to callers: expression ids come before methylation ids.
    simplecdr = tf.keras.models.Model([input_gcn_features, input_norm_adj_mat, input_gen_expr1,
                                   input_gen_methy1, input_gen_mut1], final_out)

    return simplecdr

In [4]:
# Model path built by improvelib from the given name, format and directory.
# NOTE(review): the empty model_file_format presumably means "no file extension" — confirm against improvelib.
modelpath = frm.build_model_path(model_file_name="DeepCDR_model", model_file_format="", model_dir="exp_result")

In [5]:
modelpath # notice that we are not using this anywhere in the code

PosixPath('exp_result/DeepCDR_model')

In [6]:
# File names of the train / validation ML data in CSV format, as named by improvelib.
train_data_fname = frm.build_ml_data_file_name(data_format=".csv", stage="train")  # [Req]
val_data_fname = frm.build_ml_data_file_name(data_format=".csv", stage="val")  # [Req]

In [7]:
# Show both generated file names, one per line.
print(train_data_fname, val_data_fname, sep="\n")

train_data.csv
val_data.csv


In [8]:
# Directory holding the preprocessed data; every file and saved model loaded
# in the following cells is read from here.
data_dir = 'exp_result'

In [9]:
%%time
# Load the three saved Keras models (gene expression, gene mutation, DNA methylation)
# from data_dir. Inside deepcdrgcn each of them is called on a string input.
cancer_gen_expr_model = tf.keras.models.load_model(os.path.join(data_dir,"cancer_gen_expr_model"))
cancer_gen_mut_model = tf.keras.models.load_model(os.path.join(data_dir, "cancer_gen_mut_model"))
cancer_dna_methy_model = tf.keras.models.load_model(os.path.join(data_dir, "cancer_dna_methy_model"))

2025-04-21 11:39:16.941794: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-21 11:39:18.979182: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30960 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:06:00.0, compute capability: 7.0














CPU times: user 1.17 s, sys: 840 ms, total: 2.01 s
Wall time: 4.19 s


In [10]:
# Freeze all three loaded models so their weights are excluded from training.
for _omics_model in (cancer_gen_expr_model, cancer_gen_mut_model, cancer_dna_methy_model):
    _omics_model.trainable = False

In [11]:
# Drug id -> node-feature matrix, written by the preprocessing step.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
drug_features_path = os.path.join(data_dir, "drug_features.pickle")
with open(drug_features_path, "rb") as f:
    dict_features = pickle.load(f)

In [12]:
# Drug id -> adjacency matrix, written by the preprocessing step.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
norm_adj_mat_path = os.path.join(data_dir, "norm_adj_mat.pickle")
with open(norm_adj_mat_path, "rb") as f:
    dict_adj_mat = pickle.load(f)

In [13]:
# Response tables for the train and validation splits: one row per
# (cell line, drug) pair with its AUC value, as shown by the .head() cells below.
train_keep = pd.read_csv(os.path.join(data_dir, "train_y_data.csv"))
valid_keep = pd.read_csv(os.path.join(data_dir, "val_y_data.csv"))

In [14]:
train_keep.head()

Unnamed: 0,improve_sample_id,improve_chem_id,auc
0,ACH-000956,Drug_749,0.7153
1,ACH-000956,Drug_1326,0.9579
2,ACH-000956,Drug_490,0.413
3,ACH-000956,Drug_558,0.8004
4,ACH-000956,Drug_195,0.5743


In [15]:
valid_keep.head()

Unnamed: 0,improve_sample_id,improve_chem_id,auc
0,ACH-000956,Drug_895,0.5071
1,ACH-000956,Drug_614,0.6525
2,ACH-000956,Drug_1040,0.8944
3,ACH-000323,Drug_1279,0.8691
4,ACH-000323,Drug_685,0.8527


In [16]:
train_keep.shape, valid_keep.shape

((7616, 3), (952, 3))

In [17]:
# Swap the IMPROVE column names for the names used in the rest of this notebook.
# The assignment is positional, so it relies on the column order: sample id, drug id, AUC.
for _response_frame in (train_keep, valid_keep):
    _response_frame.columns = ["Cell_Line", "Drug_ID", "AUC"]

In [18]:
train_keep.head()

Unnamed: 0,Cell_Line,Drug_ID,AUC
0,ACH-000956,Drug_749,0.7153
1,ACH-000956,Drug_1326,0.9579
2,ACH-000956,Drug_490,0.413
3,ACH-000956,Drug_558,0.8004
4,ACH-000956,Drug_195,0.5743


In [19]:
valid_keep.head()

Unnamed: 0,Cell_Line,Drug_ID,AUC
0,ACH-000956,Drug_895,0.5071
1,ACH-000956,Drug_614,0.6525
2,ACH-000956,Drug_1040,0.8944
3,ACH-000323,Drug_1279,0.8691
4,ACH-000323,Drug_685,0.8527


In [20]:
# Pick one drug id and one cell-line id (the last unique value of each column in
# the validation table). deepcdrgcn reads them only to size its inputs.
samp_drug = valid_keep["Drug_ID"].unique()[-1]
# Wrapped in np.array so it can be passed directly to the mutation model inside deepcdrgcn.
samp_ach = np.array(valid_keep["Cell_Line"].unique()[-1])

In [21]:
# Show the sampled drug id and cell-line id, one per line.
print(samp_drug, samp_ach, sep="\n")

Drug_1326
ACH-000828


In [22]:
# Look up the feature and adjacency matrix of each training row's drug, in row order.
train_drug_ids = train_keep["Drug_ID"].values
train_gcn_feats = [dict_features[drug_id] for drug_id in train_drug_ids]
train_adj_list = [dict_adj_mat[drug_id] for drug_id in train_drug_ids]

In [23]:
len(train_gcn_feats), len(train_adj_list)

(7616, 7616)

In [24]:
# Look up the feature and adjacency matrix of each validation row's drug, in row order.
valid_drug_ids = valid_keep["Drug_ID"].values
valid_gcn_feats = [dict_features[drug_id] for drug_id in valid_drug_ids]
valid_adj_list = [dict_adj_mat[drug_id] for drug_id in valid_drug_ids]

In [25]:
len(valid_gcn_feats), len(valid_adj_list)

(952, 952)

In [26]:
len(valid_gcn_feats) + len(train_gcn_feats)

8568

In [27]:
%%time
# Stack the per-sample matrices into dense float32 arrays.
# np.asarray(..., dtype=...) builds the float32 array directly, instead of first
# materialising a full-size array in the source dtype and then copying it with
# .astype(); that matters for the (n_samples, n_nodes, n_nodes) adjacency stacks.
train_gcn_feats = np.asarray(train_gcn_feats, dtype="float32")
valid_gcn_feats = np.asarray(valid_gcn_feats, dtype="float32")

train_adj_list = np.asarray(train_adj_list, dtype="float32")
valid_adj_list = np.asarray(valid_adj_list, dtype="float32")

CPU times: user 556 ms, sys: 792 ms, total: 1.35 s
Wall time: 1.36 s


In [28]:
train_gcn_feats.shape

(7616, 223, 75)

In [29]:
type(train_gcn_feats)

numpy.ndarray

In [30]:
train_adj_list.shape

(7616, 223, 223)

In [31]:
type(train_adj_list)

numpy.ndarray

In [32]:
train_keep["Cell_Line"].values.reshape(-1,1)

array([['ACH-000956'],
       ['ACH-000956'],
       ['ACH-000956'],
       ...,
       ['ACH-000828'],
       ['ACH-000828'],
       ['ACH-000828']], dtype=object)

In [33]:
train_keep["Cell_Line"].values.reshape(-1,1).shape

(7616, 1)

In [34]:
type(train_keep["Cell_Line"].values.reshape(-1,1))

numpy.ndarray

In [35]:
# valid_keep["Cell_Line"].values.reshape(-1,1)

In [36]:
valid_keep["Cell_Line"].values.reshape(-1,1).shape

(952, 1)

In [37]:
valid_keep["AUC"].shape

(952,)

In [38]:
valid_keep["AUC"].values.reshape(-1,1).shape

(952, 1)

In [39]:
type(valid_keep["AUC"].values.reshape(-1,1))

numpy.ndarray

In [40]:
# Number of samples per generator batch.
batch_size = 32

In [41]:
# Define the two regular data generators

In [42]:
# Regular (non-bootstrap) training generator.
# The cell-line id column is passed three times — presumably once per omics
# input of the model (TODO confirm against DataGenerator's signature).
train_gen_alt = DataGenerator(
    train_gcn_feats,
    train_adj_list,
    train_keep["Cell_Line"].values.reshape(-1, 1),
    train_keep["Cell_Line"].values.reshape(-1, 1),
    train_keep["Cell_Line"].values.reshape(-1, 1),
    train_keep["AUC"].values.reshape(-1, 1),
    batch_size=batch_size,  # use the notebook-level setting instead of a literal 32
)

In [43]:
# Validation generator.
# The cell-line id column is passed three times — presumably once per omics
# input of the model (TODO confirm against DataGenerator's signature).
val_gen_alt = DataGenerator(
    valid_gcn_feats,
    valid_adj_list,
    valid_keep["Cell_Line"].values.reshape(-1, 1),
    valid_keep["Cell_Line"].values.reshape(-1, 1),
    valid_keep["Cell_Line"].values.reshape(-1, 1),
    valid_keep["AUC"].values.reshape(-1, 1),
    batch_size=batch_size,  # use the notebook-level setting instead of a literal 32
)

In [44]:
# print the contents

In [45]:
# cat = 0
# for X, y in train_gen_alt:
#     print(X[0].shape, X[1].shape, X[2].shape, X[3].shape, X[4].shape, y.shape)
#     cat = cat + 1
#     print(cat)

In [46]:
# Walk through every batch of the regular training generator and print a quick
# fingerprint (means of the two drug inputs, shapes of the rest) plus a running count.
cat = 0
for cat, (X, y) in enumerate(train_gen_alt, start=1):
    print(
        np.mean(X[0]), np.mean(X[1]),
        X[2].shape, X[3].shape, X[4].shape,
        y.shape,
    )
    print(cat)

0.012001121 0.0044764034 (32, 1) (32, 1) (32, 1) (32, 1)
1
0.011930119 0.004476615 (32, 1) (32, 1) (32, 1) (32, 1)
2
0.012090807 0.004476463 (32, 1) (32, 1) (32, 1) (32, 1)
3
0.011289238 0.0044771484 (32, 1) (32, 1) (32, 1) (32, 1)
4
0.011479821 0.004476366 (32, 1) (32, 1) (32, 1) (32, 1)
5
0.012019806 0.0044755433 (32, 1) (32, 1) (32, 1) (32, 1)
6
0.0117077725 0.0044768755 (32, 1) (32, 1) (32, 1) (32, 1)
7
0.0107754115 0.0044774287 (32, 1) (32, 1) (32, 1) (32, 1)
8
0.011618087 0.004476782 (32, 1) (32, 1) (32, 1) (32, 1)
9
0.011334081 0.0044768294 (32, 1) (32, 1) (32, 1) (32, 1)
10
0.011804933 0.0044759726 (32, 1) (32, 1) (32, 1) (32, 1)
11
0.011726458 0.0044758995 (32, 1) (32, 1) (32, 1) (32, 1)
12
0.011061286 0.004476847 (32, 1) (32, 1) (32, 1) (32, 1)
13
0.012552317 0.0044754725 (32, 1) (32, 1) (32, 1) (32, 1)
14
0.011905829 0.0044758655 (32, 1) (32, 1) (32, 1) (32, 1)
15
0.011761958 0.0044763773 (32, 1) (32, 1) (32, 1) (32, 1)
16
0.012415919 0.004475461 (32, 1) (32, 1) (32, 1) (32,

In [47]:
238*32

7616

In [48]:
# # Similarly study the validation data
# cat = 0
# for X, y in val_gen_alt:
#     print(np.mean(X[0]), np.mean(X[1]), X[2].shape, X[3].shape, X[4].shape, y.shape)
#     cat = cat + 1
#     print(cat)

In [49]:
29*32+24

952

In [50]:
# Okay, let's define the bootstrap generator here for the train data. Notice that, the validation generator will remain the same for bootstrap training as well.

In [51]:
# Bootstrap training generator, built from the same arrays as the regular one.
# The cell-line id column is passed three times — presumably once per omics
# input of the model (TODO confirm against BootstrapGenerator's signature).
train_gen_bootstrap = BootstrapGenerator(
    train_gcn_feats,
    train_adj_list,
    train_keep["Cell_Line"].values.reshape(-1, 1),
    train_keep["Cell_Line"].values.reshape(-1, 1),
    train_keep["Cell_Line"].values.reshape(-1, 1),
    train_keep["AUC"].values.reshape(-1, 1),
    batch_size=batch_size,  # use the notebook-level setting instead of a literal 32
)

In [52]:
# Iterate over the bootstrap generator and print the same per-batch fingerprint as
# for the regular generator; the printed means should differ between the two.
cat = 0
for cat, (X, y) in enumerate(train_gen_bootstrap, start=1):
    print(
        np.mean(X[0]), np.mean(X[1]),
        X[2].shape, X[3].shape, X[4].shape,
        y.shape,
    )
    print(cat)

0.011459268 0.004476404 (32, 1) (32, 1) (32, 1) (32, 1)
1
0.011283632 0.0044769906 (32, 1) (32, 1) (32, 1) (32, 1)
2
0.011956278 0.0044762567 (32, 1) (32, 1) (32, 1) (32, 1)
3
0.011410688 0.0044767237 (32, 1) (32, 1) (32, 1) (32, 1)
4
0.011797459 0.004476289 (32, 1) (32, 1) (32, 1) (32, 1)
5
0.011784379 0.004475936 (32, 1) (32, 1) (32, 1) (32, 1)
6
0.011535875 0.0044764257 (32, 1) (32, 1) (32, 1) (32, 1)
7
0.011866592 0.0044768755 (32, 1) (32, 1) (32, 1) (32, 1)
8
0.011505979 0.0044764993 (32, 1) (32, 1) (32, 1) (32, 1)
9
0.011491031 0.004476044 (32, 1) (32, 1) (32, 1) (32, 1)
10
0.011442452 0.004476693 (32, 1) (32, 1) (32, 1) (32, 1)
11
0.010928625 0.004477154 (32, 1) (32, 1) (32, 1) (32, 1)
12
0.012169283 0.0044759833 (32, 1) (32, 1) (32, 1) (32, 1)
13
0.011513453 0.004476108 (32, 1) (32, 1) (32, 1) (32, 1)
14
0.011961884 0.004475937 (32, 1) (32, 1) (32, 1) (32, 1)
15
0.011950673 0.004476861 (32, 1) (32, 1) (32, 1) (32, 1)
16
0.0114144245 0.0044765053 (32, 1) (32, 1) (32, 1) (32, 1)


In [53]:
238*32

7616

In [54]:
# Seems like it is working and we are ready for the implementation. Also note that we ran the cell that examines the bootstrap generator a few times and got different results across those runs. We can add a random seed if it is needed to reproduce the results, but I do not think it is necessary at the moment.

In [55]:
# I think we can work out here the gist of what needs to happen in order to train the model multiple times, generate the results, and store the trained models, before we move this into the train script. Once we have the train script, we might also need to make it IMPROVE-compliant in order to make sure we use their developed code.

In [56]:
# Okay, before moving there, should we add the bootstrap generator class also to the same python script that has the original data generator class and the batch prediction functions? I think that will be better.

In [57]:
# Continue this work in a separate notebook