In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import keras.backend as K
import scipy.sparse as sp
import random
from sklearn.preprocessing import StandardScaler

2024-07-12 09:12:16.517687: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-12 09:12:16.789280: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 09:12:16.789337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 09:12:16.844685: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-12 09:12:16.929244: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load the previously trained Keras model from its saved-model directory.
# Later cells read intermediate layer outputs from this object, so it must be
# loaded before any of the get_layer() calls run.
model = tf.keras.models.load_model("..//models//deepcdr_trained_on_domain")

2024-07-12 09:12:31.127314: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31141 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0


In [3]:
# Print the layer table (name, output shape, parameter count, connectivity) so
# layer names can be looked up for the get_layer() calls in the following cells.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 100, 100)]           0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 100, 75)]            0         []                            
                                                                                                  
 dot (Dot)                   (None, 100, 75)              0         ['input_2[0][0]',             
                                                                     'input_1[0][0]']             
                                                                                                  
 dense (Dense)               (None, 100, 256)             19456     ['dot[0][0]']             

In [4]:
# tf.keras.utils.plot_model(model, show_shapes = True)

In [5]:
# Symbolic output tensor of the layer named "global_average_pooling1d", kept as
# the drug embedding head.
# NOTE(review): presumably the end of the drug GCN branch; confirm against model.summary().
drug_output = model.get_layer("global_average_pooling1d").output

In [6]:
# Static shape of the drug embedding tensor (batch axis is None).
drug_output.shape

TensorShape([None, 100])

In [7]:
# Symbolic output tensor of layer "dense_4", treated as the gene-expression embedding.
# NOTE(review): the layer-to-branch mapping is taken from the variable name; confirm against model.summary().
gene_expr_output = model.get_layer("dense_4").output
gene_expr_output.shape

TensorShape([None, 100])

In [8]:
# Symbolic output tensor of layer "dense_7", treated as the methylation embedding.
# NOTE(review): the layer-to-branch mapping is taken from the variable name; confirm against model.summary().
methyl_out = model.get_layer("dense_7").output
methyl_out.shape

TensorShape([None, 300])

In [9]:
# Symbolic output tensor of layer "flatten", treated as the mutation embedding.
# NOTE(review): the layer-to-branch mapping is taken from the variable name; confirm against model.summary().
mutation_out = model.get_layer("flatten").output
mutation_out.shape

TensorShape([None, 2010])

In [10]:
# Multi-output feature extractor: it takes the same inputs as the trained model
# but returns the four intermediate embeddings rather than the model's own output.
# Output order (relied on when the predictions are unpacked later):
#   gene expression, methylation, mutation, drug.
extract_model = tf.keras.models.Model(
    inputs=model.input,
    outputs=[gene_expr_output, methyl_out, mutation_out, drug_output],
)

In [11]:
# Import and prep the data

In [12]:
# import the response data (one row per drug / cell line pair with its IC50)
drugs_cell_lines_ic50_df = pd.read_csv("..//data/drugs_cell_lines_ic50.csv")
print(drugs_cell_lines_ic50_df.shape)
# import the drug smiles data
pubchem_drugs_smiles_df = pd.read_csv('..//data/drugs_smile_strings.csv')
print(pubchem_drugs_smiles_df.shape)

# Inner join on drug_id: response rows whose drug has no SMILES entry are dropped.
# The whole derivation is one chained expression, so the drug_id cast happens on
# a fresh frame instead of assigning into a column-subset of the merge result
# (which is what triggers pandas' SettingWithCopyWarning).
drugs_smiles_cell_lines_ic50_df = (
    pd.merge(drugs_cell_lines_ic50_df, pubchem_drugs_smiles_df, on="drug_id")
    [["drug_id", "Cancer_Cell_Line", "Smiles", "IC50"]]
    .assign(drug_id=lambda df: df["drug_id"].astype(object))
)


(208734, 3)
(238, 2)


In [13]:
drugs_smiles_cell_lines_ic50_df.shape

(186789, 4)

In [14]:
drugs_smiles_cell_lines_ic50_df.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,Smiles,IC50
0,1001,ACH-002137,NC(=O)c1ncn(C2OC(COP(=O)(O)O)C(O)C2O)c1N,7.258918
1,1004,ACH-002137,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,-3.802467
2,1005,ACH-002137,N.N.[Cl-].[Cl-].[Pt+2],4.146364
3,1006,ACH-002137,Nc1ccn(C2OC(CO)C(O)C2O)c(=O)n1,3.171367
4,1007,ACH-002137,CC(=O)OC12COC1CC(O)C1(C)C(=O)C(O)C3=C(C)C(OC(=...,-4.959442


In [15]:
import pickle

# Per-drug GCN inputs. Both dictionaries are indexed by drug id further down
# when the per-sample feature and adjacency arrays are assembled.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("..//data/drug_gcn_features.pickle", "rb") as features_file:
    dict_features = pickle.load(features_file)

with open("..//data/drug_gcn_normalized_adj_mats.pickle", "rb") as adj_mats_file:
    dict_normalized_adj_mats = pickle.load(adj_mats_file)

In [16]:
from sklearn.model_selection import train_test_split

# DualGCN train / test tables; their Drug_ID and Cell_Line columns are used
# later to decide which (cell line, drug) pairs belong to which split.
dualgcn_train = pd.read_csv("..//data/DualGCN_Embedding_train.csv")
dualgcn_test = pd.read_csv("..//data/DualGCN_Embedding_test.csv")

# drug_id <-> PubChem id lookup table.
# Entries whose PubCHEM value is not purely numeric are blanked and dropped.
# Built as a single chain so no column is assigned on a derived frame
# (avoids pandas' SettingWithCopyWarning).
pubchem_to_drugs_df = (
    pd.read_csv('..//data/1.Drug_listMon Jun 24 09_00_55 2019.csv')[["drug_id", "PubCHEM"]]
    .assign(PubCHEM=lambda df: [val if str(val).isdigit() else np.nan for val in df["PubCHEM"]])
    .dropna()
    .assign(drug_id=lambda df: df["drug_id"].astype(str))
)

# Random 80/20 split of the merged response table with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    drugs_smiles_cell_lines_ic50_df.drop(["IC50"], axis=1),
    drugs_smiles_cell_lines_ic50_df["IC50"].values,
    test_size=0.20,
    random_state=42,
)

In [17]:
# dualgcn_train

In [18]:
def _stack_drug_graph_inputs(drug_ids):
    """Look up the GCN inputs of every drug id and stack them as float32 arrays.

    Parameters
    ----------
    drug_ids : sequence
        Keys into ``dict_features`` and ``dict_normalized_adj_mats``; one entry
        per sample, repeats allowed.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``(features, adjacency)``, one entry per id in input order, both cast
        to float32.

    Raises
    ------
    KeyError
        If an id is missing from either dictionary.
    """
    features = [dict_features[drug_id] for drug_id in drug_ids]
    adjacency = [dict_normalized_adj_mats[drug_id] for drug_id in drug_ids]
    return np.array(features).astype("float32"), np.array(adjacency).astype("float32")


# Cast the DualGCN drug ids to strings before joining them against PubCHEM.
# NOTE(review): assumes the PubCHEM column holds string values; confirm.
dualgcn_train["Drug_ID"] = dualgcn_train["Drug_ID"].astype(str)
dualgcn_test["Drug_ID"] = dualgcn_test["Drug_ID"].astype(str)

# Translate PubChem ids to drug_id and keep only the (cell line, drug) pairs
# that define each DualGCN split.
dualgcn_train = pubchem_to_drugs_df.merge(dualgcn_train, left_on=["PubCHEM"], right_on=["Drug_ID"])
dualgcn_train = dualgcn_train[['Cell_Line', 'drug_id']]
dualgcn_test = pubchem_to_drugs_df.merge(dualgcn_test, left_on=["PubCHEM"], right_on=["Drug_ID"])
dualgcn_test = dualgcn_test[['Cell_Line', 'drug_id']]

# Pool the random split back together: the split used from here on is decided
# by the DualGCN pair tables, not by train_test_split.
# The drug_id cast is done on the pooled copy so the incoming frames are not
# mutated in place.
x_train_valid_feats = pd.concat([x_train, x_valid], ignore_index=True)
x_train_valid_feats['drug_id'] = x_train_valid_feats['drug_id'].astype(str)
y_train_valid = pd.DataFrame(np.concatenate([y_train, y_valid]).reshape(-1, 1))
combo_train_valid = pd.concat([x_train_valid_feats, y_train_valid], axis=1)
combo_train_valid.columns = ['drug_id', 'Cancer_Cell_Line', 'Smiles', 'IC50']

# Keep only the rows whose (cell line, drug) pair appears in the respective DualGCN table.
x_y_train = combo_train_valid.merge(
    dualgcn_train, left_on=['Cancer_Cell_Line', 'drug_id'], right_on=['Cell_Line', 'drug_id']
)
x_y_test = combo_train_valid.merge(
    dualgcn_test, left_on=['Cancer_Cell_Line', 'drug_id'], right_on=['Cell_Line', 'drug_id']
)

# Note the naming: x_valid / y_valid hold the DualGCN *test* pairs.
x_train = x_y_train.drop(["IC50", 'Cell_Line'], axis=1)
x_valid = x_y_test.drop(["IC50", 'Cell_Line'], axis=1)
y_train = x_y_train["IC50"].values
y_valid = x_y_test["IC50"].values

# Per-sample drug graph inputs, in the same row order as x_train / x_valid.
train_gcn_feats, train_adj_list = _stack_drug_graph_inputs(x_train["drug_id"].values)
valid_gcn_feats, valid_adj_list = _stack_drug_graph_inputs(x_valid["drug_id"].values)


In [19]:
# Saved lookup models that are called with cell-line identifiers further down,
# one per omics type. Loaded in the order: copy number, gene expression,
# methylation, mutation.
(
    cancer_copy_number_model,
    cancer_cell_gen_expr_model,
    cancer_cell_gen_methy_model,
    cancer_cell_gen_mut_model,
) = (
    tf.keras.models.load_model(saved_model_dir)
    for saved_model_dir in (
        "..//models//cancer_copy_number_model_no_norm_common",
        "..//models//cancer_cell_gen_expr_model_no_norm_common",
        "..//models//cancer_cell_gen_methy_model_no_norm",
        "..//models//cancer_cell_gen_mut_model_no_norm",
    )
)


















In [20]:
# Saved model that is called with drug ids further down to obtain per-drug
# descriptor features.
# NOTE(review): presumably RDKit-derived descriptors, going by the name; confirm.
pubchem_drugs_rdkit_model = tf.keras.models.load_model("..//models//pubchem_drugs_rdkit_model_no_norm")





In [21]:
# Scaler used by the next cell to standardise the drug descriptor features.
std = StandardScaler()

# extract drug features: call the descriptor model on each split's drug ids
# and materialise the result as a float32 NumPy array.
drug_features_train, drug_features_valid = (
    pubchem_drugs_rdkit_model(split_frame["drug_id"].values).numpy().astype("float32")
    for split_frame in (x_train, x_valid)
)


In [22]:
# Fit the scaler on the training features only and reuse the fitted statistics
# for the validation features, so no validation information leaks into scaling.
# NOTE(review): in the cells visible here these scaled arrays are not passed to extract_model.
drug_features_train = std.fit_transform(drug_features_train)
drug_features_valid = std.transform(drug_features_valid)

In [23]:
# Cell-line identifiers of each split; every omics lookup model below is
# queried with these same arrays.
_train_cell_lines = x_train["Cancer_Cell_Line"].values
_valid_cell_lines = x_valid["Cancer_Cell_Line"].values

# extract copy number features
omics_copy_number_train = cancer_copy_number_model(_train_cell_lines).numpy().astype("float32")
omics_copy_number_valid = cancer_copy_number_model(_valid_cell_lines).numpy().astype("float32")

# extract gen expr features
omics_gen_expr_train = cancer_cell_gen_expr_model(_train_cell_lines).numpy().astype("float32")
omics_gen_expr_valid = cancer_cell_gen_expr_model(_valid_cell_lines).numpy().astype("float32")

# Pair copy-number and expression values along a new trailing axis of size 2.
omics_gen_copy_number_gen_expr_train = np.stack(
    [omics_copy_number_train, omics_gen_expr_train], axis=-1
)
omics_gen_copy_number_gen_expr_valid = np.stack(
    [omics_copy_number_valid, omics_gen_expr_valid], axis=-1
)

# extract gen methylation features
omics_gen_methyl_train = cancer_cell_gen_methy_model(_train_cell_lines).numpy().astype("float32")
omics_gen_methyl_valid = cancer_cell_gen_methy_model(_valid_cell_lines).numpy().astype("float32")

# extract gen mutation features
# Run batched through predict() on the CPU.
# NOTE(review): presumably to keep the large mutation output off the GPU; confirm.
with tf.device('/cpu:0'):
    omics_gen_mut_train = cancer_cell_gen_mut_model.predict(
        _train_cell_lines, verbose=1, batch_size=256
    ).astype("float32")
    omics_gen_mut_valid = cancer_cell_gen_mut_model.predict(
        _valid_cell_lines, verbose=1, batch_size=256
    ).astype("float32")

# SMILES strings as (n, 1) column arrays.
smile_strings_train = x_train["Smiles"].values.reshape(-1, 1)
smile_strings_valid = x_valid["Smiles"].values.reshape(-1, 1)



In [24]:
# # Extract features
# with tf.device('/cpu:0'):
#     Train_all_features = extract_model.predict([train_gcn_feats, train_adj_list,omics_gen_expr_train, omics_gen_methyl_train, omics_gen_mut_train], batch_size = 32)

In [25]:
# Inputs for the extractor: drug node features, drug adjacency, gene
# expression, methylation, mutation.
# TODO confirm this matches the order of model.input.
_valid_extract_inputs = [
    valid_gcn_feats,
    valid_adj_list,
    omics_gen_expr_valid,
    omics_gen_methyl_valid,
    omics_gen_mut_valid,
]

# Run the extraction on the CPU in batches of 32; the result is a list with
# one array per extract_model output.
with tf.device('/cpu:0'):
    valid_all_featres = extract_model.predict(_valid_extract_inputs, batch_size=32)



In [26]:
# # feature output order
# # gene_expr_output, methyl_out, mutation_out, drug_output

# # Extract the embeddings separately for features
# Train_gene = Train_all_features[0]
# Train_methyl = Train_all_features[1]
# Train_mut = Train_all_features[2]
# Train_drug = Train_all_features[3]

In [27]:
# Split the extractor's output list into its four embeddings. The order follows
# the outputs given when extract_model was built:
# gene expression, methylation, mutation, drug.
Valid_gene, Valid_methyl, Valid_mut, Valid_drug = valid_all_featres

In [28]:
# Sanity check: one row per validation sample, second axis is the drug embedding width.
Valid_drug.shape

(17316, 100)

In [29]:
# # save the extracted embeddings
# np.save('..//saved_output_data/Train_gene_new_split_CDR.npy', Train_gene)
# np.save('..//saved_output_data/Train_methyl_new_split_CDR.npy', Train_methyl)
# np.save('..//saved_output_data/Train_mut_new_split_CDR.npy', Train_mut)
# np.save('..//saved_output_data/Train_drug_new_split_CDR.npy', Train_drug)

# Persist each validation embedding to its own .npy file (destination -> array).
_valid_embedding_files = {
    '..//saved_output_data/Valid_gene_new_split_CDR.npy': Valid_gene,
    '..//saved_output_data/Valid_methyl_new_split_CDR.npy': Valid_methyl,
    '..//saved_output_data/Valid_mut_new_split_CDR.npy': Valid_mut,
    '..//saved_output_data/Valid_drug_new_split_CDR.npy': Valid_drug,
}
for _npy_path, _embedding in _valid_embedding_files.items():
    np.save(_npy_path, _embedding)

In [30]:
# y_train = y_train.reshape(-1,1)

In [31]:
# y_train

In [32]:
# y_train.shape

In [33]:
# Turn the targets into an (n, 1) column vector before they are inspected and saved.
y_valid = y_valid.reshape(-1,1)

In [34]:
# Preview the reshaped IC50 targets (NumPy abbreviates long arrays in the repr).
y_valid

array([[ 3.143044],
       [ 3.213836],
       [ 1.036346],
       ...,
       [ 1.73393 ],
       [-2.714591],
       [ 1.215302]])

In [35]:
# Mean and (population) standard deviation of the validation IC50 targets.
y_valid.mean(), y_valid.std()

(2.063824293081543, 2.8309603843576316)

In [None]:
# (2.063824293081543, 2.8309603843576316)

In [36]:
# Persist the validation targets as an (n, 1) array next to the saved embeddings.
np.save('..//saved_output_data/Valid_y.npy', y_valid)
# np.save('..//saved_output_data/Train_y.npy', y_train)