In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.python.keras import backend as K
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
import scipy.sparse as sp

In [None]:
drugs_cell_lines_ic50_df = pd.read_csv("data//drugs_cell_lines_ic50.csv")

In [None]:
pubchem_drugs_smiles_df = pd.read_csv('data//drugs_smile_strings.csv')

In [None]:
drugs_smiles_cell_lines_ic50_df = pd.merge(drugs_cell_lines_ic50_df, pubchem_drugs_smiles_df, 
                                             on = "drug_id")

In [None]:
drugs_smiles_cell_lines_ic50_df = drugs_smiles_cell_lines_ic50_df[["drug_id", "Cancer_Cell_Line", "Smiles", "IC50"]]

In [None]:
drugs_smiles_cell_lines_ic50_df.dtypes

In [None]:
drugs_smiles_cell_lines_ic50_df["drug_id"] = drugs_smiles_cell_lines_ic50_df["drug_id"].astype(object)

In [None]:
drugs_smiles_cell_lines_ic50_df.shape

In [None]:
with open("data//drug_gcn_features.pickle", "rb") as f:
    dict_features = pickle.load(f)

In [None]:
with open("data//drug_gcn_normalized_adj_mats.pickle", "rb") as f:
    dict_normalized_adj_mats = pickle.load(f)

In [None]:
dualgcn_train = pd.read_csv("data//DualGCN_Embedding_train.csv")

In [None]:
dualgcn_test = pd.read_csv("data//DualGCN_Embedding_test.csv")

In [None]:
pubchem_to_drugs_df = pd.read_csv('data//1.Drug_listMon Jun 24 09_00_55 2019.csv')

In [None]:
pubchem_to_drugs_df = pubchem_to_drugs_df[["drug_id", "PubCHEM"]]

In [None]:
pubchem_to_drugs_df.dtypes

In [None]:
pubchem_to_drugs_df["PubCHEM"] = [val if str(val).isdigit() else np.nan for val in pubchem_to_drugs_df["PubCHEM"] ]

In [None]:
pubchem_to_drugs_df = pubchem_to_drugs_df.dropna()

In [None]:
pubchem_to_drugs_df.dtypes

In [None]:
pubchem_to_drugs_df["drug_id"] = pubchem_to_drugs_df["drug_id"].astype(str)

In [None]:
# Random 80/20 split of the merged table. Later cells pool these two parts
# back together and re-split them by DualGCN membership, so this split only
# fixes the row order of the pooled frame.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 no longer accepts.
x_train, x_valid, y_train, y_valid = train_test_split(
    drugs_smiles_cell_lines_ic50_df.drop(columns=["IC50"]),
    drugs_smiles_cell_lines_ic50_df["IC50"].values,
    test_size = 0.20,
    random_state = 42,
)

In [None]:
dualgcn_train["Drug_ID"] = dualgcn_train["Drug_ID"].astype(str)

In [None]:
dualgcn_test["Drug_ID"] = dualgcn_test["Drug_ID"].astype(str)

In [None]:
pubchem_to_drugs_df.dtypes

In [None]:
dualgcn_train = pubchem_to_drugs_df.merge(dualgcn_train, left_on = ["PubCHEM"], right_on = ["Drug_ID"])

In [None]:
dualgcn_train = dualgcn_train[['Cell_Line', 'drug_id']]

In [None]:
dualgcn_test = pubchem_to_drugs_df.merge(dualgcn_test, left_on = ["PubCHEM"], right_on = ["Drug_ID"])

In [None]:
dualgcn_test = dualgcn_test[['Cell_Line', 'drug_id']]

In [None]:
dualgcn_train.dtypes

In [None]:
x_train.dtypes

In [None]:
# dualgcn_train

In [None]:
# x_train['drug_id'].values[0]

In [None]:
x_train['drug_id'] = x_train['drug_id'].astype(str)

In [None]:
x_valid['drug_id'] = x_valid['drug_id'].astype(str)

In [None]:
x_train_valid_feats = pd.concat([x_train, x_valid], ignore_index = True)

In [None]:
y_train_valid = pd.concat([pd.DataFrame(y_train.reshape(-1,1)), pd.DataFrame(y_valid.reshape(-1,1))], ignore_index = True)

In [None]:
combo_train_valid = pd.concat([x_train_valid_feats, y_train_valid], 1)

In [None]:
combo_train_valid.head()

In [None]:
combo_train_valid.columns = ['drug_id', 'Cancer_Cell_Line', 'Smiles', 'IC50']

In [None]:
# filter x_train x _valid here
x_y_train = combo_train_valid.merge(dualgcn_train, left_on = ['Cancer_Cell_Line','drug_id'], right_on = [ 'Cell_Line','drug_id'])

In [None]:
x_y_test = combo_train_valid.merge(dualgcn_test, left_on = ['Cancer_Cell_Line','drug_id'], right_on = [ 'Cell_Line','drug_id'])

In [None]:
x_train.dtypes

In [None]:
x_train, x_valid, y_train, y_valid = x_y_train.drop(["IC50", 'Cell_Line'],1), x_y_test.drop(["IC50", 'Cell_Line'], 1), x_y_train["IC50"].values, x_y_test["IC50"].values

In [None]:
train_gcn_feats = []
train_adj_list = []
for drug_id in x_train["drug_id"].values:
    train_gcn_feats.append(dict_features[drug_id])
    train_adj_list.append(dict_normalized_adj_mats[drug_id])

In [None]:
valid_gcn_feats = []
valid_adj_list = []
for drug_id in x_valid["drug_id"].values:
    valid_gcn_feats.append(dict_features[drug_id])
    valid_adj_list.append(dict_normalized_adj_mats[drug_id])

In [None]:
import numpy as np

In [None]:
train_gcn_feats = np.array(train_gcn_feats).astype("float16")
valid_gcn_feats = np.array(valid_gcn_feats).astype("float16")

In [None]:
train_adj_list = np.array(train_adj_list).astype("float16")
valid_adj_list = np.array(valid_adj_list).astype("float16")

In [None]:
# load models
# omic models
cancer_copy_number_model = tf.keras.models.load_model("models//cancer_copy_number_model_no_norm_common")
cancer_cell_gen_expr_model = tf.keras.models.load_model("models//cancer_cell_gen_expr_model_no_norm_common")
cancer_cell_gen_methy_model = tf.keras.models.load_model("models//cancer_cell_gen_methy_model_no_norm")
cancer_cell_gen_mut_model = tf.keras.models.load_model("models//cancer_cell_gen_mut_model_no_norm")

In [None]:
# load models
# drug models
pubchem_drugs_rdkit_model = tf.keras.models.load_model("models//pubchem_drugs_rdkit_model_no_norm")

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
std = StandardScaler()

In [None]:
# extract drug features
drug_features_train = pubchem_drugs_rdkit_model(x_train["drug_id"].values).numpy().astype("float32")
drug_features_valid = pubchem_drugs_rdkit_model(x_valid["drug_id"].values).numpy().astype("float32")

# drug_features_train = pubchem_drugs_rdkit_model(x_train["drug_id"].values)
# drug_features_valid = pubchem_drugs_rdkit_model(x_valid["drug_id"].values)

In [None]:
np.isinf(drug_features_train).sum()

In [None]:
drug_features_train = std.fit_transform(drug_features_train)

In [None]:
drug_features_valid = std.transform(drug_features_valid)

In [None]:
# extract copy number features
omics_copy_number_train = cancer_copy_number_model(x_train["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_copy_number_valid = cancer_copy_number_model(x_valid["Cancer_Cell_Line"].values).numpy().astype("float16")

In [None]:
# extract gen expr features
omics_gen_expr_train = cancer_cell_gen_expr_model(x_train["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_gen_expr_valid = cancer_cell_gen_expr_model(x_valid["Cancer_Cell_Line"].values).numpy().astype("float16")

In [None]:
# extract gen methylation features
omics_gen_methyl_train = cancer_cell_gen_methy_model(x_train["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_gen_methyl_valid = cancer_cell_gen_methy_model(x_valid["Cancer_Cell_Line"].values).numpy().astype("float16")

In [None]:
# extract gen mutation features
with tf.device('/cpu:0'):
    omics_gen_mut_train = cancer_cell_gen_mut_model.predict(x_train["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256).astype("float16")
    omics_gen_mut_valid = cancer_cell_gen_mut_model.predict(x_valid["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256).astype("float16")

In [None]:
smile_strings_train = x_train["Smiles"].values.reshape(-1,1)
smile_strings_valid = x_valid["Smiles"].values.reshape(-1,1)

In [None]:
# Drug branch: three graph-convolution-style stages followed by average
# pooling. Each stage multiplies by the adjacency input, applies a Dense
# layer, then BatchNormalization and Dropout(0.2).
# NOTE(review): statement order is kept as-is on purpose -- the warm-start
# cells further down address layers by position in `simplecdr.layers`.
# Presumably up to 100 atoms with 75 features each -- TODO confirm against the
# featurizer that produced the pickles.
input_gcn_features = tf.keras.layers.Input(shape = (100, 75))
input_norm_adj_mat = tf.keras.layers.Input(shape = (100, 100))
# Dot(1) contracts axis 1 of both inputs, i.e. it computes A^T X per sample
# (equal to A X when the adjacency matrix is symmetric -- TODO confirm).
mult_1 = tf.keras.layers.Dot(1)([input_norm_adj_mat, input_gcn_features])
dense_layer_gcn = tf.keras.layers.Dense(256, activation = "relu")
dense_out = dense_layer_gcn(mult_1)
dense_out = tf.keras.layers.BatchNormalization()(dense_out)
dense_out = tf.keras.layers.Dropout(0.2)(dense_out)
# Stage 2: same pattern, 256 units.
mult_2 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
dense_layer_gcn = tf.keras.layers.Dense(256, activation = "relu")
dense_out = dense_layer_gcn(mult_2)
dense_out = tf.keras.layers.BatchNormalization()(dense_out)
dense_out = tf.keras.layers.Dropout(0.2)(dense_out)

# Stage 3: 100 units. Note the Dense layer is created before the Dot layer
# here, unlike the two stages above.
dense_layer_gcn = tf.keras.layers.Dense(100, activation = "relu")
mult_3 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
dense_out = dense_layer_gcn(mult_3)
dense_out = tf.keras.layers.BatchNormalization()(dense_out)
dense_out = tf.keras.layers.Dropout(0.2)(dense_out)

# Average over axis 1 to get one 100-dim vector per drug.
dense_out = tf.keras.layers.GlobalAvgPool1D()(dense_out)

In [None]:
# Methylation branch: the input width is taken from the extracted methylation
# feature matrix.
input_gen_methy = tf.keras.layers.Input(shape = (omics_gen_methyl_train.shape[1],))

In [None]:
# First projection: 256 units, tanh.
gen_methy_layer = tf.keras.layers.Dense(256, activation = "tanh")

In [None]:
gen_methy_emb = gen_methy_layer(input_gen_methy)
gen_methy_emb = tf.keras.layers.BatchNormalization()(gen_methy_emb)
gen_methy_emb = tf.keras.layers.Dropout(0.2)(gen_methy_emb)

In [None]:
# Second projection down to 100 units; `gen_methy_layer` is rebound to the new
# layer, the first Dense layer stays part of the graph.
gen_methy_layer = tf.keras.layers.Dense(100, activation = "relu")
gen_methy_emb = gen_methy_layer(gen_methy_emb)

In [None]:
# Fuse the 100-dim methylation embedding with the 100-dim pooled drug
# embedding (methylation first).
all_feats = tf.keras.layers.Concatenate()([gen_methy_emb, dense_out])

In [None]:
# Prediction head: Dense(300) -> reshape to a 1 x 300 x 1 "image" -> three
# Conv2D / MaxPooling2D stages along the width axis -> Flatten.
# NOTE(review): the number and order of layers here is relied on by the
# positional warm-start indices further down; do not merge the two Lambda
# layers without updating those indices.
x = tf.keras.layers.Dense(300,activation = 'tanh')(all_feats)
x = tf.keras.layers.Dropout(0.2)(x)
# (batch, 300) -> (batch, 300, 1) -> (batch, 1, 300, 1)
x = tf.keras.layers.Lambda(lambda x: K.expand_dims(x,axis=-1))(x)
x = tf.keras.layers.Lambda(lambda x: K.expand_dims(x,axis=1))(x)
x = tf.keras.layers.Conv2D(filters=30, kernel_size=(1,150),strides=(1, 1), activation = 'relu',padding='valid')(x)
x = tf.keras.layers.MaxPooling2D(pool_size=(1,2))(x)
x = tf.keras.layers.Conv2D(filters=10, kernel_size=(1,5),strides=(1, 1), activation = 'relu',padding='valid')(x)
x = tf.keras.layers.MaxPooling2D(pool_size=(1,3))(x)
x = tf.keras.layers.Conv2D(filters=5, kernel_size=(1,5),strides=(1, 1), activation = 'relu',padding='valid')(x)
x = tf.keras.layers.MaxPooling2D(pool_size=(1,3))(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dropout(0.2)(x)

In [None]:
# Single linear output unit (regression target: IC50).
final_out_layer = tf.keras.layers.Dense(1)

In [None]:
final_out = final_out_layer(x)

In [None]:
# Ablation model: inputs are the drug graph (features + adjacency) and the
# methylation features only.
simplecdr = tf.keras.models.Model([input_gcn_features, input_norm_adj_mat,
                                   input_gen_methy], final_out)

In [None]:
# MSE loss, RMSE reported as metric.
# The optimizer uses the current argument name `learning_rate` instead of the
# legacy `lr`. `decay=0.0` and `epsilon=None` are omitted: both only restated
# the defaults (no decay, backend epsilon of 1e-7), and newer Keras optimizers
# reject `decay` and do not fall back to a default when epsilon is None.
simplecdr.compile(
    loss = tf.keras.losses.MeanSquaredError(),
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False),
    metrics = [tf.keras.metrics.RootMeanSquaredError()],
)

In [None]:
# Data for this experiment comes from the DualGCN *test* split (the valid_*
# arrays): its first N_FINETUNE rows are used to fine-tune the model and all
# remaining rows are held out as the final test set.
# NOTE(review): the previous comment here described a 6k / 3k / 1k / ~7k
# partition, but the code slices at 10000 -- confirm which sizes are intended.
# NOTE(review): rows are taken in merge order, not shuffled -- confirm that a
# positional cut is acceptable.
N_FINETUNE = 10000

# drug feats
new_train_gcn_feats = valid_gcn_feats[:N_FINETUNE]
new_test_gcn_feats = valid_gcn_feats[N_FINETUNE:]
print(new_train_gcn_feats.shape, new_test_gcn_feats.shape)

# drug adj info
new_train_adj_list = valid_adj_list[:N_FINETUNE]
new_test_adj_list = valid_adj_list[N_FINETUNE:]
print(new_train_adj_list.shape, new_test_adj_list.shape)

# expression
new_omics_gen_expr_train = omics_gen_expr_valid[:N_FINETUNE]
new_omics_gen_expr_test = omics_gen_expr_valid[N_FINETUNE:]
print(new_omics_gen_expr_train.shape, new_omics_gen_expr_test.shape)

# methylation
new_omics_gen_methyl_train = omics_gen_methyl_valid[:N_FINETUNE]
new_omics_gen_methyl_test = omics_gen_methyl_valid[N_FINETUNE:]
print(new_omics_gen_methyl_train.shape, new_omics_gen_methyl_test.shape)

# mutation
new_omics_gen_mut_train = omics_gen_mut_valid[:N_FINETUNE]
new_omics_gen_mut_test = omics_gen_mut_valid[N_FINETUNE:]
print(new_omics_gen_mut_train.shape, new_omics_gen_mut_test.shape)

# y
new_y_train = y_valid[:N_FINETUNE]
new_y_test = y_valid[N_FINETUNE:]
print(new_y_train.shape, new_y_test.shape)

In [None]:
# Previously trained model whose weights seed the ablation model.
simpleCDR_original_model = tf.keras.models.load_model("models//simple_cdr_more_new_splits")
simpleCDR_original_model.summary()

In [None]:
simplecdr.summary()

In [None]:
# unpack the weights for the warm start
# NOTE(review): layers are addressed by position in `model.layers`; the
# indices below must match the summary() printed above and will silently point
# at different layers if the saved model's architecture changes.

# first drugs: three Dense (kernel, bias) pairs and their BatchNormalization
# weight lists
Dense_1_weights, Dense_1_bias = simpleCDR_original_model.layers[3].get_weights()
bn_1_weights = simpleCDR_original_model.layers[4].get_weights()
Dense_2_weights, Dense_2_bias = simpleCDR_original_model.layers[7].get_weights()
bn_2_weights = simpleCDR_original_model.layers[9].get_weights()
Dense_3_weights, Dense_3_bias = simpleCDR_original_model.layers[19].get_weights()
bn_3_weights = simpleCDR_original_model.layers[23].get_weights()

# methylation: two Dense layers and the BatchNormalization between them
Dense_1_weights_methyl, Dense_1_bias_methyl = simpleCDR_original_model.layers[16].get_weights()
bn_1_weights_methyl = simpleCDR_original_model.layers[20].get_weights()
Dense_2_weights_methyl, Dense_2_bias_methyl = simpleCDR_original_model.layers[28].get_weights()

# once concatenated: the three Conv2D layers and the final Dense output layer
conv2d_1_weights_concat, conv2d_1_bias_concat = simpleCDR_original_model.layers[37].get_weights()
conv2d_2_weights_concat, conv2d_2_bias_concat = simpleCDR_original_model.layers[39].get_weights()
conv2d_3_weights_concat, conv2d_3_bias_concat = simpleCDR_original_model.layers[41].get_weights()
Dense_final_weights_concat, Dense_final_bias_concat = simpleCDR_original_model.layers[46].get_weights()

In [None]:
simpleCDR_original_model.summary()

In [None]:
# Set weights
# Copy the unpacked weights into the matching layers of the ablation model.
# NOTE(review): target indices are positions in `simplecdr.layers`; verify
# them against simplecdr.summary() after any architecture change. set_weights
# raises on a shape mismatch, but not when two layers share the same shapes.

# drugs
simplecdr.layers[3].set_weights((Dense_1_weights, Dense_1_bias))
simplecdr.layers[4].set_weights(bn_1_weights)
simplecdr.layers[7].set_weights((Dense_2_weights, Dense_2_bias))
simplecdr.layers[8].set_weights(bn_2_weights)
simplecdr.layers[13].set_weights((Dense_3_weights, Dense_3_bias))
simplecdr.layers[15].set_weights(bn_3_weights)

# methylation
simplecdr.layers[12].set_weights((Dense_1_weights_methyl, Dense_1_bias_methyl))
simplecdr.layers[14].set_weights(bn_1_weights_methyl)
simplecdr.layers[18].set_weights((Dense_2_weights_methyl, Dense_2_bias_methyl))

# concat: conv stack and output layer. No weights are copied into the
# Dense(300) layer that follows the concatenation, so it keeps its fresh
# initialisation.
simplecdr.layers[25].set_weights((conv2d_1_weights_concat, conv2d_1_bias_concat))
simplecdr.layers[27].set_weights((conv2d_2_weights_concat, conv2d_2_bias_concat))
simplecdr.layers[29].set_weights((conv2d_3_weights_concat, conv2d_3_bias_concat))
simplecdr.layers[34].set_weights((Dense_final_weights_concat, Dense_final_bias_concat))

In [None]:
%%time
history = simplecdr.fit([new_train_gcn_feats, new_train_adj_list,
                         new_omics_gen_methyl_train], new_y_train.reshape(-1,1), 
                         
          batch_size = 64, epochs = 1000, verbose = 1,
                         
          validation_split=0.2,
        callbacks = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = 20, restore_best_weights=True,
                                                       mode = "min"), 
         validation_batch_size = 512, shuffle = True)

In [None]:
simplecdr.save("models//cdr_for_stacker_new_splits_ablation")

In [None]:
val_preds = simplecdr.predict([ new_test_gcn_feats, new_test_adj_list,
                           new_omics_gen_methyl_test])

In [None]:
preds_data = pd.DataFrame(np.hstack((new_y_test.reshape(-1,1), val_preds)), columns = ['True_y', 'Predicted_y'])

In [None]:
preds_data.to_csv("data//cdr_data_new_splits_ablation.csv", index = False)