In [None]:
import os
import pickle
import random

import tensorflow as tf
import pandas as pd
import numpy as np
import keras.backend as K
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the trained model from disk. Its intermediate layer outputs are
# tapped in the following cells to build an embedding extractor.
model = tf.keras.models.load_model("models//simple_cdr_more_new_splits")

In [None]:
# Print the layer summary of the loaded model.
# NOTE(review): presumably the layer names looked up with get_layer() in
# later cells are read off this summary - confirm against the saved model.
model.summary()

In [None]:
# Render the model graph, annotating every layer with its tensor shapes.
tf.keras.utils.plot_model(model, show_shapes = True)

In [None]:
# Symbolic output of the layer named "global_average_pooling1d"; it is used
# as the drug branch output of extract_model.
# NOTE(review): the layer name is hard-coded - confirm it still matches if
# the model is rebuilt or retrained.
drug_output = model.get_layer("global_average_pooling1d").output

In [None]:
# Display the shape of the tapped drug output tensor.
drug_output.shape

In [None]:
# Symbolic output of the layer named "dense_4"; used as the gene-expression
# output of extract_model.
# NOTE(review): hard-coded layer name - confirm it is the intended layer.
gene_expr_output = model.get_layer("dense_4").output
gene_expr_output.shape

In [None]:
# Symbolic output of the layer named "dense_7"; used as the methylation
# output of extract_model.
# NOTE(review): hard-coded layer name - confirm it is the intended layer.
methyl_out = model.get_layer("dense_7").output
methyl_out.shape

In [None]:
# Symbolic output of the layer named "flatten"; used as the mutation output
# of extract_model.
# NOTE(review): hard-coded layer name - confirm it is the intended layer.
mutation_out = model.get_layer("flatten").output
mutation_out.shape

In [None]:
# Embedding extractor: shares the inputs of the trained model and returns
# the four tapped tensors, in this order:
#   [gene_expr_output, methyl_out, mutation_out, drug_output]
# Later cells index the prediction list by this order.
extract_model = tf.keras.models.Model(model.input, [gene_expr_output, methyl_out, mutation_out, drug_output])

In [None]:
# Import and prep the data

In [None]:
# Load the drug response data.
drugs_cell_lines_ic50_df = pd.read_csv("data/drugs_cell_lines_ic50.csv")
print(drugs_cell_lines_ic50_df.shape)
# Load the drug SMILES data.
pubchem_drugs_smiles_df = pd.read_csv('data/drugs_smile_strings.csv')
print(pubchem_drugs_smiles_df.shape)

# Attach the SMILES information to every response row (default inner join
# on drug_id), then keep only the four columns used by the rest of the
# notebook. The .copy() makes the selection an independent frame so the
# column assignment below does not raise SettingWithCopyWarning.
drugs_smiles_cell_lines_ic50_df = pd.merge(drugs_cell_lines_ic50_df, pubchem_drugs_smiles_df,
                                           on = "drug_id")
drugs_smiles_cell_lines_ic50_df = drugs_smiles_cell_lines_ic50_df[
    ["drug_id", "Cancer_Cell_Line", "Smiles", "IC50"]
].copy()
# Store drug_id with object dtype; it is converted to str before being used
# as a merge / dictionary key later on.
drugs_smiles_cell_lines_ic50_df["drug_id"] = drugs_smiles_cell_lines_ic50_df["drug_id"].astype(object)


In [None]:
# Display (rows, columns) of the merged response / SMILES table.
drugs_smiles_cell_lines_ic50_df.shape

In [None]:
# Preview the first rows of the merged response / SMILES table.
drugs_smiles_cell_lines_ic50_df.head()

In [None]:
# Load the drug features and normalized adjacency information for the GCN
# branch. Both objects are indexed by drug id when the train / validation
# arrays are assembled in a later cell.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file - only load these pickles when they come from a trusted source.
# (`pickle` is imported in the notebook's top import cell.)
with open("data/drug_gcn_features.pickle", "rb") as f:
    dict_features = pickle.load(f)

with open("data/drug_gcn_normalized_adj_mats.pickle", "rb") as f:
    dict_normalized_adj_mats = pickle.load(f)

In [None]:
# (Cell line, drug) pairs of the DualGCN train / test split files; they are
# used in the next cell to re-split the response table.
dualgcn_train = pd.read_csv("data/DualGCN_Embedding_train.csv")
dualgcn_test = pd.read_csv("data/DualGCN_Embedding_test.csv")

# drug_id <-> PubCHEM mapping. Rows whose PubCHEM entry is not purely
# digits are blanked to NaN and dropped; drug_id is stored as str so it can
# be used as a merge key. Built as one chain so no column is assigned on a
# sliced frame (avoids SettingWithCopyWarning).
pubchem_to_drugs_df = (
    pd.read_csv('data/1.Drug_listMon Jun 24 09_00_55 2019.csv')[["drug_id", "PubCHEM"]]
    .assign(PubCHEM = lambda df: [val if str(val).isdigit() else np.nan for val in df["PubCHEM"]])
    .dropna()
    .assign(drug_id = lambda df: df["drug_id"].astype(str))
)

# Preliminary random 80/20 split with a fixed seed.
# `columns=` replaces the positional axis argument, which pandas 2.0 no
# longer accepts. (`train_test_split` is imported in the top import cell.)
x_train, x_valid, y_train, y_valid = train_test_split(
    drugs_smiles_cell_lines_ic50_df.drop(columns = ["IC50"]),
    drugs_smiles_cell_lines_ic50_df["IC50"].values,
    test_size = 0.20, random_state = 42)

In [None]:
def _stack_drug_graphs(drug_ids):
    """Look up the GCN inputs of every drug id, preserving order.

    Returns a ``(features, adjacency)`` pair of float32 arrays built from
    ``dict_features`` and ``dict_normalized_adj_mats``, with one leading
    entry per element of ``drug_ids``. Raises KeyError for an unknown id.
    """
    features = np.array([dict_features[drug_id] for drug_id in drug_ids]).astype("float32")
    adjacency = np.array([dict_normalized_adj_mats[drug_id] for drug_id in drug_ids]).astype("float32")
    return features, adjacency


# NOTE: this cell overwrites dualgcn_train / dualgcn_test and x_train /
# x_valid / y_train / y_valid, so it must be run exactly once after the
# previous cell (re-run the previous cell first to repeat it).

# Merge keys are compared as strings on both sides.
dualgcn_train["Drug_ID"] = dualgcn_train["Drug_ID"].astype(str)
dualgcn_test["Drug_ID"] = dualgcn_test["Drug_ID"].astype(str)

# Translate the Drug_ID of the split files (matched against PubCHEM) into
# this notebook's drug_id, keeping only the (Cell_Line, drug_id) pairs.
dualgcn_train = pubchem_to_drugs_df.merge(dualgcn_train, left_on = ["PubCHEM"], right_on = ["Drug_ID"])
# Sanity check of the join (previously computed and silently discarded).
assert (dualgcn_train['PubCHEM'] == dualgcn_train['Drug_ID']).all(), \
    "PubCHEM / Drug_ID mismatch after merge"
dualgcn_train = dualgcn_train[['Cell_Line', 'drug_id']]
dualgcn_test = pubchem_to_drugs_df.merge(dualgcn_test, left_on = ["PubCHEM"], right_on = ["Drug_ID"])
dualgcn_test = dualgcn_test[['Cell_Line', 'drug_id']]

# Recombine the preliminary split into one table with string drug ids.
# .assign() builds new frames instead of writing into the slices returned
# by train_test_split (avoids SettingWithCopyWarning).
x_train = x_train.assign(drug_id = x_train['drug_id'].astype(str))
x_valid = x_valid.assign(drug_id = x_valid['drug_id'].astype(str))
x_train_valid_feats = pd.concat([x_train, x_valid], ignore_index = True)
y_train_valid = pd.concat([pd.DataFrame(y_train.reshape(-1,1)), pd.DataFrame(y_valid.reshape(-1,1))], ignore_index = True)
# `axis` must be passed by keyword: pandas 2.0 rejects it positionally.
combo_train_valid = pd.concat([x_train_valid_feats, y_train_valid], axis = 1)
combo_train_valid.columns = ['drug_id', 'Cancer_Cell_Line', 'Smiles', 'IC50']

# Re-split: keep the rows whose (cell line, drug) pair appears in the
# DualGCN train file as training data and those in the test file as
# validation data.
x_y_train = combo_train_valid.merge(dualgcn_train, left_on = ['Cancer_Cell_Line','drug_id'], right_on = [ 'Cell_Line','drug_id'])
x_y_test = combo_train_valid.merge(dualgcn_test, left_on = ['Cancer_Cell_Line','drug_id'], right_on = [ 'Cell_Line','drug_id'])

x_train = x_y_train.drop(columns = ["IC50", 'Cell_Line'])
x_valid = x_y_test.drop(columns = ["IC50", 'Cell_Line'])
y_train = x_y_train["IC50"].values
y_valid = x_y_test["IC50"].values

# Per-row GCN inputs (drug features and normalized adjacency matrices).
train_gcn_feats, train_adj_list = _stack_drug_graphs(x_train["drug_id"].values)
valid_gcn_feats, valid_adj_list = _stack_drug_graphs(x_valid["drug_id"].values)


In [None]:
# Load the saved per-omics models. Later cells call each of them with an
# array of cell line names and convert the result to float32 features.
# NOTE(review): presumably these are lookup models mapping a cell line name
# to its stored omics vector - confirm against the notebook that built them.
cancer_copy_number_model = tf.keras.models.load_model("models//cancer_copy_number_model_no_norm_common")
cancer_cell_gen_expr_model = tf.keras.models.load_model("models//cancer_cell_gen_expr_model_no_norm_common")
cancer_cell_gen_methy_model = tf.keras.models.load_model("models//cancer_cell_gen_methy_model_no_norm")
cancer_cell_gen_mut_model = tf.keras.models.load_model("models//cancer_cell_gen_mut_model_no_norm")


In [None]:
# Load the saved drug model; a later cell calls it with an array of drug ids
# to obtain per-drug feature vectors.
pubchem_drugs_rdkit_model = tf.keras.models.load_model("models//pubchem_drugs_rdkit_model_no_norm")

In [None]:
# Drug feature vectors for every training / validation row, obtained by
# calling the drug model on the row's drug id and cast to float32.
drug_features_train = (
    pubchem_drugs_rdkit_model(x_train["drug_id"].values)
    .numpy()
    .astype("float32")
)
drug_features_valid = (
    pubchem_drugs_rdkit_model(x_valid["drug_id"].values)
    .numpy()
    .astype("float32")
)

# Unfitted scaler; it is fitted on the training drug features in the
# following cell.
std = StandardScaler()


In [None]:
# Standardize the drug features: the scaler statistics are fitted on the
# training rows only and then applied unchanged to the validation rows.
# Both names are overwritten with their scaled versions.
drug_features_train = std.fit_transform(drug_features_train)
drug_features_valid = std.transform(drug_features_valid)

In [None]:
def _cell_line_features(lookup_model, cell_lines):
    """Call ``lookup_model`` on ``cell_lines`` and return a float32 numpy array."""
    return lookup_model(cell_lines).numpy().astype("float32")


# Copy number features per row.
omics_copy_number_train = _cell_line_features(cancer_copy_number_model, x_train["Cancer_Cell_Line"].values)
omics_copy_number_valid = _cell_line_features(cancer_copy_number_model, x_valid["Cancer_Cell_Line"].values)

# Gene expression features per row.
omics_gen_expr_train = _cell_line_features(cancer_cell_gen_expr_model, x_train["Cancer_Cell_Line"].values)
omics_gen_expr_valid = _cell_line_features(cancer_cell_gen_expr_model, x_valid["Cancer_Cell_Line"].values)

# Copy number and gene expression paired along a new trailing axis
# (channel 0 = copy number, channel 1 = gene expression).
omics_gen_copy_number_gen_expr_train = np.stack(
    [omics_copy_number_train, omics_gen_expr_train], axis = -1)
omics_gen_copy_number_gen_expr_valid = np.stack(
    [omics_copy_number_valid, omics_gen_expr_valid], axis = -1)

# Methylation features per row.
omics_gen_methyl_train = _cell_line_features(cancer_cell_gen_methy_model, x_train["Cancer_Cell_Line"].values)
omics_gen_methyl_valid = _cell_line_features(cancer_cell_gen_methy_model, x_valid["Cancer_Cell_Line"].values)

# Mutation features per row: computed on the CPU through batched predict()
# rather than a single direct call.
with tf.device('/cpu:0'):
    omics_gen_mut_train = cancer_cell_gen_mut_model.predict(
        x_train["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256
    ).astype("float32")
    omics_gen_mut_valid = cancer_cell_gen_mut_model.predict(
        x_valid["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256
    ).astype("float32")

# SMILES strings as single-column 2-D arrays.
smile_strings_train = x_train["Smiles"].values.reshape(-1,1)
smile_strings_valid = x_valid["Smiles"].values.reshape(-1,1)

In [None]:
# Extract features for the training rows. The result is a list with one
# array per extract_model output:
#   [gene expression, methylation, mutation, drug].
# NOTE(review): the input list order (GCN features, adjacency, gene
# expression, methylation, mutation) presumably mirrors model.input -
# confirm against the model definition.
Train_all_features = extract_model.predict([train_gcn_feats, train_adj_list,omics_gen_expr_train, 
                         omics_gen_methyl_train, omics_gen_mut_train])

In [None]:
# Same extraction for the validation rows; same input and output order as
# the training call. The variable name (sic: "featres") is referenced by a
# later cell, so it is kept as is.
valid_all_featres = extract_model.predict([ valid_gcn_feats, valid_adj_list, omics_gen_expr_valid, 
                           omics_gen_methyl_valid, omics_gen_mut_valid])

In [None]:
# extract_model returns its outputs in the order
#   gene_expr_output, methyl_out, mutation_out, drug_output
# so the training predictions are unpacked into one array per branch
# following that same order.
Train_gene, Train_methyl, Train_mut, Train_drug = Train_all_features

In [None]:
# Unpack the validation predictions into one array per branch; the order
# (gene expression, methylation, mutation, drug) is the output order of
# extract_model.
Valid_gene, Valid_methyl, Valid_mut, Valid_drug = valid_all_featres

In [None]:
# Display the shape of the validation drug embeddings.
Valid_drug.shape

In [None]:
# Save the extracted embeddings, one .npy file per split and branch.
# np.save does not create missing parent directories, so make sure the
# output folder exists first (`os` is imported in the top import cell).
os.makedirs('saved_output_data', exist_ok = True)

embeddings_to_save = [
    ('saved_output_data/Train_gene_new_split_CDR.npy', Train_gene),
    ('saved_output_data/Train_methyl_new_split_CDR.npy', Train_methyl),
    ('saved_output_data/Train_mut_new_split_CDR.npy', Train_mut),
    ('saved_output_data/Train_drug_new_split_CDR.npy', Train_drug),
    ('saved_output_data/Valid_gene_new_split_CDR.npy', Valid_gene),
    ('saved_output_data/Valid_methyl_new_split_CDR.npy', Valid_methyl),
    ('saved_output_data/Valid_mut_new_split_CDR.npy', Valid_mut),
    ('saved_output_data/Valid_drug_new_split_CDR.npy', Valid_drug),
]
for out_path, embedding in embeddings_to_save:
    np.save(out_path, embedding)

In [None]:
# Reshape the training targets into a single-column 2-D array.
y_train = y_train.reshape(-1,1)

In [None]:
# Reshape the validation targets into a single-column 2-D array.
y_valid = y_valid.reshape(-1,1)

In [None]:
# Save the IC50 targets next to the embeddings (validation first, then
# training).
for target_path, target_values in (
    ('saved_output_data/Valid_y.npy', y_valid),
    ('saved_output_data/Train_y.npy', y_train),
):
    np.save(target_path, target_values)