In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.python.keras import backend as K
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp
import random

2024-07-11 16:44:31.847669: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-11 16:44:32.109097: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-11 16:44:32.109154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-11 16:44:32.161690: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-11 16:44:32.245878: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load the drug / cell-line response table (columns shown below: drug_id, Cancer_Cell_Line, IC50).
drugs_cell_lines_ic50_df = pd.read_csv("..//data//drugs_cell_lines_ic50.csv")

In [3]:
drugs_cell_lines_ic50_df.shape

(208734, 3)

In [4]:
drugs_cell_lines_ic50_df.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,IC50
0,1001,ACH-002137,7.258918
1,1004,ACH-002137,-3.802467
2,1005,ACH-002137,4.146364
3,1006,ACH-002137,3.171367
4,1007,ACH-002137,-4.959442


In [5]:
# Load the drug_id -> SMILES string lookup table.
pubchem_drugs_smiles_df = pd.read_csv('..//data//drugs_smile_strings.csv')

In [6]:
pubchem_drugs_smiles_df.head()

Unnamed: 0,drug_id,Smiles
0,1242,COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O
1,179,O=c1[nH]cc(F)c(=O)[nH]1
2,86,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4c[nH]c5ccccc45)...
3,55,COc1cc(-c2nn(C3CCC(N4CCN(C(C)=O)CC4)CC3)c3ncnc...
4,1001,NC(=O)c1ncn(C2OC(COP(=O)(O)O)C(O)C2O)c1N


In [7]:
pubchem_drugs_smiles_df.shape

(238, 2)

In [8]:
# Attach each drug's SMILES string to its IC50 rows (default inner join on drug_id,
# so IC50 rows whose drug has no SMILES entry are dropped).
drugs_smiles_cell_lines_ic50_df = drugs_cell_lines_ic50_df.merge(
    pubchem_drugs_smiles_df,
    on = "drug_id",
)

In [9]:
# Keep only the modelling columns, in a fixed order. The explicit .copy() makes this an
# independent frame, so the later column assignment on it does not raise
# pandas' SettingWithCopyWarning.
drugs_smiles_cell_lines_ic50_df = drugs_smiles_cell_lines_ic50_df[["drug_id", "Cancer_Cell_Line", "Smiles", "IC50"]].copy()

In [10]:
drugs_smiles_cell_lines_ic50_df.dtypes

drug_id               int64
Cancer_Cell_Line     object
Smiles               object
IC50                float64
dtype: object

In [11]:
# Treat drug_id as a label rather than a number. .assign returns a new frame instead of
# writing into a frame that was itself produced by a column selection, which avoids the
# chained-assignment (SettingWithCopyWarning) path.
drugs_smiles_cell_lines_ic50_df = drugs_smiles_cell_lines_ic50_df.assign(
    drug_id = drugs_smiles_cell_lines_ic50_df["drug_id"].astype(object)
)

In [12]:
# Load the per-drug GCN node-feature lookup (indexed by drug_id further down).
# NOTE(review): pickle.load executes arbitrary code from the file - only load files you produced yourself.
with open("..//data//drug_gcn_features.pickle", "rb") as f:
    dict_features = pickle.load(f)

In [13]:
# Load the per-drug normalised adjacency-matrix lookup (indexed by drug_id further down).
# NOTE(review): pickle.load executes arbitrary code from the file - only load files you produced yourself.
with open("..//data//drug_gcn_normalized_adj_mats.pickle", "rb") as f:
    dict_normalized_adj_mats = pickle.load(f)

In [14]:
# DualGCN embedding table for the training split; below it is only used for its
# (Cell_Line, Drug_ID) pairs, to select which rows belong to the training set.
dualgcn_train = pd.read_csv("..//data//DualGCN_Embedding_train.csv")

In [15]:
# DualGCN embedding table for the test split; below it is only used for its
# (Cell_Line, Drug_ID) pairs, to select which rows belong to the test set.
dualgcn_test = pd.read_csv("..//data//DualGCN_Embedding_test.csv")

In [16]:
# Drug annotation list; used below to map internal drug_id values to PubCHEM identifiers.
pubchem_to_drugs_df = pd.read_csv('..//data//1.Drug_listMon Jun 24 09_00_55 2019.csv')

In [17]:
pubchem_to_drugs_df.head()

Unnamed: 0,drug_id,Name,Synonyms,Targets,Target pathway,PubCHEM,Sample Size,Count
0,1242,(5Z)-7-Oxozeaenol,"5Z-7-Oxozeaenol, LL-Z1640-2",TAK1,"Other, kinases",9863776,945,266
1,179,5-Fluorouracil,5-FU,Antimetabolite (DNA & RNA),Other,3385,968,266
2,86,A-443654,KIN001-139,"AKT1, AKT2, AKT3",PI3K/MTOR signaling,10172943,425,266
3,55,A-770041,KIN001-111,"LCK, FYN","Other, kinases",9549184,426,266
4,1001,AICA Ribonucleotide,"AICAR, N1-(b-D-Ribofuranosyl)-5-aminoimidazole...",AMPK agonist,Metabolism,65110,872,266


In [18]:
pubchem_to_drugs_df.shape

(266, 8)

In [19]:
# Keep only the id mapping columns. The explicit .copy() makes this an independent frame,
# so the column assignments performed on it afterwards do not raise SettingWithCopyWarning.
pubchem_to_drugs_df = pubchem_to_drugs_df[["drug_id", "PubCHEM"]].copy()

In [20]:
pubchem_to_drugs_df.dtypes

drug_id     int64
PubCHEM    object
dtype: object

In [21]:
# Blank out PubCHEM entries that are not a plain run of digits (they become NaN and are
# dropped in the next step). .assign builds a new frame rather than writing into a frame
# that came from a column selection, avoiding the chained-assignment warning.
pubchem_to_drugs_df = pubchem_to_drugs_df.assign(
    PubCHEM = [val if str(val).isdigit() else np.nan for val in pubchem_to_drugs_df["PubCHEM"]]
)

In [22]:
# Drop drugs without a usable PubCHEM id (the frame goes from 266 to 238 rows, see shapes shown).
pubchem_to_drugs_df = pubchem_to_drugs_df.dropna()

In [23]:
pubchem_to_drugs_df.dtypes

drug_id     int64
PubCHEM    object
dtype: object

In [24]:
pubchem_to_drugs_df.shape

(238, 2)

In [25]:
# Cast drug_id to str so it matches the string drug_id used on the feature frames when
# merging later. .assign returns a new frame, so no write happens on a dropna()-derived frame.
pubchem_to_drugs_df = pubchem_to_drugs_df.assign(
    drug_id = pubchem_to_drugs_df["drug_id"].astype(str)
)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Initial 80/20 split of the full table. Note that both halves are concatenated back
# together further down, before the DualGCN-based train/test selection, so this split
# effectively only fixes a shuffled row order.
ic50_feature_frame = drugs_smiles_cell_lines_ic50_df.drop(columns = ["IC50"])
ic50_target_values = drugs_smiles_cell_lines_ic50_df["IC50"].values
x_train, x_valid, y_train, y_valid = train_test_split(
    ic50_feature_frame,
    ic50_target_values,
    test_size = 0.20,
    random_state = 42,
)

In [28]:
# Cast Drug_ID to str so it can be joined against the object-typed PubCHEM column below.
dualgcn_train["Drug_ID"] = dualgcn_train["Drug_ID"].astype(str)

In [29]:
# Cast Drug_ID to str so it can be joined against the object-typed PubCHEM column below.
dualgcn_test["Drug_ID"] = dualgcn_test["Drug_ID"].astype(str)

In [30]:
pubchem_to_drugs_df.dtypes

drug_id    object
PubCHEM    object
dtype: object

In [31]:
# Translate the DualGCN training rows from PubCHEM ids to internal drug_id values
# (inner join: rows whose Drug_ID has no PubCHEM match are dropped).
dualgcn_train = pd.merge(
    pubchem_to_drugs_df,
    dualgcn_train,
    left_on = "PubCHEM",
    right_on = "Drug_ID",
)

In [32]:
# Only the (cell line, drug) pairs are needed; the embedding columns are discarded.
dualgcn_train = dualgcn_train[['Cell_Line', 'drug_id']]

In [33]:
dualgcn_train.head()

Unnamed: 0,Cell_Line,drug_id
0,ACH-000070,1242
1,ACH-000105,1242
2,ACH-000981,1242
3,ACH-000061,1242
4,ACH-000995,1242


In [34]:
# Translate the DualGCN test rows from PubCHEM ids to internal drug_id values
# (inner join: rows whose Drug_ID has no PubCHEM match are dropped).
dualgcn_test = pd.merge(
    pubchem_to_drugs_df,
    dualgcn_test,
    left_on = "PubCHEM",
    right_on = "Drug_ID",
)

In [35]:
dualgcn_test.head()

Unnamed: 0,drug_id,PubCHEM,Cell_Line,Drug_ID,Target,TCGA_Type,DrugEmbedding_Dim0,DrugEmbedding_Dim1,DrugEmbedding_Dim2,DrugEmbedding_Dim3,...,CellEmbedding_Dim246,CellEmbedding_Dim247,CellEmbedding_Dim248,CellEmbedding_Dim249,CellEmbedding_Dim250,CellEmbedding_Dim251,CellEmbedding_Dim252,CellEmbedding_Dim253,CellEmbedding_Dim254,CellEmbedding_Dim255
0,1242,9863776,ACH-000020,9863776,1.658465,ALL,-0.112977,-0.34584,0.000926,-0.024855,...,-0.176026,0.0543,-0.560812,-0.504415,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,-0.059895
1,1242,9863776,ACH-000142,9863776,-0.321683,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,-0.129487,0.0543,2.323304,0.499483,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,-0.025697
2,1242,9863776,ACH-000547,9863776,0.950017,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,0.242312,0.057235,1.133223,0.433553,-0.203844,-0.062081,-0.203995,-0.085297,-0.181583,-0.032253
3,1242,9863776,ACH-000242,9863776,1.190755,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,-0.175997,0.0543,2.544746,0.372183,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,0.0173
4,1242,9863776,ACH-000018,9863776,0.408545,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,-0.176026,0.0543,0.922048,-0.504415,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,-0.059895


In [36]:
# Only the (cell line, drug) pairs are needed; the embedding columns are discarded.
dualgcn_test = dualgcn_test[['Cell_Line', 'drug_id']]

In [37]:
dualgcn_test.head()

Unnamed: 0,Cell_Line,drug_id
0,ACH-000020,1242
1,ACH-000142,1242
2,ACH-000547,1242
3,ACH-000242,1242
4,ACH-000018,1242


In [38]:
dualgcn_train.dtypes

Cell_Line    object
drug_id      object
dtype: object

In [39]:
x_train.dtypes

drug_id             object
Cancer_Cell_Line    object
Smiles              object
dtype: object

In [40]:
# Cast drug_id to str so it matches the str drug_id of the DualGCN pair tables.
# x_train comes out of train_test_split (a row selection of the source frame), so build a
# new frame with .assign instead of assigning into it, which would raise SettingWithCopyWarning.
x_train = x_train.assign(drug_id = x_train['drug_id'].astype(str))

In [41]:
# Cast drug_id to str so it matches the str drug_id of the DualGCN pair tables.
# x_valid comes out of train_test_split (a row selection of the source frame), so build a
# new frame with .assign instead of assigning into it, which would raise SettingWithCopyWarning.
x_valid = x_valid.assign(drug_id = x_valid['drug_id'].astype(str))

In [42]:
# Re-join the two feature halves of the initial split into one frame with a fresh 0..n-1 index.
x_train_valid_feats = pd.concat([x_train, x_valid], ignore_index = True)

In [43]:
# Re-join the two target halves in the same train-then-valid order as the features,
# as a single-column frame with a fresh 0..n-1 index.
y_train_column = pd.DataFrame(y_train.reshape(-1,1))
y_valid_column = pd.DataFrame(y_valid.reshape(-1,1))
y_train_valid = pd.concat([y_train_column, y_valid_column], ignore_index = True)

In [44]:
# Put features and target side by side; both frames share the same fresh 0..n-1 index.
# The target column is still named 0 here and is renamed in the next step.
combo_train_valid = pd.concat([x_train_valid_feats, y_train_valid], axis = 1)

In [45]:
combo_train_valid.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,Smiles,0
0,1030,ACH-000295,O=c1cc(-c2cccc3c2Sc2ccccc2S3)oc(N2CCOCC2)c1,3.143044
1,203,ACH-000292,Cc1ccc2nc(NCCN)c3ncc(C)n3c2c1,2.77262
2,1072,ACH-002273,NC(=O)C(CCC(F)(F)F)N(Cc1ccc(-c2ncon2)cc1F)S(=O...,3.809977
3,150,ACH-000631,CC(O)(CS(=O)(=O)c1ccc(F)cc1)C(=O)Nc1ccc(C#N)c(...,3.05303
4,282,ACH-000270,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,-0.367467


In [46]:
# Give the target column (previously labelled 0) its proper name.
combo_train_valid.columns = ['drug_id', 'Cancer_Cell_Line', 'Smiles', 'IC50']

In [47]:
# Keep only the (cell line, drug) pairs that appear in the DualGCN training split.
# Inner join, so rows without a matching pair are dropped.
x_y_train = pd.merge(
    combo_train_valid,
    dualgcn_train,
    left_on = ['Cancer_Cell_Line','drug_id'],
    right_on = [ 'Cell_Line','drug_id'],
)

In [48]:
x_y_train.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,Smiles,IC50,Cell_Line
0,203,ACH-000292,Cc1ccc2nc(NCCN)c3ncc(C)n3c2c1,2.77262,ACH-000292
1,282,ACH-000270,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,-0.367467,ACH-000270
2,51,ACH-000087,Cc1nc(Nc2ncc(C(=O)Nc3c(C)cccc3Cl)s2)cc(N2CCN(C...,2.620152,ACH-000087
3,167,ACH-000948,NCC(=O)Nc1ccc(-n2nc(C(F)(F)F)cc2-c2ccc3c(ccc4c...,2.061684,ACH-000948
4,180,ACH-000971,CC=C(C)C(=O)OC1C(C)=C2C(C1OC(=O)CCCCCCC)C(C)(O...,-5.105745,ACH-000971


In [49]:
x_y_train.shape

(69214, 5)

In [50]:
# Keep only the (cell line, drug) pairs that appear in the DualGCN test split.
# Inner join, so rows without a matching pair are dropped.
x_y_test = pd.merge(
    combo_train_valid,
    dualgcn_test,
    left_on = ['Cancer_Cell_Line','drug_id'],
    right_on = [ 'Cell_Line','drug_id'],
)

In [51]:
x_y_test.shape

(17316, 5)

In [52]:
# Final 80/20 train/validation split of the DualGCN training rows (fixed seed).
x_y_train_features = x_y_train.drop(columns = ["IC50", 'Cell_Line'])
x_y_train_targets = x_y_train["IC50"].values
x_train, x_valid, y_train, y_valid = train_test_split(
    x_y_train_features,
    x_y_train_targets,
    test_size = 0.2,
    random_state = 42,
)

In [53]:
# Report the sizes of the train / validation features and targets.
for split_label, split_data in (
    ("x train shape", x_train),
    ("x valid shape", x_valid),
    ("y train shape", y_train),
    ("y valid shape", y_valid),
):
    print(split_label, split_data.shape)

x train shape (55371, 3)
x valid shape (13843, 3)
y train shape (55371,)
y valid shape (13843,)


In [54]:
# Separate the held-out test rows into features and IC50 targets.
x_test = x_y_test.drop(columns = ["IC50", 'Cell_Line'])
y_test = x_y_test["IC50"].values

In [55]:
# Report the sizes of the test features and targets.
for split_label, split_data in (("x test shape", x_test), ("y test shape", y_test)):
    print(split_label, split_data.shape)

x test shape (17316, 3)
y test shape (17316,)


In [56]:
np.mean(y_train), np.std(y_train), np.mean(y_valid),  np.std(y_valid)

(2.0558525880515073, 2.8441288895772825, 2.0230610204435453, 2.835856668835907)

In [57]:
# from deepcdr
# (2.0558525880515073, 2.8441288895772825, 2.0230610204435453, 2.835856668835907)

In [58]:
np.mean(y_test), np.std(y_test),

(2.063824293081543, 2.8309603843576316)

In [59]:
# from deepcdr
# (2.063824293081543, 2.8309603843576316)

In [60]:
x_train.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,Smiles
54743,180,ACH-000358,CC=C(C)C(=O)OC1C(C)=C2C(C1OC(=O)CCCCCCC)C(C)(O...
13257,260,ACH-000476,CCN1CCN(Cc2ccc(NC(=O)c3ccc(C)c(Oc4ccnc5[nH]ccc...
37343,308,ACH-000151,COc1cc2c(Oc3ccc(NC(=O)C4(C(=O)Nc5ccc(F)cc5)CC4...
33226,106,ACH-000218,COc1cc(N2CCN(C)CC2)ccc1Nc1ncc2c(n1)N(C)c1ccccc...
9313,1004,ACH-000210,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...


In [61]:
# Look up, row by row, the GCN node features and the normalised adjacency matrix
# of the drug in every training sample.
train_drug_ids = x_train["drug_id"].values
train_gcn_feats = [dict_features[drug_id] for drug_id in train_drug_ids]
train_adj_list = [dict_normalized_adj_mats[drug_id] for drug_id in train_drug_ids]

In [62]:
# Look up, row by row, the GCN node features and the normalised adjacency matrix
# of the drug in every validation sample.
valid_drug_ids = x_valid["drug_id"].values
valid_gcn_feats = [dict_features[drug_id] for drug_id in valid_drug_ids]
valid_adj_list = [dict_normalized_adj_mats[drug_id] for drug_id in valid_drug_ids]

In [63]:
# Look up, row by row, the GCN node features and the normalised adjacency matrix
# of the drug in every test sample.
test_drug_ids = x_test["drug_id"].values
test_gcn_feats = [dict_features[drug_id] for drug_id in test_drug_ids]
test_adj_list = [dict_normalized_adj_mats[drug_id] for drug_id in test_drug_ids]

In [64]:
import numpy as np

In [65]:
%%time
# Stack the per-sample feature matrices into one array per split and store them as
# float16 to save memory. The names are rebound from lists to arrays, so this cell
# is meant to be run once after the lookup cells.
train_gcn_feats = np.array(train_gcn_feats).astype("float16")
valid_gcn_feats = np.array(valid_gcn_feats).astype("float16")
test_gcn_feats = np.array(test_gcn_feats).astype("float16")

CPU times: user 2.6 s, sys: 656 ms, total: 3.26 s
Wall time: 3.27 s


In [66]:
%%time
# Stack the per-sample adjacency matrices into one array per split and store them as
# float16 to save memory. The names are rebound from lists to arrays.
train_adj_list = np.array(train_adj_list).astype("float16")
valid_adj_list = np.array(valid_adj_list).astype("float16")
test_adj_list = np.array(test_adj_list).astype("float16")

CPU times: user 3.44 s, sys: 888 ms, total: 4.33 s
Wall time: 4.34 s


In [67]:
# Load the saved Keras models that are called below with cell-line ids and return
# that cell line's omics feature vector (copy number, gene expression, methylation, mutation).
# NOTE(review): only the copy-number and gene-expression outputs are fed to the network
# built later in this notebook.
cancer_copy_number_model = tf.keras.models.load_model("..//models//cancer_copy_number_model_no_norm_common")
cancer_cell_gen_expr_model = tf.keras.models.load_model("..//models//cancer_cell_gen_expr_model_no_norm_common")
cancer_cell_gen_methy_model = tf.keras.models.load_model("..//models//cancer_cell_gen_methy_model_no_norm")
cancer_cell_gen_mut_model = tf.keras.models.load_model("..//models//cancer_cell_gen_mut_model_no_norm")

2024-07-11 16:45:00.147260: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31141 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0


















In [68]:
# Load the saved Keras model that is called below with drug ids and returns
# a per-drug descriptor vector.
# (name suggests RDKit descriptors - TODO confirm)
pubchem_drugs_rdkit_model = tf.keras.models.load_model("..//models//pubchem_drugs_rdkit_model_no_norm")





In [69]:
# Scaler for the drug descriptor features; it is fit on the training split only (see below).
std = StandardScaler()

In [70]:
# Per-sample drug descriptor vectors for each split, stored as float32.
# These arrays are not among the inputs of the network assembled later in this notebook.
drug_features_train, drug_features_valid, drug_features_test = (
    pubchem_drugs_rdkit_model(split_frame["drug_id"].values).numpy().astype("float32")
    for split_frame in (x_train, x_valid, x_test)
)

In [71]:
drug_features_train.shape, drug_features_valid.shape, drug_features_test.shape

((55371, 106), (13843, 106), (17316, 106))

In [72]:
np.isinf(drug_features_train).sum()

0

In [73]:
# Fit the scaler on the training split only and standardise it.
# The name is overwritten in place, so re-running this cell alone would re-fit the
# scaler on already-standardised data.
drug_features_train = std.fit_transform(drug_features_train)

In [74]:
# Apply the training-set statistics to validation and test (no re-fitting, so no leakage).
# Names are overwritten in place; re-running this cell alone would scale them twice.
drug_features_valid = std.transform(drug_features_valid)
drug_features_test = std.transform(drug_features_test)

In [75]:
# Copy-number feature vector of every sample's cell line, per split, stored as float16.
omics_copy_number_train, omics_copy_number_valid, omics_copy_number_test = (
    cancer_copy_number_model(split_frame["Cancer_Cell_Line"].values).numpy().astype("float16")
    for split_frame in (x_train, x_valid, x_test)
)

In [76]:
omics_copy_number_train.shape, omics_copy_number_valid.shape, omics_copy_number_test.shape

((55371, 691), (13843, 691), (17316, 691))

In [77]:
# Gene-expression feature vector of every sample's cell line, per split, stored as float16.
omics_gen_expr_train, omics_gen_expr_valid, omics_gen_expr_test = (
    cancer_cell_gen_expr_model(split_frame["Cancer_Cell_Line"].values).numpy().astype("float16")
    for split_frame in (x_train, x_valid, x_test)
)

In [78]:
omics_gen_expr_train.shape, omics_gen_expr_valid.shape, omics_gen_expr_test.shape

((55371, 691), (13843, 691), (17316, 691))

In [79]:
# Pair copy number (channel 0) with gene expression (channel 1) along a new last axis:
# two (samples, genes) arrays become one (samples, genes, 2) array.
copy_number_channel_train = omics_copy_number_train[..., np.newaxis]
gene_expr_channel_train = omics_gen_expr_train[..., np.newaxis]
omics_gen_copy_number_gen_expr_train = np.concatenate(
    [copy_number_channel_train, gene_expr_channel_train], axis = -1
)

In [80]:
omics_gen_copy_number_gen_expr_train.shape

(55371, 691, 2)

In [81]:
# Pair copy number (channel 0) with gene expression (channel 1) along a new last axis:
# two (samples, genes) arrays become one (samples, genes, 2) array.
copy_number_channel_valid = omics_copy_number_valid[..., np.newaxis]
gene_expr_channel_valid = omics_gen_expr_valid[..., np.newaxis]
omics_gen_copy_number_gen_expr_valid = np.concatenate(
    [copy_number_channel_valid, gene_expr_channel_valid], axis = -1
)

In [82]:
omics_gen_copy_number_gen_expr_valid.shape

(13843, 691, 2)

In [83]:
# Pair copy number (channel 0) with gene expression (channel 1) along a new last axis:
# two (samples, genes) arrays become one (samples, genes, 2) array.
copy_number_channel_test = omics_copy_number_test[..., np.newaxis]
gene_expr_channel_test = omics_gen_expr_test[..., np.newaxis]
omics_gen_copy_number_gen_expr_test = np.concatenate(
    [copy_number_channel_test, gene_expr_channel_test], axis = -1
)

In [84]:
omics_gen_copy_number_gen_expr_test.shape

(17316, 691, 2)

In [85]:
# Methylation feature vector of every sample's cell line, per split, stored as float16.
# These arrays are not among the inputs of the network assembled later in this notebook.
omics_gen_methyl_train, omics_gen_methyl_valid, omics_gen_methyl_test = (
    cancer_cell_gen_methy_model(split_frame["Cancer_Cell_Line"].values).numpy().astype("float16")
    for split_frame in (x_train, x_valid, x_test)
)

In [86]:
omics_gen_methyl_train.shape, omics_gen_methyl_valid.shape, omics_gen_methyl_test.shape

((55371, 808), (13843, 808), (17316, 808))

In [87]:
# Mutation feature vector of every sample's cell line, per split, stored as float16.
# Prediction is pinned to the CPU and done in batches of 256.
# These arrays are wide (34673 columns per the shapes shown) and are not among the
# inputs of the network assembled later in this notebook.
with tf.device('/cpu:0'):
    omics_gen_mut_train, omics_gen_mut_valid, omics_gen_mut_test = (
        cancer_cell_gen_mut_model.predict(
            split_frame["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256
        ).astype("float16")
        for split_frame in (x_train, x_valid, x_test)
    )



In [88]:
omics_gen_mut_train.shape, omics_gen_mut_valid.shape, omics_gen_mut_test.shape

((55371, 34673), (13843, 34673), (17316, 34673))

In [89]:
# SMILES strings of each split as (samples, 1) column arrays.
smile_strings_train, smile_strings_valid, smile_strings_test = (
    split_frame["Smiles"].values.reshape(-1,1)
    for split_frame in (x_train, x_valid, x_test)
)

In [90]:
smile_strings_train.shape, smile_strings_valid.shape, smile_strings_test.shape

((55371, 1), (13843, 1), (17316, 1))

In [91]:
# Paths to the list of cell lines and the list of genes.
# NOTE(review): selected_info_common_genes is only referenced from commented-out code
# in this notebook; the gene list actually used is loaded from common_genes.pickle.
selected_info_common_cell_lines = "..//data//cellline_list.txt"
selected_info_common_genes = "..//data//gene_list.txt"

In [92]:
# Tab-separated protein-protein interaction file; the first two columns of each line
# are read below as an interacting gene pair.
PPI_file = "..//data//PPI_network.txt"

In [93]:
# Read the cell-line ids, one per line, with surrounding whitespace removed.
with open(selected_info_common_cell_lines) as cell_line_file:
    common_cell_lines = [raw_line.strip() for raw_line in cell_line_file]

In [94]:
len(common_cell_lines)

525

In [95]:
# Load the ordered gene list; a gene's position in this list is its node index in the PPI graph.
# NOTE(review): pickle.load executes arbitrary code from the file - only load files you produced yourself.
with open("..//data//common_genes.pickle", "rb") as f:
    common_genes = pickle.load(f)

In [96]:
len(common_genes)

691

In [97]:
len(np.unique(common_genes))

691

In [98]:
# Map every gene name to its position in common_genes (its node index in the PPI graph).
idx_dic = {gene_name: position for position, gene_name in enumerate(common_genes)}

In [99]:
idx_dic['SUFU']

1

In [100]:
# One empty neighbour list per gene.
# NOTE(review): the next cell re-creates this list before filling it, so this cell is redundant.
ppi_adj_info = [[] for item in common_genes] 

In [101]:
# Build, for every gene, the list of indices of the genes it interacts with (PPI edges).
# ppi_adj_info[i] holds the neighbour indices of gene i, indexed through idx_dic.
ppi_adj_info = [[] for _ in common_genes]
with open(PPI_file) as ppi_handle:  # context manager so the file handle is always closed
    for line in ppi_handle:
        # Strip the line terminator first so a gene in the last column never carries "\n".
        fields = line.rstrip("\r\n").split('\t')
        if len(fields) < 2:
            # Blank or malformed line: nothing to record.
            continue
        gene1, gene2 = fields[0], fields[1]
        # idx_dic has exactly the genes of common_genes as keys; dict lookup is O(1)
        # whereas `in common_genes` scans the whole list for every line.
        if gene1 in idx_dic and gene2 in idx_dic:
            # An edge is only taken from lines where gene1's index <= gene2's index and is
            # then stored in both directions, keeping the adjacency symmetric.
            # NOTE(review): presumably the PPI file lists every interaction in both
            # orientations; if not, pairs listed only as (higher index, lower index)
            # are silently skipped - TODO confirm against the file.
            if idx_dic[gene1]<=idx_dic[gene2]:
                ppi_adj_info[idx_dic[gene1]].append(idx_dic[gene2])
                ppi_adj_info[idx_dic[gene2]].append(idx_dic[gene1])

In [102]:
len(ppi_adj_info)

691

In [103]:
len(ppi_adj_info[0])

247

In [104]:
len(common_genes)

691

In [105]:
def CelllineGraphAdjNorm(ppi_adj_info,common_genes = common_genes):
    """Turn per-gene neighbour lists into a normalised dense adjacency matrix.

    Parameters
    ----------
    ppi_adj_info : list of lists; entry i holds the column indices of gene i's neighbours.
    common_genes : sequence whose length fixes the number of graph nodes.

    Returns
    -------
    The (n_genes, n_genes) matrix produced by NormalizeAdj for the 0/1 adjacency.

    Raises
    ------
    AssertionError if the 0/1 adjacency built from the lists is not symmetric.
    """
    n_genes = len(common_genes)
    adjacency = np.zeros((n_genes, n_genes), dtype='float32')
    for row_index, neighbour_indices in enumerate(ppi_adj_info):
        for col_index in neighbour_indices:
            adjacency[row_index, col_index] = 1

    # The graph is undirected, so the adjacency must equal its transpose.
    assert np.allclose(adjacency, adjacency.T)
    return NormalizeAdj(adjacency)

In [106]:
def NormalizeAdj(adj):
    """Symmetrically normalise an adjacency matrix after adding self-loops.

    Computes (A_hat . D)^T . D with A_hat = adj + I and D = diag(rowsum(A_hat) ** -0.5),
    which equals D . A_hat . D when adj is symmetric.

    Parameters
    ----------
    adj : square 2-D array.

    Returns
    -------
    Dense normalised matrix of the same shape as adj (the input is not modified).
    """
    with_self_loops = adj + np.eye(adj.shape[0])
    row_degrees = np.array(with_self_loops.sum(1))
    inv_sqrt_degrees = np.power(row_degrees, -0.5).flatten()
    degree_scaling = sp.diags(inv_sqrt_degrees, 0).toarray()
    scaled_columns = with_self_loops.dot(degree_scaling)
    return scaled_columns.transpose().dot(degree_scaling)

In [107]:
# Normalised gene-gene adjacency matrix of the PPI graph, shared by all cell lines.
ppi_adj = CelllineGraphAdjNorm(ppi_adj_info,common_genes)

In [108]:
ppi_adj.shape

(691, 691)

In [109]:
ppi_adj

array([[0.00403226, 0.00546522, 0.00684739, ..., 0.        , 0.003636  ,
        0.00582104],
       [0.00546522, 0.00740741, 0.        , ..., 0.        , 0.00492814,
        0.00788968],
       [0.00684739, 0.        , 0.01162791, ..., 0.        , 0.00617449,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00510204, 0.        ,
        0.00654785],
       [0.003636  , 0.00492814, 0.00617449, ..., 0.        , 0.00327869,
        0.        ],
       [0.00582104, 0.00788968, 0.        , ..., 0.00654785, 0.        ,
        0.00840336]])

In [110]:
# Add a leading axis of size 1 so the matrix broadcasts over the sample axis in the matmul below.
# The name is rebound, so re-running this cell alone would add yet another axis.
ppi_adj = np.expand_dims(ppi_adj,0)

In [111]:
ppi_adj.shape

(1, 691, 691)

In [112]:
# Apply one fixed graph-propagation step: every gene's (copy number, expression) pair is
# replaced by the adjacency-weighted sum over its PPI neighbours (and itself).
# The (1, genes, genes) matrix broadcasts against each (samples, genes, 2) array.
# NOTE(review): the operands are float16 but ppi_adj looks like float64 (see its repr),
# so the results are presumably float64 and much larger in memory - TODO confirm.
omics_gen_copy_number_gen_expr_train_new = (ppi_adj@omics_gen_copy_number_gen_expr_train)
omics_gen_copy_number_gen_expr_valid_new = (ppi_adj@omics_gen_copy_number_gen_expr_valid)
omics_gen_copy_number_gen_expr_test_new = (ppi_adj@omics_gen_copy_number_gen_expr_test)

In [113]:
omics_gen_copy_number_gen_expr_train_new.shape, omics_gen_copy_number_gen_expr_valid_new.shape, omics_gen_copy_number_gen_expr_test_new.shape

((55371, 691, 2), (13843, 691, 2), (17316, 691, 2))

In [114]:
# Channel 0 of the propagated array is copy number; the slice keeps the last axis,
# giving (samples, genes, 1).
copy_number_train, copy_number_valid, copy_number_test = (
    propagated[:, :, 0:1]
    for propagated in (
        omics_gen_copy_number_gen_expr_train_new,
        omics_gen_copy_number_gen_expr_valid_new,
        omics_gen_copy_number_gen_expr_test_new,
    )
)

In [115]:
copy_number_train.shape, copy_number_valid.shape, copy_number_test.shape

((55371, 691, 1), (13843, 691, 1), (17316, 691, 1))

In [116]:
# Channel 1 of the propagated array is gene expression; the slice keeps the last axis,
# giving (samples, genes, 1).
gene_expr_train, gene_expr_valid, gene_expr_test = (
    propagated[:, :, 1:2]
    for propagated in (
        omics_gen_copy_number_gen_expr_train_new,
        omics_gen_copy_number_gen_expr_valid_new,
        omics_gen_copy_number_gen_expr_test_new,
    )
)

In [117]:
gene_expr_train.shape, gene_expr_valid.shape, gene_expr_test.shape

((55371, 691, 1), (13843, 691, 1), (17316, 691, 1))

In [118]:
# Bundle the validation inputs (in the same order as the model's inputs) with their targets
# so they can be saved and reused outside this notebook.
valid_items = [[ valid_gcn_feats, valid_adj_list,
                           copy_number_valid, gene_expr_valid], y_valid]

In [119]:
# Persist the validation bundle to disk (overwrites any existing file at this path).
with open("..//data//valid_items.pickle", "wb") as f:
    pickle.dump(valid_items, f)

In [120]:
# Seed Python, NumPy and TensorFlow before the first layer is created so that weight
# initialisation, dropout masks and fit() shuffling are repeatable from a fresh kernel
# (as far as the hardware's op determinism allows).
tf.keras.utils.set_random_seed(42)

# Drug branch: two graph-convolution steps over a (100, 75) node-feature matrix and its
# (100, 100) normalised adjacency matrix, followed by mean pooling over the nodes.
input_gcn_features = tf.keras.layers.Input(shape = (100, 75))
input_norm_adj_mat = tf.keras.layers.Input(shape = (100, 100))

# Dot(1) contracts axis 1 of both tensors, i.e. it computes A^T . X per sample
# (identical to A . X when the adjacency is symmetric).
mult_1 = tf.keras.layers.Dot(1)([input_norm_adj_mat, input_gcn_features])
dense_out = tf.keras.layers.Dense(256, activation = "relu")(mult_1)
dense_out = tf.keras.layers.BatchNormalization()(dense_out)
dense_out = tf.keras.layers.Dropout(0.1)(dense_out)

# Second propagation step on the hidden node representation.
mult_2 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
dense_out = tf.keras.layers.Dense(128, activation = "relu")(mult_2)
dense_out = tf.keras.layers.BatchNormalization()(dense_out)
dense_out = tf.keras.layers.Dropout(0.1)(dense_out)

# Average over the node axis: one 128-dimensional vector per drug.
dense_out = tf.keras.layers.GlobalAvgPool1D()(dense_out)

In [121]:
# Cell-line branches: one for copy number (CNV) and one for gene expression.
# Both use exactly the same layer stack, so it is defined once and instantiated twice.
dropout1 = 0.10
dropout2 = 0.20  # not used by either branch; kept so the name stays defined


def _build_omics_branch(n_genes, dropout_rate):
    """Build one per-gene omics branch and return ``(input_tensor, pooled_output)``.

    The input has shape (n_genes, 1). Every Dense layer acts on the last axis, i.e. on
    each gene independently: two linear layers (32, 128 units) followed by four
    Dense(256, relu) -> BatchNormalization -> Dropout blocks, then a mean over the gene
    axis, yielding one 256-dimensional vector per sample.
    """
    branch_input = tf.keras.layers.Input(shape = (n_genes, 1))

    hidden = tf.keras.layers.Dense(32)(branch_input)
    hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)
    hidden = tf.keras.layers.Dense(128)(hidden)
    hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)

    for _ in range(4):
        hidden = tf.keras.layers.Dense(256, activation = "relu")(hidden)
        hidden = tf.keras.layers.BatchNormalization()(hidden)
        hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)

    pooled = tf.keras.layers.GlobalAvgPool1D()(hidden)
    return branch_input, pooled


# Layers are created in the same order as before (all CNV layers, then all expression
# layers). Each branch now takes its gene count from its own feature array; both are 691.
input_cnv, dense_out_cnv = _build_omics_branch(omics_copy_number_train.shape[1], dropout1)
input_gene_expr, dense_out_expr = _build_omics_branch(omics_gen_expr_train.shape[1], dropout1)

In [122]:
# Fuse the three pooled representations (CNV, gene expression, drug graph) into one vector.
all_omics = tf.keras.layers.Concatenate()([ dense_out_cnv, dense_out_expr, dense_out])

In [123]:
# Regression head on the fused vector: 256 -> 128 -> 10 units with tanh activations,
# with dropout after the first two layers.
x = tf.keras.layers.Dense(256,activation = 'tanh')(all_omics)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(128,activation = 'tanh')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(10,activation = 'tanh')(x)

In [124]:
# Single linear output unit: the predicted IC50 value.
final_out_layer = tf.keras.layers.Dense(1)

In [125]:
# Apply the output layer to the last hidden representation.
final_out = final_out_layer(x)

In [126]:
# Assemble the model. Input order matters: drug node features, drug adjacency,
# copy number, gene expression - fit() and the saved validation bundle use this same order.
simplegcn = tf.keras.models.Model([input_gcn_features, input_norm_adj_mat, input_cnv, input_gene_expr], final_out)

In [127]:
simplegcn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 691, 1)]             0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 691, 1)]             0         []                            
                                                                                                  
 dense_2 (Dense)             (None, 691, 32)              64        ['input_3[0][0]']             
                                                                                                  
 dense_8 (Dense)             (None, 691, 32)              64        ['input_4[0][0]']             
                                                                                              

In [128]:
# Train with MSE loss and report RMSE.
# `learning_rate` replaces the deprecated `lr` alias: recent Keras optimizers only log a
# warning for `lr` and then ignore it, so the old spelling matched 0.001 only because
# that is also Adam's default.
simplegcn.compile(
    loss = tf.keras.losses.MeanSquaredError(),
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
    metrics = [tf.keras.metrics.RootMeanSquaredError()],
)



In [129]:
%%time
# Train for up to 1000 epochs; stop once val_loss has not improved for 20 epochs
# and roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor = "val_loss",
    patience = 20,
    restore_best_weights = True,
    mode = "min",
)

# Inputs are listed in the same order as the model's Input layers.
train_inputs = [train_gcn_feats, train_adj_list, copy_number_train, gene_expr_train]
valid_inputs = [valid_gcn_feats, valid_adj_list, copy_number_valid, gene_expr_valid]

history = simplegcn.fit(
    train_inputs,
    y_train,
    batch_size = 512,
    epochs = 1000,
    verbose = 1,
    validation_data = (valid_inputs, y_valid),
    callbacks = [early_stopping],
    validation_batch_size = 512,
    shuffle = True,
)

Epoch 1/1000


2024-07-11 16:45:54.691523: I external/local_xla/xla/service/service.cc:168] XLA service 0x146a79a10fa0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-11 16:45:54.691566: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Tesla V100S-PCIE-32GB, Compute Capability 7.0
2024-07-11 16:45:54.725810: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-11 16:45:54.801654: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1720734354.999532   65908 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


In [130]:
# Save the trained model (with the best weights restored by early stopping) as a
# SavedModel directory; an existing export at this path is overwritten.
simplegcn.save("..//models//dualgcn_trained_on_domain")

INFO:tensorflow:Assets written to: ..//models//dualgcn_trained_on_domain/assets


INFO:tensorflow:Assets written to: ..//models//dualgcn_trained_on_domain/assets
