In [143]:
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.python.keras import backend as K
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp
import random

In [2]:
drugs_cell_lines_ic50_df = pd.read_csv("data//drugs_cell_lines_ic50.csv")

In [3]:
drugs_cell_lines_ic50_df.shape

(208734, 3)

In [4]:
drugs_cell_lines_ic50_df.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,IC50
0,1001,ACH-002137,7.258918
1,1004,ACH-002137,-3.802467
2,1005,ACH-002137,4.146364
3,1006,ACH-002137,3.171367
4,1007,ACH-002137,-4.959442


In [5]:
pubchem_drugs_smiles_df = pd.read_csv('data//drugs_smile_strings.csv')

In [6]:
pubchem_drugs_smiles_df.head()

Unnamed: 0,drug_id,Smiles
0,1242,COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O
1,179,O=c1[nH]cc(F)c(=O)[nH]1
2,86,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4c[nH]c5ccccc45)...
3,55,COc1cc(-c2nn(C3CCC(N4CCN(C(C)=O)CC4)CC3)c3ncnc...
4,1001,NC(=O)c1ncn(C2OC(COP(=O)(O)O)C(O)C2O)c1N


In [7]:
pubchem_drugs_smiles_df.shape

(238, 2)

In [8]:
# Attach each drug's SMILES string to its (drug, cell line, IC50) rows.
# merge() defaults to an inner join, so only drug_ids present in both frames survive.
drugs_smiles_cell_lines_ic50_df = drugs_cell_lines_ic50_df.merge(
    pubchem_drugs_smiles_df,
    on="drug_id",
)

In [9]:
drugs_smiles_cell_lines_ic50_df = drugs_smiles_cell_lines_ic50_df[["drug_id", "Cancer_Cell_Line", "Smiles", "IC50"]]

In [10]:
drugs_smiles_cell_lines_ic50_df.dtypes

drug_id               int64
Cancer_Cell_Line     object
Smiles               object
IC50                float64
dtype: object

In [11]:
drugs_smiles_cell_lines_ic50_df["drug_id"] = drugs_smiles_cell_lines_ic50_df["drug_id"].astype(object)

In [14]:
with open("data//drug_gcn_features.pickle", "rb") as f:
    dict_features = pickle.load(f)

In [15]:
with open("data//drug_gcn_normalized_adj_mats.pickle", "rb") as f:
    dict_normalized_adj_mats = pickle.load(f)

In [16]:
dualgcn_train = pd.read_csv("data//DualGCN_Embedding_train.csv")

In [17]:
dualgcn_test = pd.read_csv("data//DualGCN_Embedding_test.csv")

In [18]:
pubchem_to_drugs_df = pd.read_csv('data//1.Drug_listMon Jun 24 09_00_55 2019.csv')

In [19]:
pubchem_to_drugs_df.head()

Unnamed: 0,drug_id,Name,Synonyms,Targets,Target pathway,PubCHEM,Sample Size,Count
0,1242,(5Z)-7-Oxozeaenol,"5Z-7-Oxozeaenol, LL-Z1640-2",TAK1,"Other, kinases",9863776,945,266
1,179,5-Fluorouracil,5-FU,Antimetabolite (DNA & RNA),Other,3385,968,266
2,86,A-443654,KIN001-139,"AKT1, AKT2, AKT3",PI3K/MTOR signaling,10172943,425,266
3,55,A-770041,KIN001-111,"LCK, FYN","Other, kinases",9549184,426,266
4,1001,AICA Ribonucleotide,"AICAR, N1-(b-D-Ribofuranosyl)-5-aminoimidazole...",AMPK agonist,Metabolism,65110,872,266


In [20]:
pubchem_to_drugs_df.shape

(266, 8)

In [21]:
pubchem_to_drugs_df = pubchem_to_drugs_df[["drug_id", "PubCHEM"]]

In [22]:
pubchem_to_drugs_df.dtypes

drug_id     int64
PubCHEM    object
dtype: object

In [25]:
# Keep a PubCHEM value only when its string form consists solely of digits;
# every other entry becomes NaN so that a later dropna() can remove the row.
pubchem_id_is_numeric = pubchem_to_drugs_df["PubCHEM"].map(lambda val: str(val).isdigit())
pubchem_to_drugs_df["PubCHEM"] = pubchem_to_drugs_df["PubCHEM"].where(pubchem_id_is_numeric)

In [26]:
pubchem_to_drugs_df = pubchem_to_drugs_df.dropna()

In [27]:
pubchem_to_drugs_df.dtypes

drug_id     int64
PubCHEM    object
dtype: object

In [28]:
pubchem_to_drugs_df.shape

(238, 2)

In [29]:
pubchem_to_drugs_df["drug_id"] = pubchem_to_drugs_df["drug_id"].astype(str)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out 20% of the (drug, cell line, SMILES) rows; the fixed seed makes the split
# reproducible. `columns=` replaces the positional axis argument of DataFrame.drop,
# which pandas has deprecated (the call above emitted a warning for it).
x_train, x_valid, y_train, y_valid = train_test_split(
    drugs_smiles_cell_lines_ic50_df.drop(columns=["IC50"]),
    drugs_smiles_cell_lines_ic50_df["IC50"].values,
    test_size=0.20,
    random_state=42,
)

  x_train, x_valid, y_train, y_valid = train_test_split(drugs_smiles_cell_lines_ic50_df.drop(["IC50"],1), drugs_smiles_cell_lines_ic50_df["IC50"].values,


In [32]:
dualgcn_train["Drug_ID"] = dualgcn_train["Drug_ID"].astype(str)

In [33]:
dualgcn_test["Drug_ID"] = dualgcn_test["Drug_ID"].astype(str)

In [34]:
pubchem_to_drugs_df.dtypes

drug_id    object
PubCHEM    object
dtype: object

In [35]:
dualgcn_train = pubchem_to_drugs_df.merge(dualgcn_train, left_on = ["PubCHEM"], right_on = ["Drug_ID"])

In [36]:
dualgcn_train = dualgcn_train[['Cell_Line', 'drug_id']]

In [37]:
dualgcn_train.head()

Unnamed: 0,Cell_Line,drug_id
0,ACH-000070,1242
1,ACH-000105,1242
2,ACH-000981,1242
3,ACH-000061,1242
4,ACH-000995,1242


In [38]:
dualgcn_test = pubchem_to_drugs_df.merge(dualgcn_test, left_on = ["PubCHEM"], right_on = ["Drug_ID"])

In [39]:
dualgcn_test.head()

Unnamed: 0,drug_id,PubCHEM,Cell_Line,Drug_ID,Target,TCGA_Type,DrugEmbedding_Dim0,DrugEmbedding_Dim1,DrugEmbedding_Dim2,DrugEmbedding_Dim3,...,CellEmbedding_Dim246,CellEmbedding_Dim247,CellEmbedding_Dim248,CellEmbedding_Dim249,CellEmbedding_Dim250,CellEmbedding_Dim251,CellEmbedding_Dim252,CellEmbedding_Dim253,CellEmbedding_Dim254,CellEmbedding_Dim255
0,1242,9863776,ACH-000020,9863776,1.658465,ALL,-0.112977,-0.34584,0.000926,-0.024855,...,-0.176026,0.0543,-0.560812,-0.504415,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,-0.059895
1,1242,9863776,ACH-000142,9863776,-0.321683,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,-0.129487,0.0543,2.323304,0.499483,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,-0.025697
2,1242,9863776,ACH-000547,9863776,0.950017,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,0.242312,0.057235,1.133223,0.433553,-0.203844,-0.062081,-0.203995,-0.085297,-0.181583,-0.032253
3,1242,9863776,ACH-000242,9863776,1.190755,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,-0.175997,0.0543,2.544746,0.372183,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,0.0173
4,1242,9863776,ACH-000018,9863776,0.408545,BLCA,-0.112977,-0.34584,0.000926,-0.024855,...,-0.176026,0.0543,0.922048,-0.504415,-0.20714,-0.064926,-0.206338,-0.088848,-0.18441,-0.059895


In [40]:
dualgcn_test = dualgcn_test[['Cell_Line', 'drug_id']]

In [41]:
dualgcn_test.head()

Unnamed: 0,Cell_Line,drug_id
0,ACH-000020,1242
1,ACH-000142,1242
2,ACH-000547,1242
3,ACH-000242,1242
4,ACH-000018,1242


In [42]:
dualgcn_train.dtypes

Cell_Line    object
drug_id      object
dtype: object

In [44]:
x_train.dtypes

drug_id             object
Cancer_Cell_Line    object
Smiles              object
dtype: object

In [50]:
x_train['drug_id'] = x_train['drug_id'].astype(str)

In [51]:
x_valid['drug_id'] = x_valid['drug_id'].astype(str)

In [52]:
x_train_valid_feats = pd.concat([x_train, x_valid], ignore_index = True)

In [53]:
y_train_valid = pd.concat([pd.DataFrame(y_train.reshape(-1,1)), pd.DataFrame(y_valid.reshape(-1,1))], ignore_index = True)

In [54]:
# Put the IC50 targets next to their feature rows (column-wise concatenation).
# `axis` is passed by keyword because pandas deprecated passing it positionally.
combo_train_valid = pd.concat([x_train_valid_feats, y_train_valid], axis=1)

  combo_train_valid = pd.concat([x_train_valid_feats, y_train_valid], 1)


In [55]:
combo_train_valid.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,Smiles,0
0,1059,ACH-001368,COc1ccc(-c2ccc3c(N4CCOCC4C)nc(N4CCOCC4C)nc3n2)...,1.086644
1,255,ACH-000142,COCC(=O)NCC=Cc1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)c(...,4.2278
2,178,ACH-000218,COc1ccc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)cc1OC,0.067505
3,152,ACH-002155,COc1cc2ncnc(-n3nc(-c4ccccn4)nc3N)c2cc1OC,4.89455
4,1243,ACH-000277,COc1cc(C=CC(=O)N2CCC=CC2=O)cc(OC)c1OC,4.572953


In [56]:
combo_train_valid.columns = ['drug_id', 'Cancer_Cell_Line', 'Smiles', 'IC50']

In [57]:
# Keep only the rows whose (cell line, drug_id) pair also occurs in dualgcn_train.
# merge() defaults to an inner join, so pairs without a match are dropped; the
# right-hand key column Cell_Line is carried along as an extra column.
x_y_train = combo_train_valid.merge(dualgcn_train, left_on = ['Cancer_Cell_Line','drug_id'], right_on = [ 'Cell_Line','drug_id'])

In [58]:
x_y_train.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,Smiles,IC50,Cell_Line
0,255,ACH-000142,COCC(=O)NCC=Cc1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)c(...,4.2278,ACH-000142
1,178,ACH-000218,COc1ccc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)cc1OC,0.067505,ACH-000218
2,1230,ACH-000090,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,0.717946,ACH-000090
3,167,ACH-000348,NCC(=O)Nc1ccc(-n2nc(C(F)(F)F)cc2-c2ccc3c(ccc4c...,1.902171,ACH-000348
4,1372,ACH-000914,CC(=O)Nc1cccc(-n2c(=O)n(C3CC3)c(=O)c3c(Nc4ccc(...,0.878946,ACH-000914


In [59]:
x_y_train.shape

(69214, 5)

In [60]:
# Same filter as for x_y_train, but against dualgcn_test: keep the rows whose
# (cell line, drug_id) pair occurs in the DualGCN test table (inner join).
x_y_test = combo_train_valid.merge(dualgcn_test, left_on = ['Cancer_Cell_Line','drug_id'], right_on = [ 'Cell_Line','drug_id'])

In [61]:
x_y_test.shape

(17316, 5)

In [62]:
# Re-split the filtered training pairs 80/20 into train and validation sets.
# IC50 is the target; Cell_Line duplicates Cancer_Cell_Line after the merge, so both
# are removed from the features. `columns=` replaces the deprecated positional axis.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_y_train.drop(columns=["IC50", "Cell_Line"]),
    x_y_train["IC50"].values,
    random_state=42,
    test_size=0.2,
)

  x_train, x_valid, y_train, y_valid = train_test_split(x_y_train.drop(["IC50", 'Cell_Line'],1),x_y_train["IC50"].values, random_state = 42, test_size = 0.2)


In [63]:
# Report the sizes of the train/validation features and targets.
for split_label, split_data in (
    ("x train", x_train),
    ("x valid", x_valid),
    ("y train", y_train),
    ("y valid", y_valid),
):
    print(split_label, "shape", split_data.shape)

x train shape (55371, 3)
x valid shape (13843, 3)
y train shape (55371,)
y valid shape (13843,)


In [64]:
# Separate the test features from the IC50 target. `columns=` replaces the positional
# axis argument of DataFrame.drop, which pandas has deprecated.
x_test, y_test = x_y_test.drop(columns=["IC50", "Cell_Line"]), x_y_test["IC50"].values

  x_test, y_test = x_y_test.drop(["IC50", 'Cell_Line'], 1), x_y_test["IC50"].values


In [65]:
print("x test shape", x_test.shape)
print("y test shape", y_test.shape)

x test shape (17316, 3)
y test shape (17316,)


In [66]:
x_train.head()

Unnamed: 0,drug_id,Cancer_Cell_Line,Smiles
54743,1373,ACH-000694,CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2...
13257,29,ACH-000440,Cc1ccc(NC(=O)c2cccc(C(C)(C)C#N)c2)cc1Nc1ccc2nc...
37343,1069,ACH-000828,Cl.Cl.O=c1cc(CN2CCOCC2)occ1OCCCCCSc1ccnc2cc(C(...
33226,1011,ACH-000504,CC1(C)CCC(c2ccc(Cl)cc2)=C(CN2CCN(c3ccc(C(=O)NS...
9313,274,ACH-000290,O=C(C=Cc1cccc(S(=O)(=O)Nc2ccccc2)c1)NO


In [67]:
# For every training row, in row order, fetch the drug's entry from the two pickled
# dictionaries (node features and normalized adjacency matrix), keyed by drug_id.
train_gcn_feats = [dict_features[drug_id] for drug_id in x_train["drug_id"].values]
train_adj_list = [dict_normalized_adj_mats[drug_id] for drug_id in x_train["drug_id"].values]

In [68]:
# For every validation row, in row order, fetch the drug's entry from the two pickled
# dictionaries (node features and normalized adjacency matrix), keyed by drug_id.
valid_gcn_feats = [dict_features[drug_id] for drug_id in x_valid["drug_id"].values]
valid_adj_list = [dict_normalized_adj_mats[drug_id] for drug_id in x_valid["drug_id"].values]

In [69]:
# For every test row, in row order, fetch the drug's entry from the two pickled
# dictionaries (node features and normalized adjacency matrix), keyed by drug_id.
test_gcn_feats = [dict_features[drug_id] for drug_id in x_test["drug_id"].values]
test_adj_list = [dict_normalized_adj_mats[drug_id] for drug_id in x_test["drug_id"].values]

In [70]:
import numpy as np

In [71]:
%%time
# Turn each list of per-row drug feature matrices into one ndarray and store it as
# float16 (presumably to keep the memory footprint down — TODO confirm precision is acceptable).
train_gcn_feats = np.array(train_gcn_feats).astype("float16")
valid_gcn_feats = np.array(valid_gcn_feats).astype("float16")
test_gcn_feats = np.array(test_gcn_feats).astype("float16")

CPU times: user 2.22 s, sys: 692 ms, total: 2.91 s
Wall time: 2.92 s


In [73]:
%%time
# Same conversion for the per-row drug adjacency matrices: one ndarray per split,
# stored as float16.
train_adj_list = np.array(train_adj_list).astype("float16")
valid_adj_list = np.array(valid_adj_list).astype("float16")
test_adj_list = np.array(test_adj_list).astype("float16")

CPU times: user 2.89 s, sys: 898 ms, total: 3.79 s
Wall time: 3.8 s


In [75]:
# Load the four saved Keras models for the omics modalities (copy number, gene
# expression, methylation, mutation). Later cells call each of them on an array of
# cell-line IDs to obtain one feature row per input ID.
cancer_copy_number_model = tf.keras.models.load_model("..//Joint_Learner//Models//cancer_copy_number_model_no_norm_common")
cancer_cell_gen_expr_model = tf.keras.models.load_model("..//Joint_Learner//Models//cancer_cell_gen_expr_model_no_norm_common")
cancer_cell_gen_methy_model = tf.keras.models.load_model("..//Joint_Learner//Models//cancer_cell_gen_methy_model_no_norm")
cancer_cell_gen_mut_model = tf.keras.models.load_model("..//Joint_Learner//Models//cancer_cell_gen_mut_model_no_norm")

2023-11-28 16:20:43.533448: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-28 16:20:45.315394: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30958 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:06:00.0, compute capability: 7.0


















In [76]:
# Load the saved Keras model for the drug side; a later cell calls it on arrays of
# drug_id values to obtain one descriptor row per drug.
pubchem_drugs_rdkit_model = tf.keras.models.load_model("..//Joint_Learner//Models//pubchem_drugs_rdkit_model_no_norm")





In [79]:
std = StandardScaler()

In [80]:
# Compute drug descriptor features for each split by calling the loaded drug model on
# the drug_id column, then convert the result to a float32 NumPy array.
# NOTE(review): these arrays are standardized below but are not among the inputs of the
# network built later in this notebook — confirm whether they are still needed.
drug_features_train = pubchem_drugs_rdkit_model(x_train["drug_id"].values).numpy().astype("float32")
drug_features_valid = pubchem_drugs_rdkit_model(x_valid["drug_id"].values).numpy().astype("float32")
drug_features_test = pubchem_drugs_rdkit_model(x_test["drug_id"].values).numpy().astype("float32")

# Earlier variant that kept the raw model output (no NumPy conversion), left for reference:
# drug_features_train = pubchem_drugs_rdkit_model(x_train["drug_id"].values)
# drug_features_valid = pubchem_drugs_rdkit_model(x_valid["drug_id"].values)

In [81]:
drug_features_train.shape, drug_features_valid.shape, drug_features_test.shape

((55371, 106), (13843, 106), (17316, 106))

In [82]:
np.isinf(drug_features_train).sum()

0

In [83]:
drug_features_train = std.fit_transform(drug_features_train)

In [84]:
drug_features_valid = std.transform(drug_features_valid)
drug_features_test = std.transform(drug_features_test)

In [85]:
# Copy-number features: call the copy-number model on the cell-line IDs of each split,
# convert the output to NumPy and store it as float16 (one row per sample).
omics_copy_number_train = cancer_copy_number_model(x_train["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_copy_number_valid = cancer_copy_number_model(x_valid["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_copy_number_test = cancer_copy_number_model(x_test["Cancer_Cell_Line"].values).numpy().astype("float16")

In [86]:
omics_copy_number_train.shape, omics_copy_number_valid.shape, omics_copy_number_test.shape

((55371, 691), (13843, 691), (17316, 691))

In [87]:
# Gene-expression features: call the expression model on the cell-line IDs of each
# split, convert the output to NumPy and store it as float16 (one row per sample).
omics_gen_expr_train = cancer_cell_gen_expr_model(x_train["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_gen_expr_valid = cancer_cell_gen_expr_model(x_valid["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_gen_expr_test = cancer_cell_gen_expr_model(x_test["Cancer_Cell_Line"].values).numpy().astype("float16")

In [88]:
omics_gen_expr_train.shape, omics_gen_expr_valid.shape, omics_gen_expr_test.shape

((55371, 691), (13843, 691), (17316, 691))

In [89]:
# Combine copy number (channel 0) and gene expression (channel 1) along a new
# trailing axis, giving one two-channel value per gene and sample.
omics_gen_copy_number_gen_expr_train = np.stack(
    [omics_copy_number_train, omics_gen_expr_train], axis=-1
)

In [90]:
omics_gen_copy_number_gen_expr_train.shape

(55371, 691, 2)

In [91]:
# Combine copy number (channel 0) and gene expression (channel 1) along a new
# trailing axis, giving one two-channel value per gene and sample.
omics_gen_copy_number_gen_expr_valid = np.stack(
    [omics_copy_number_valid, omics_gen_expr_valid], axis=-1
)

In [92]:
omics_gen_copy_number_gen_expr_valid.shape

(13843, 691, 2)

In [93]:
# Combine copy number (channel 0) and gene expression (channel 1) along a new
# trailing axis, giving one two-channel value per gene and sample.
omics_gen_copy_number_gen_expr_test = np.stack(
    [omics_copy_number_test, omics_gen_expr_test], axis=-1
)

In [94]:
omics_gen_copy_number_gen_expr_test.shape

(17316, 691, 2)

In [95]:
# Methylation features: call the methylation model on the cell-line IDs of each split,
# convert the output to NumPy and store it as float16.
# NOTE(review): these arrays are not among the inputs of the network built later in
# this notebook — confirm whether they are still needed.
omics_gen_methyl_train = cancer_cell_gen_methy_model(x_train["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_gen_methyl_valid = cancer_cell_gen_methy_model(x_valid["Cancer_Cell_Line"].values).numpy().astype("float16")
omics_gen_methyl_test = cancer_cell_gen_methy_model(x_test["Cancer_Cell_Line"].values).numpy().astype("float16")

In [96]:
omics_gen_methyl_train.shape, omics_gen_methyl_valid.shape, omics_gen_methyl_test.shape

((55371, 808), (13843, 808), (17316, 808))

In [97]:
# Mutation features: unlike the other modalities this uses Model.predict in batches of
# 256 and pins execution to the CPU (presumably because the wide output would not fit
# on the GPU in one call — TODO confirm). The result is stored as float16.
# NOTE(review): these arrays are not among the inputs of the network built later in
# this notebook — confirm whether they are still needed.
with tf.device('/cpu:0'):
    omics_gen_mut_train = cancer_cell_gen_mut_model.predict(x_train["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256).astype("float16")
    omics_gen_mut_valid = cancer_cell_gen_mut_model.predict(x_valid["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256).astype("float16")
    omics_gen_mut_test = cancer_cell_gen_mut_model.predict(x_test["Cancer_Cell_Line"].values, verbose = 1, batch_size = 256).astype("float16")



In [98]:
omics_gen_mut_train.shape, omics_gen_mut_valid.shape, omics_gen_mut_test.shape

((55371, 34673), (13843, 34673), (17316, 34673))

In [99]:
smile_strings_train = x_train["Smiles"].values.reshape(-1,1)
smile_strings_valid = x_valid["Smiles"].values.reshape(-1,1)
smile_strings_test = x_test["Smiles"].values.reshape(-1,1)

In [100]:
smile_strings_train.shape, smile_strings_valid.shape, smile_strings_test.shape

((55371, 1), (13843, 1), (17316, 1))

In [101]:
selected_info_common_cell_lines = "data//cellline_list.txt"
selected_info_common_genes = "data//gene_list.txt"

In [102]:
PPI_file = "data//PPI_network.txt"

In [103]:
# Read the cell-line identifiers, one per line, with surrounding whitespace removed.
with open(selected_info_common_cell_lines) as f:
    common_cell_lines = [raw_line.strip() for raw_line in f]

In [104]:
len(common_cell_lines)

525

In [105]:
with open("data//common_genes.pickle", "rb") as f:
    common_genes = pickle.load(f)

In [106]:
len(common_genes)

691

In [107]:
len(np.unique(common_genes))

691

In [108]:
# Map every gene name to its position in common_genes.
idx_dic = {gene: position for position, gene in enumerate(common_genes)}

In [109]:
idx_dic['SUFU']

1

In [110]:
ppi_adj_info = [[] for item in common_genes] 

In [111]:
# will return for each gene what other gene is connected - PPIs
ppi_adj_info = [[] for item in common_genes] 
for line in open(PPI_file).readlines():
    gene1,gene2 = line.split('\t')[0],line.split('\t')[1]
    if (gene1 in common_genes) & (gene2 in common_genes):
        if idx_dic[gene1]<=idx_dic[gene2]:
            ppi_adj_info[idx_dic[gene1]].append(idx_dic[gene2])
            ppi_adj_info[idx_dic[gene2]].append(idx_dic[gene1])

In [112]:
len(ppi_adj_info)

691

In [113]:
len(ppi_adj_info[0])

247

In [114]:
len(common_genes)

691

In [115]:
def CelllineGraphAdjNorm(ppi_adj_info,common_genes = common_genes):
    """Build the normalized gene-gene adjacency matrix from neighbour lists.

    Args:
        ppi_adj_info: sequence in which entry ``i`` lists the column indices
            connected to node ``i``.
        common_genes: collection whose length sets the number of nodes. The
            default is the notebook-level ``common_genes`` object as it existed
            when this function was defined.

    Returns:
        The output of ``NormalizeAdj`` applied to the dense 0/1 adjacency matrix.

    Raises:
        AssertionError: if the assembled adjacency matrix is not symmetric.
    """
    node_count = len(common_genes)
    dense_adj = np.zeros((node_count, node_count), dtype='float32')
    for row, neighbours in enumerate(ppi_adj_info):
        for col in neighbours:
            dense_adj[row, col] = 1

    # Sanity check: the matrix is expected to be symmetric (equal to its transpose).
    assert np.allclose(dense_adj, dense_adj.T)
    return NormalizeAdj(dense_adj)

In [116]:
def NormalizeAdj(adj):
    """Return the degree-normalized version of ``adj`` with self-loops added.

    The identity is added to ``adj``, each row sum of that matrix is raised to
    the power -0.5, and the result ``(A_hat @ D).T @ D`` is returned, where
    ``A_hat`` is the matrix with self-loops and ``D`` the diagonal matrix of
    those scaled row sums.

    Args:
        adj: square 2-D adjacency matrix.

    Returns:
        A dense matrix of the same shape as ``adj``.
    """
    with_self_loops = adj + np.eye(adj.shape[0])
    inv_sqrt_degree = np.power(np.array(with_self_loops.sum(1)), -0.5).flatten()
    degree_scaling = np.diag(inv_sqrt_degree)
    scaled_columns = with_self_loops.dot(degree_scaling)
    return scaled_columns.transpose().dot(degree_scaling)

In [118]:
ppi_adj = CelllineGraphAdjNorm(ppi_adj_info,common_genes)

In [119]:
ppi_adj.shape

(691, 691)

In [120]:
ppi_adj

array([[0.00403226, 0.00546522, 0.00684739, ..., 0.        , 0.003636  ,
        0.00582104],
       [0.00546522, 0.00740741, 0.        , ..., 0.        , 0.00492814,
        0.00788968],
       [0.00684739, 0.        , 0.01162791, ..., 0.        , 0.00617449,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00510204, 0.        ,
        0.00654785],
       [0.003636  , 0.00492814, 0.00617449, ..., 0.        , 0.00327869,
        0.        ],
       [0.00582104, 0.00788968, 0.        , ..., 0.00654785, 0.        ,
        0.00840336]])

In [121]:
ppi_adj = np.expand_dims(ppi_adj,0)

In [122]:
ppi_adj.shape

(1, 691, 691)

In [123]:
# Propagate every sample's per-gene (copy number, expression) values over the
# normalized PPI graph. ppi_adj has shape (1, 691, 691), so the matrix product
# broadcasts over the sample axis and the output keeps the (n_samples, 691, 2) shape.
omics_gen_copy_number_gen_expr_train_new = (ppi_adj@omics_gen_copy_number_gen_expr_train)
omics_gen_copy_number_gen_expr_valid_new = (ppi_adj@omics_gen_copy_number_gen_expr_valid)
omics_gen_copy_number_gen_expr_test_new = (ppi_adj@omics_gen_copy_number_gen_expr_test)

In [124]:
omics_gen_copy_number_gen_expr_train_new.shape, omics_gen_copy_number_gen_expr_valid_new.shape, omics_gen_copy_number_gen_expr_test_new.shape

((55371, 691, 2), (13843, 691, 2), (17316, 691, 2))

In [125]:
# Channel 0 of the propagated arrays is copy number; slicing with a range (not an
# integer) keeps the trailing axis, so each result has shape (n_samples, n_genes, 1).
copy_number_train = omics_gen_copy_number_gen_expr_train_new[..., :1]
copy_number_valid = omics_gen_copy_number_gen_expr_valid_new[..., :1]
copy_number_test = omics_gen_copy_number_gen_expr_test_new[..., :1]

In [126]:
copy_number_train.shape, copy_number_valid.shape, copy_number_test.shape

((55371, 691, 1), (13843, 691, 1), (17316, 691, 1))

In [127]:
# Channel 1 of the propagated arrays is gene expression; slicing with a range (not an
# integer) keeps the trailing axis, so each result has shape (n_samples, n_genes, 1).
gene_expr_train = omics_gen_copy_number_gen_expr_train_new[..., 1:2]
gene_expr_valid = omics_gen_copy_number_gen_expr_valid_new[..., 1:2]
gene_expr_test = omics_gen_copy_number_gen_expr_test_new[..., 1:2]

In [128]:
gene_expr_train.shape, gene_expr_valid.shape, gene_expr_test.shape

((55371, 691, 1), (13843, 691, 1), (17316, 691, 1))

In [129]:
# Bundle the validation inputs (in the same order as the model's four inputs) together
# with the validation targets as [inputs, targets]; the next cell pickles this bundle.
valid_items = [[ valid_gcn_feats, valid_adj_list,
                           copy_number_valid, gene_expr_valid], y_valid]

In [130]:
with open("data//valid_items.pickle", "wb") as f:
    pickle.dump(valid_items, f)

In [131]:
# Drug branch: two graph-convolution-style layers over each drug's graph, followed by
# average pooling over the node axis to get one embedding vector per drug.
# Inputs per sample: a (100, 75) feature matrix and a (100, 100) adjacency matrix.
input_gcn_features = tf.keras.layers.Input(shape = (100, 75))
input_norm_adj_mat = tf.keras.layers.Input(shape = (100, 100))
# Dot(1) contracts axis 1 of both inputs, i.e. it computes A^T X per sample.
# NOTE(review): this equals A X only if the adjacency matrices are symmetric — confirm.
mult_1 = tf.keras.layers.Dot(1)([input_norm_adj_mat, input_gcn_features])
dense_layer_gcn = tf.keras.layers.Dense(256, activation = "relu")
dense_out = dense_layer_gcn(mult_1)
dense_out = tf.keras.layers.BatchNormalization()(dense_out)
dense_out = tf.keras.layers.Dropout(0.1)(dense_out)
# Second propagation step over the same adjacency, then a narrower Dense layer.
mult_2 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
dense_layer_gcn = tf.keras.layers.Dense(128, activation = "relu")
dense_out = dense_layer_gcn(mult_2)
dense_out = tf.keras.layers.BatchNormalization()(dense_out)
dense_out = tf.keras.layers.Dropout(0.1)(dense_out)

# Disabled third propagation layer, kept for reference:
# dense_layer_gcn = tf.keras.layers.Dense(100, activation = "relu")
# mult_3 = tf.keras.layers.Dot(1)([input_norm_adj_mat, dense_out])
# dense_out = dense_layer_gcn(mult_3)
# dense_out = tf.keras.layers.BatchNormalization()(dense_out)
# dense_out = tf.keras.layers.Dropout(0.2)(dense_out)

# Average over the node axis: (batch, 100, 128) -> (batch, 128).
dense_out = tf.keras.layers.GlobalAvgPool1D()(dense_out)

In [133]:
# Cell-line branches for copy number (CNV) and gene expression. Both branches have the
# same layout: two linear Dense layers, four Dense(256, relu) + BatchNorm + Dropout
# stages, then average pooling over the gene axis. Despite the "gcn" in the variable
# names, no adjacency multiplication happens inside these branches; Dense acts on the
# last axis, i.e. independently per gene. The PPI propagation was applied to the input
# arrays beforehand (ppi_adj @ ...).
dropout1 = 0.10
dropout2 = 0.20  # NOTE(review): not referenced anywhere in this cell
# first add the CNV
# Input: one scalar per gene, shape (n_genes, 1).
input_cnv = tf.keras.layers.Input(shape = (omics_gen_expr_train.shape[1],1))
    
# Linear (no activation) per-gene embedding: 1 -> 32 -> 128 features.
l1 = tf.keras.layers.Dense(32)(input_cnv)
l1 = tf.keras.layers.Dropout(dropout1)(l1)
l2 = tf.keras.layers.Dense(128)(l1)
l2 = tf.keras.layers.Dropout(dropout1)(l2)
    
# Four identical Dense(256, relu) -> BatchNorm -> Dropout stages; dense_layer_gcn1 is
# rebound to a fresh layer each time, so no weights are shared between stages.
dense_layer_gcn1 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_cnv = dense_layer_gcn1(l2)
dense_out_cnv = tf.keras.layers.BatchNormalization()(dense_out_cnv)
dense_out_cnv = tf.keras.layers.Dropout(dropout1)(dense_out_cnv)
# mult_21 = tf.keras.layers.Dot(1)([const_input, dense_out1])
dense_layer_gcn1 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_cnv = dense_layer_gcn1(dense_out_cnv)
dense_out_cnv = tf.keras.layers.BatchNormalization()(dense_out_cnv)
dense_out_cnv = tf.keras.layers.Dropout(dropout1)(dense_out_cnv)
dense_layer_gcn1 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_cnv = dense_layer_gcn1(dense_out_cnv)
dense_out_cnv = tf.keras.layers.BatchNormalization()(dense_out_cnv)
dense_out_cnv = tf.keras.layers.Dropout(dropout1)(dense_out_cnv)
dense_layer_gcn1 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_cnv = dense_layer_gcn1(dense_out_cnv)
dense_out_cnv = tf.keras.layers.BatchNormalization()(dense_out_cnv)
dense_out_cnv = tf.keras.layers.Dropout(dropout1)(dense_out_cnv)
# Average over the gene axis: (batch, n_genes, 256) -> (batch, 256).
dense_out_cnv = tf.keras.layers.GlobalAvgPool1D()(dense_out_cnv)

# now add the gene expr
# Same layout as the CNV branch, with its own layers (no weights shared with CNV).
input_gene_expr = tf.keras.layers.Input(shape = (omics_gen_expr_train.shape[1],1))
    
l11 = tf.keras.layers.Dense(32)(input_gene_expr)
l11 = tf.keras.layers.Dropout(dropout1)(l11)
l21 = tf.keras.layers.Dense(128)(l11)
l21 = tf.keras.layers.Dropout(dropout1)(l21)
    
dense_layer_gcn2 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_expr = dense_layer_gcn2(l21)
dense_out_expr = tf.keras.layers.BatchNormalization()(dense_out_expr)
dense_out_expr = tf.keras.layers.Dropout(dropout1)(dense_out_expr)
# mult_21 = tf.keras.layers.Dot(1)([const_input, dense_out1])
dense_layer_gcn2 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_expr = dense_layer_gcn2(dense_out_expr)
dense_out_expr = tf.keras.layers.BatchNormalization()(dense_out_expr)
dense_out_expr = tf.keras.layers.Dropout(dropout1)(dense_out_expr)
dense_layer_gcn2 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_expr = dense_layer_gcn2(dense_out_expr)
dense_out_expr = tf.keras.layers.BatchNormalization()(dense_out_expr)
dense_out_expr = tf.keras.layers.Dropout(dropout1)(dense_out_expr)
dense_layer_gcn2 = tf.keras.layers.Dense(256, activation = "relu")
dense_out_expr = dense_layer_gcn2(dense_out_expr)
dense_out_expr = tf.keras.layers.BatchNormalization()(dense_out_expr)
dense_out_expr = tf.keras.layers.Dropout(dropout1)(dense_out_expr)
# Average over the gene axis: (batch, n_genes, 256) -> (batch, 256).
dense_out_expr = tf.keras.layers.GlobalAvgPool1D()(dense_out_expr)

In [134]:
all_omics = tf.keras.layers.Concatenate()([ dense_out_cnv, dense_out_expr, dense_out])

In [136]:
# Regression head on the concatenated CNV, expression and drug embeddings:
# three tanh Dense layers (256 -> 128 -> 10) with dropout after the first two.
x = tf.keras.layers.Dense(256,activation = 'tanh')(all_omics)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(128,activation = 'tanh')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(10,activation = 'tanh')(x)

In [137]:
final_out_layer = tf.keras.layers.Dense(1)

In [138]:
final_out = final_out_layer(x)

In [139]:
simplegcn = tf.keras.models.Model([input_gcn_features, input_norm_adj_mat, input_cnv, input_gene_expr], final_out)

In [140]:
simplegcn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 691, 1)]     0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 691, 1)]     0           []                               
                                                                                                  
 dense_2 (Dense)                (None, 691, 32)      64          ['input_3[0][0]']                
                                                                                                  
 dense_8 (Dense)                (None, 691, 32)      64          ['input_4[0][0]']                
                                                                                              

In [141]:
# Regression setup: mean-squared-error loss, with RMSE reported as the metric.
# `learning_rate` replaces the deprecated `lr` alias (the cause of the warning this
# cell used to emit). `epsilon=None` and the legacy `decay=0.0` keyword are dropped so
# that Adam falls back to its own defaults for both; the explicit beta/amsgrad values
# are kept unchanged.
simplegcn.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        amsgrad=False,
    ),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

  super().__init__(name, **kwargs)


In [144]:
%%time
# Train on the four model inputs (drug features, drug adjacency, CNV, expression).
# Changes from the previous call, none of which alter the hyper-parameters:
#   * validation_data is passed as the documented (inputs, targets) tuple;
#   * the validation targets get the same (n, 1) shape as the training targets;
#   * callbacks is passed as a list.
# NOTE(review): patience (20) exceeds epochs (5), so early stopping cannot trigger in
# this run; depending on the Keras version restore_best_weights may then never restore
# the best epoch. Raise epochs or lower patience if best-epoch weights are wanted.
history = simplegcn.fit(
    [train_gcn_feats, train_adj_list, copy_number_train, gene_expr_train],
    y_train.reshape(-1, 1),
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(
        [valid_gcn_feats, valid_adj_list, copy_number_valid, gene_expr_valid],
        y_valid.reshape(-1, 1),
    ),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=20,
            restore_best_weights=True,
            mode="min",
        )
    ],
    validation_batch_size=512,
    shuffle=True,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 3min 53s, sys: 16.5 s, total: 4min 10s
Wall time: 3min 18s


In [145]:
# Persist the trained model under models/; the log below reports an assets directory
# written inside that path.
simplegcn.save("models//simple_gcn_new_splits")

INFO:tensorflow:Assets written to: models//simple_gcn_new_splits/assets


INFO:tensorflow:Assets written to: models//simple_gcn_new_splits/assets
