In [1]:
# Necessary imports
%load_ext autoreload
%autoreload 2\

import networkx as nx
import numpy as np
import tensorflow as tf

from keras import backend as K
from keras.layers import Dense
from keras.models import Model, Sequential


from TCGAMultiOmics.multiomics import MultiOmicsData
from moge.network.heterogeneous_network import HeterogeneousNetwork


Using TensorFlow backend.


#  Import network from file

In [2]:
import pickle

# WRITE (disabled): how a network object was serialized to disk.
# with open('moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_network_test_05_val_01_seed_0.pickle', 'wb') as file:
#     pickle.dump(network, file)

# READ: load the pickled training network. The path is relative to the
# notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open('moge/data/lncRNA_miRNA_mRNA/lncRNA-miRNA-mRNA_network_disease_train.pickle', 'rb') as file:
# with open('moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_network_biogrid.pickle', 'rb') as file:
    network = pickle.load(file)
#     network.remove_extra_nodes()
#     network.node_list = network.all_nodes
#     node_list = network.node_list
# Cell output: number of edges in the loaded graph (`network.G`).
network.G.number_of_edges()

1936450

In [3]:
# READ edgelists
# Load the two additional pickled networks (the "test" and "val" splits).
# NOTE(review): same pickle caveat as for the training network — load trusted files only.
with open('moge/data/lncRNA_miRNA_mRNA/lncRNA-miRNA-mRNA_network_disease_test.pickle', 'rb') as file:
    network_test = pickle.load(file)
    
with open('moge/data/lncRNA_miRNA_mRNA/lncRNA-miRNA-mRNA_network_disease_val.pickle', 'rb') as file:
    network_val = pickle.load(file)

# Report the edge count of each split as a quick sanity check.
print("network_test", network_test.G.number_of_edges())
print("network_val", network_val.G.number_of_edges())

network_test 124486
network_val 24897


# Training Source Target Graph Embedding

In [4]:
from keras.layers import Input, Conv1D, Lambda, Dot, Dense, Flatten, MaxPooling1D, Lambda, Convolution1D, Layer
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras import backend as K
from keras.utils import multi_gpu_model
import keras

from keras.optimizers import SGD, Adam, RMSprop
from keras.losses import binary_crossentropy
from keras.metrics import kullback_leibler_divergence, binary_crossentropy, binary_accuracy

from keras.utils import to_categorical

from keras.callbacks import TensorBoard

def W_init(shape, name=None):
    """Create a backend variable holding Gaussian-initialized weights.

    Values are drawn from a normal distribution with mean 0 and standard
    deviation 0.01 using NumPy's global random state.

    Args:
        shape: Shape of the array to sample.
        name: Optional name forwarded to ``K.variable``.

    Returns:
        The object returned by ``K.variable`` for the sampled array.
    """
    initial_weights = np.random.normal(0, 1e-2, shape)
    return K.variable(initial_weights, name=name)
# TODO: figure out how to initialize layer biases in Keras (e.g. via a layer's `bias_initializer` argument); W_init/b_init are not passed to any layer yet.
def b_init(shape, name=None):
    """Create a backend variable holding Gaussian-initialized biases.

    Values are drawn from a normal distribution with mean 0.5 and standard
    deviation 0.01 using NumPy's global random state.

    Args:
        shape: Shape of the array to sample.
        name: Optional name forwarded to ``K.variable``.

    Returns:
        The object returned by ``K.variable`` for the sampled array.
    """
    initial_biases = np.random.normal(0.5, 1e-2, shape)
    return K.variable(initial_biases, name=name)

In [5]:
# Start from a clean slate: clear Keras' global state and TensorFlow's default graph.
K.clear_session()
tf.reset_default_graph()
# sess.close()
# sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
# Open a fresh interactive TensorFlow session.
# NOTE(review): the session is not handed to Keras (no K.set_session call is
# visible in this notebook) — confirm whether Keras is expected to use it.
sess = tf.InteractiveSession()

In [6]:
# INPUT PARAMETERS
# `max_length` is passed as `maxlen` to the DataGenerator instances below.
max_length = 700
# Per-sample shape of a sequence input, used for the Input layers and the base network.
input_shape = (None, 6)
# Fixed batch size shared by the Input layers and the data generators.
batch_size = 1024

# Width of the final Dense layer of the base network (the embedding space).
# The st_* functions below slice embeddings at int(_d/2).
_d = 512

In [7]:
# Inputs
# Every Input declares a fixed batch dimension via batch_shape=(batch_size, ...).
# NOTE(review): E_ij is declared here but is not among the inputs of the Model
# assembled later in this notebook — confirm whether it is still needed.
E_ij = Input(batch_shape=(batch_size, 1), name="E_ij")
# The two sequences of a node pair (i, j); both are fed through the same base network.
input_seq_i = Input(batch_shape=(batch_size, *input_shape), name="input_seq_i")
input_seq_j = Input(batch_shape=(batch_size, *input_shape), name="input_seq_j")
# Boolean flag per pair, consumed by the st_* functions to pick the directed or undirected branch.
is_directed = Input(batch_shape=(batch_size, 1), dtype=tf.bool, name="is_directed")


In [8]:
# Distance functions
def euclidean_distance(inputs):
    """Row-wise Euclidean distance between two batches of vectors.

    Args:
        inputs: Pair ``(x, y)`` of tensors.

    Returns:
        ``sqrt(max(sum((x - y)**2, axis=1), K.epsilon()))`` with the reduced
        axis kept; the epsilon floor keeps the square root away from zero.
    """
    first, second = inputs
    squared_sum = K.sum(K.square(first - second), axis=1, keepdims=True)
    floored = K.maximum(squared_sum, K.epsilon())
    return K.sqrt(floored)

def switch_emb(inputs):
    """Pick between a directed and an undirected combination of two embeddings.

    Args:
        inputs: Triple ``(emb_i, emb_j, is_directed)``.

    Returns:
        Whatever ``K.switch`` returns for the two branches below.

    NOTE(review): the branches handed to K.switch are Python lists of different
    lengths (one difference tensor vs. the two full embeddings). This looks
    unfinished, and the function is not called anywhere in the visible
    notebook — confirm the intent before using it.
    """
    emb_i, emb_j, is_directed = inputs
    return K.switch(is_directed, 
                    [emb_i[:, 0:int(_d/2)] - emb_j[:, int(_d/2):_d]], [emb_i, emb_j]) 

def st_euclidean_distance(inputs):
    """Source-target Euclidean distance between two embeddings.

    Where ``is_directed`` holds, the distance is taken between the first half
    of ``emb_i`` and the second half of ``emb_j``; elsewhere it is taken
    between the full vectors. The squared sum is floored at ``K.epsilon()``
    before the square root.

    Args:
        inputs: Triple ``(emb_i, emb_j, is_directed)``.

    Returns:
        Tensor of distances with the reduced axis kept.
    """
    emb_i, emb_j, is_directed = inputs
    half = int(_d / 2)
    directed_diff = emb_i[:, 0:half] - emb_j[:, half:_d]
    sq_directed = K.sum(K.square(directed_diff), axis=1, keepdims=True)
    sq_undirected = K.sum(K.square(emb_i - emb_j), axis=1, keepdims=True)
    chosen = K.switch(is_directed, sq_directed, sq_undirected)
    return K.sqrt(K.maximum(chosen, K.epsilon()))

def st_embedding_probability(inputs):
    """Sigmoid of the source-target dot product of two embeddings.

    Where ``is_directed`` holds, the dot product pairs the first half of
    ``emb_i`` with the second half of ``emb_j``; elsewhere it uses the full
    vectors.

    Args:
        inputs: Triple ``(emb_i, emb_j, is_directed)``.

    Returns:
        Result of ``K.switch`` over the two sigmoid-activated dot products.
    """
    emb_i, emb_j, is_directed = inputs
    half = int(_d / 2)
    directed_score = Dot(axes=1)([emb_i[:, 0:half], emb_j[:, half:_d]])
    undirected_score = Dot(axes=1)([emb_i, emb_j])
    prob_directed = K.sigmoid(directed_score)
    prob_undirected = K.sigmoid(undirected_score)
    return K.switch(is_directed, prob_directed, prob_undirected)

def st_embedding_probability_w_dense(inputs):
    """Source-target dot product passed through a one-unit sigmoid Dense layer.

    Args:
        inputs: Triple ``(emb_i, emb_j, is_directed)``.

    Returns:
        Result of ``K.switch``: the directed branch where ``is_directed``
        holds, otherwise the undirected branch.

    NOTE(review): two new Dense layers are instantiated on every call. If this
    function is wrapped in a Lambda layer (as st_euclidean_distance is in this
    notebook), their weights may not be tracked as trainable weights of the
    surrounding model — confirm before using. Not called in the visible notebook.
    """
    emb_i, emb_j, is_directed = inputs
    # Directed: first half of emb_i against the second half of emb_j.
    directed = Dense(1, activation='sigmoid')(Dot(axes=1)([emb_i[:, 0:int(_d/2)], emb_j[:, int(_d/2):_d]]))
    # Undirected: full vectors.
    undirected = Dense(1, activation='sigmoid')(Dot(axes=1)([emb_i, emb_j]))
    return K.switch(is_directed, directed, undirected)

def st_l1_distance(inputs):
    """Element-wise absolute difference passed through a one-unit sigmoid Dense layer.

    Args:
        inputs: Triple ``(emb_i, emb_j, is_directed)``.

    Returns:
        Result of ``K.switch``: the directed branch where ``is_directed``
        holds, otherwise the undirected branch.

    NOTE(review): a Lambda layer and two Dense layers are created on every
    call. If this function is itself wrapped in a Lambda layer, the Dense
    weights may not be tracked as trainable weights of the surrounding model —
    confirm before using. Not called in the visible notebook.
    """
    emb_i, emb_j, is_directed = inputs
    # |a - b| per element; shared by both branches.
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    # Directed: first half of emb_i against the second half of emb_j.
    directed_distance = Dense(1, activation='sigmoid')(L1_layer([emb_i[:, 0:int(_d/2)], emb_j[:, int(_d/2):_d]]))
    # Undirected: full vectors.
    undirected_distance = Dense(1, activation='sigmoid')(L1_layer([emb_i, emb_j]))
    
    return K.switch(is_directed, directed_distance, undirected_distance)

In [9]:
from keras.layers import LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

def create_base_network(input_shape):
    """Build the encoder shared by both legs of the siamese model.

    Layer stack: Conv1D -> max pooling -> dropout -> bidirectional LSTM ->
    dropout -> two ReLU Dense layers (each followed by dropout) -> linear
    Dense layer of width ``_d`` (the embedding). Intermediate tensors are
    printed as the graph is built.

    Args:
        input_shape: Per-sample input shape (batch axis excluded).

    Returns:
        A Keras ``Model`` mapping the input tensor to the embedding tensor.
    """
    seq_in = Input(shape=input_shape)
    # Disabled alternative: x = Flatten()(input)

    # Convolutional feature detector followed by down-sampling.
    conv = Convolution1D(filters=320, kernel_size=20, input_shape=input_shape,
                         activation='relu')(seq_in)
    print("conv1d_1", conv)
    pooled = MaxPooling1D(pool_size=10, padding="same")(conv)
    print("max pooling_1", pooled)

    # Disabled second convolution stage:
    #     x = Convolution1D(filters=320, kernel_size=4, activation='relu')(x)
    #     print("conv1d_2", x)
    #     x = MaxPooling1D(pool_size=13, padding="same")(x)
    #     print("max pooling_2", x)

    # Recurrent summary of the pooled sequence (final state only).
    pooled_dropped = Dropout(0.2)(pooled)
    recurrent = Bidirectional(
        LSTM(320, return_sequences=False, return_state=False)
    )(pooled_dropped)
    print("brnn", recurrent)
    recurrent_dropped = Dropout(0.5)(recurrent)

    # Fully connected head ending in the embedding space.
    hidden_wide = Dense(75 * 640, activation='relu')(recurrent_dropped)
    hidden_wide = Dropout(0.1)(hidden_wide)
    hidden_narrow = Dense(925, activation='relu')(hidden_wide)
    hidden_narrow = Dropout(0.1)(hidden_narrow)
    embedding = Dense(_d, activation='linear')(hidden_narrow)
    print("embedding", embedding)

    return Model(seq_in, embedding)

In [10]:
# Loss function
def contrastive_loss(y_true, y_pred):
    ''' Contrastive loss (Hadsell, Chopra & LeCun, 2006) with a margin of 1.
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Pairs labelled 1 are penalized by the squared predicted distance; pairs
    labelled 0 by the squared shortfall of the distance below the margin.
    The mean over all elements is returned.
    '''
    margin = 1.0
    similar_term = y_true * K.square(y_pred)
    dissimilar_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(similar_term + dissimilar_term)

def regularized_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Computes ``-mean(y * log(p) + (1 - y) * log(1 - p))``. The leading minus
    sign makes this a quantity to minimize: it is 0 for perfect predictions
    and grows as predictions get worse.

    Args:
        y_true: Tensor of target labels in [0, 1].
        y_pred: Tensor of predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor with the mean cross-entropy.
    """
    # Keep probabilities strictly inside (0, 1) so neither log() can produce
    # -inf (and 0 * -inf = NaN) for saturated predictions.
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1 - eps)
    return -K.mean(y_true * K.log(y_pred) + (1 - y_true) * K.log(1 - y_pred))

def kl_loss(y_true, y_pred):
    """Negative mean of ``y_true * log(y_pred)`` over all elements."""
    weighted_log = y_true * K.log(y_pred)
    return -K.mean(weighted_log)

# Metrics
def accuracy(y_true, y_pred):
    ''' Classification accuracy using a fixed 0.5 threshold on distances.

    A predicted distance below 0.5 counts as label 1, anything else as
    label 0; the result is the mean agreement with ``y_true``.
    '''
    predicted_labels = K.cast(y_pred < 0.5, y_true.dtype)
    matches = K.equal(y_true, predicted_labels)
    return K.mean(matches)

In [11]:
# Build the shared base network once and reuse it for each siamese 'leg'.

# The model is assembled under a CPU device scope; the printed tensors below
# show device=/device:CPU:0 accordingly.
with tf.device("/cpu:0"):
    lstm_network = create_base_network(input_shape=input_shape)

    print("lstm_network.input", lstm_network.input)
    print("lstm_network.output", lstm_network.output)
    print("input_seq_i", input_seq_i)
    print("input_seq_j", input_seq_j)

    # Encode each of the two inputs into a vector with the same network
    # instance, so both legs share weights.
    encoded_i = lstm_network(input_seq_i)
    encoded_j = lstm_network(input_seq_j)
    print("encoded_i", encoded_i, "\nencoded_j", encoded_j)

    # Model output: the source-target Euclidean distance between the two embeddings.
    distance = Lambda(st_euclidean_distance)([encoded_i, encoded_j, is_directed])
    print("distance", distance)

    # E_ij is not part of the model inputs; only the two sequences and the direction flag are.
    siamese_net = Model(inputs=[input_seq_i, input_seq_j, is_directed], outputs=distance)

conv1d_1 Tensor("conv1d_1/Relu:0", shape=(?, ?, 320), dtype=float32, device=/device:CPU:0)
max pooling_1 Tensor("max_pooling1d_1/Squeeze:0", shape=(?, ?, 320), dtype=float32, device=/device:CPU:0)
brnn Tensor("bidirectional_1/concat:0", shape=(?, 640), dtype=float32, device=/device:CPU:0)
embedding Tensor("dense_3/BiasAdd:0", shape=(?, 512), dtype=float32, device=/device:CPU:0)
lstm_network.input Tensor("input_1:0", shape=(?, ?, 6), dtype=float32, device=/device:CPU:0)
lstm_network.output Tensor("dense_3/BiasAdd:0", shape=(?, 512), dtype=float32, device=/device:CPU:0)
input_seq_i Tensor("input_seq_i:0", shape=(1024, ?, 6), dtype=float32)
input_seq_j Tensor("input_seq_j:0", shape=(1024, ?, 6), dtype=float32)
encoded_i Tensor("model_1/dense_3/BiasAdd:0", shape=(1024, 512), dtype=float32, device=/device:CPU:0) 
encoded_j Tensor("model_1_1/dense_3/BiasAdd:0", shape=(1024, 512), dtype=float32, device=/device:CPU:0)
distance Tensor("lambda_1/Sqrt:0", shape=(1024, 1), dtype=float32, device=/d

In [12]:
# Wrap the model for data-parallel execution on 4 GPUs.
# NOTE: this rebinds `siamese_net` to the wrapper, so re-running this cell
# without rebuilding the model would wrap the wrapper again.
siamese_net = multi_gpu_model(siamese_net, gpus=4, cpu_merge=True, cpu_relocation=False)

In [13]:
# TODO: get the layer-wise learning rates and momentum annealing scheme described in the paper working.
# Train with the contrastive loss and a default-configured RMSprop optimizer;
# report the threshold-based `accuracy` metric defined above.
siamese_net.compile(loss=contrastive_loss, 
                    optimizer=RMSprop(),
                    metrics=[accuracy])

# Cell output: total number of parameters in the model.
siamese_net.count_params()


77322717

In [14]:
# Tensorboard
# tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

# Data Generator

In [15]:
from moge.network.data_generator import DataGenerator

# Batch generator over the training network.
generator_train = DataGenerator(
    network=network,
    get_training_data=False,
    negative_sampling_ratio=2.0,
    maxlen=max_length,
    padding='post',
    truncating="post",
    batch_size=batch_size,
    dim=input_shape,
    shuffle=True,
    seed=0,
)

# Batch generator over the test network, configured identically.
generator_test = DataGenerator(
    network=network_test,
    get_training_data=False,
    negative_sampling_ratio=2.0,
    maxlen=max_length,
    padding='post',
    truncating="post",
    batch_size=batch_size,
    dim=input_shape,
    shuffle=True,
    seed=0,
)

# Sanity check: fetch the first batch of each generator and show the array shapes.
X, y = generator_train[0]
print("X:", [(key, arr.shape) for key, arr in X.items()], "\ny:", y.shape)
X, y = generator_test[0]
print("X:", [(key, arr.shape) for key, arr in X.items()], "\ny:", y.shape)

Genes info columns: ['locus_type', 'location', 'Family', 'Transcript sequence', 'Disease association', 'Transcript length']
Number of nodes without seq removed: 2572
num_words: None {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'U': 5}
Ed_count: 419905 , Eu_count: 425377 , En_count: 688212
Ens_count: 839810
Genes info columns: ['locus_type', 'location', 'Family', 'Transcript sequence', 'Disease association', 'Transcript length']
Number of nodes without seq removed: 2572
num_words: None {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'U': 5}
Ed_count: 22332 , Eu_count: 49716 , En_count: 36895
Ens_count: 44664
X: [('input_seq_j', (1024, 700, 6)), ('input_seq_i', (1024, 700, 6)), ('is_directed', (1024, 1))] 
y: (1024, 1)
X: [('input_seq_j', (1024, 700, 6)), ('input_seq_i', (1024, 700, 6)), ('is_directed', (1024, 1))] 
y: (1024, 1)


# Train

In [None]:
# Train for 10 epochs on generator_train. generator_test is passed as the
# validation data, and batches are prepared by 8 workers with multiprocessing
# enabled. The returned History object is kept for the plots below.
history = siamese_net.fit_generator(generator_train, epochs=10, validation_data=generator_test,
                          use_multiprocessing=True, workers=8)

In [None]:
# summarize history for accuracy
# `plt` is not imported anywhere earlier in the notebook, so import it here.
import matplotlib.pyplot as plt

# The metric passed to compile() is the custom function `accuracy`, which Keras
# logs under the function's name; fall back to 'acc' in case a built-in
# accuracy metric is used instead.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'

plt.figure(figsize=(15,15))
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# 'test' refers to generator_test, which was used as validation data during training.
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# summarize history for loss
# This cell was a verbatim copy of the accuracy plot; it now shows the loss
# curves, which Keras always records as 'loss' / 'val_loss'.
# `plt` is not imported anywhere earlier in the notebook, so import it here.
import matplotlib.pyplot as plt

plt.figure(figsize=(15,15))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# 'test' refers to generator_test, which was used as validation data during training.
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Save Model

In [None]:
# Save the shared encoder to an HDF5 file in the working directory.
# The siamese model itself is not saved here.
lstm_network.save("lstm_network_contrastive_eucl_512_linear_varlen_disease.h5")

In [None]:
# Print the layer-by-layer summary of the model currently bound to `siamese_net`.
siamese_net.summary()

In [None]:
# Print the layer-by-layer summary of the shared encoder.
lstm_network.summary()

# Evaluate on held-out data (the `val` split; the `test` split was used as validation data during training)

In [None]:
# Batch generator over the `val` network, configured like the train/test generators.
generator_val = DataGenerator(network=network_val, get_training_data=False, negative_sampling_ratio=2.0,
                          maxlen=max_length, padding='post', truncating="post",
                          batch_size=batch_size, dim=input_shape, 
                          shuffle=True, seed=0)

In [None]:
# Cell output: the compiled loss and metric evaluated over generator_val,
# with batches prepared by 8 workers and multiprocessing enabled.
siamese_net.evaluate_generator(generator_val, use_multiprocessing=True, workers=8)

In [None]:
%%javascript
// Asks the Jupyter front end to delete this notebook's session.
// NOTE(review): cells below this one presumably need a fresh kernel with the model and generators re-created — confirm.
Jupyter.notebook.session.delete();

# Inference

In [None]:
# Show NumPy arrays with 2 decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)

In [None]:
i = 0
# i += 1
# The name `generator` is never defined in this notebook; use one of the
# generators that exist. generator_train is built from `network`; swap in
# generator_test / generator_val to inspect held-out pairs instead.
X, y = generator_train[i]
print("X:", [(k, v.shape) for k, v in X.items()], "\ny:", y.shape)
# Cell output: (predicted distance, label, is_directed) for every pair in the batch.
list(zip(siamese_net.predict_on_batch(X).flatten().tolist(), 
         y.flatten().tolist(), 
         X["is_directed"].tolist()))

In [None]:
# The name `generator` is never defined in this notebook; use the generator
# built from the training network to fetch the sequence data for every index
# of its node list.
seqs = generator_train.get_sequence_data(range(len(generator_train.node_list)))

In [None]:
# Run the shared encoder on the collected sequences to obtain their embeddings.
embs = lstm_network.predict(seqs)

In [None]:
# Cell output: shape of the embedding array.
embs.shape