In [1]:
# Necessary imports
%load_ext autoreload
%autoreload 2\

import networkx as nx
import numpy as np
import tensorflow as tf

from keras import backend as K
from keras.layers import Dense
from keras.models import Model, Sequential


from TCGAMultiOmics.multiomics import MultiOmicsData
from moge.network.heterogeneous_network import HeterogeneousNetwork


Using TensorFlow backend.


#  Import network from file

In [2]:
import pickle

# WRITE
# with open('moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_network_test_05_val_01_seed_0.pickle', 'wb') as file:
#     pickle.dump(network, file)

# READ
with open('moge/data/lncRNA_miRNA_mRNA/lncRNA-miRNA-mRNA_network_new.pickle', 'rb') as file:
# with open('moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_network_biogrid.pickle', 'rb') as file:
    network = pickle.load(file)
#     network.remove_extra_nodes()
#     network.node_list = network.all_nodes
#     node_list = network.node_list

In [3]:
for u,v,d in network.G.edges(data=True):
    if d["type"] == 'u_n':
        d['weight']+=1e-8

In [4]:
# READ edgelists
# with open('moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_network_test_05_val_01_seed_0_test_edges.pickle', 'rb') as file:
#     test_edges_dict = pickle.load(file)
    
# with open('moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_network_test_05_val_01_seed_0_val_edges.pickle', 'rb') as file:
#     val_edges_dict = pickle.load(file)

# Load training data

In [5]:
# X, y = network.multi_omics_data.load_data(modalities=["MIR", "GE"])

In [6]:
# network.multi_omics_data.external_data_path = "/home/jonny/PycharmProjects/Bioinformatics_ExternalData/"

In [7]:
# X["MIR"].shape

# Training Source Target Graph Embedding

In [8]:
from keras.layers import Input, Conv1D, Lambda, Dot, Dense, Flatten, MaxPooling1D, Lambda, Convolution1D, Layer
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras import backend as K
import keras

from keras.optimizers import SGD, Adam, RMSprop
from keras.losses import binary_crossentropy
from keras.metrics import kullback_leibler_divergence, binary_crossentropy, binary_accuracy

from keras.utils import to_categorical

from keras.callbacks import TensorBoard

def W_init(shape, name=None):
    """Initialize weights as in paper"""
    values = np.random.normal(loc=0,scale=1e-2,size=shape)
    return K.variable(values,name=name)
#//TODO: figure out how to initialize layer biases in keras.
def b_init(shape, name=None):
    """Initialize bias as in paper"""
    values=np.random.normal(loc=0.5,scale=1e-2,size=shape)
    return K.variable(values,name=name)

In [9]:
K.clear_session()
tf.reset_default_graph()
# sess.close()
sess = tf.InteractiveSession()

In [10]:
# INPUT PARAMETERS
max_length = 500
input_shape = (None, 6)
batch_size = 512

_d = 512

In [11]:
# Inputs
E_ij = Input(batch_shape=(batch_size, 1), name="E_ij")
input_seq_i = Input(batch_shape=(batch_size, *input_shape), name="input_seq_i")
input_seq_j = Input(batch_shape=(batch_size, *input_shape), name="input_seq_j")
is_directed = Input(batch_shape=(batch_size, 1), dtype=tf.bool, name="is_directed")


In [12]:
# Distance functions
def euclidean_distance(inputs):
    x, y, _ = inputs
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

def switch_emb(inputs):
    emb_i, emb_j, is_directed = inputs
    return K.switch(is_directed, 
                    [emb_i[:, 0:int(_d/2)] - emb_j[:, int(_d/2):_d]], [emb_i, emb_j]) 

def st_euclidean_distance(inputs):
    emb_i, emb_j, is_directed = inputs
    sum_directed = K.sum(K.square(emb_i[:, 0:int(_d/2)] - emb_j[:, int(_d/2):_d]), axis=1, keepdims=True)
    sum_undirected = K.sum(K.square(emb_i - emb_j), axis=1, keepdims=True)
    sum_switch = K.switch(is_directed, sum_directed, sum_undirected)
    return K.sqrt(K.maximum(sum_switch, K.epsilon()))

def st_embedding_probability(inputs):
    emb_i, emb_j, is_directed = inputs
    dot_directed = Dot(axes=1)([emb_i[:, 0:int(_d/2)], emb_j[:, int(_d/2):_d]])
    dot_undirected = Dot(axes=1)([emb_i, emb_j])
    return K.switch(is_directed, K.sigmoid(dot_directed), K.sigmoid(dot_undirected))

def st_embedding_probability_w_dense(inputs):
    emb_i, emb_j, is_directed = inputs
    directed = Dense(1, activation='sigmoid')(Dot(axes=1)([emb_i[:, 0:int(_d/2)], emb_j[:, int(_d/2):_d]]))
    undirected = Dense(1, activation='sigmoid')(Dot(axes=1)([emb_i, emb_j]))
    return K.switch(is_directed, directed, undirected)

def st_l1_distance(inputs):
    emb_i, emb_j, is_directed = inputs
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    directed_distance = Dense(1, activation='sigmoid')(L1_layer([emb_i[:, 0:int(_d/2)], emb_j[:, int(_d/2):_d]]))
    undirected_distance = Dense(1, activation='sigmoid')(L1_layer([emb_i, emb_j]))
    
    return K.switch(is_directed, directed_distance, undirected_distance)

In [13]:
from keras.layers import LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

def create_base_network(input_shape):
    """ Base network to be shared (eq. to feature extraction).
    """
    input = Input(shape=input_shape)
#     x = Flatten()(input)
    x = Convolution1D(filters=320, kernel_size=26, input_shape=input_shape, activation='relu')(input)
    print("conv1d", x)
    x = MaxPooling1D(pool_size=13, strides=13)(x) # Similar to DanQ Model
    print("max pooling", x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(320, return_sequences=False, return_state=False))(x)
    print("brnn", x)
    x = Dropout(0.5)(x)
#     x = GlobalMaxPooling1D()(x)
#     print("GAP pooling", x)
    
    x = Dense(75*640, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(925, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(_d, activation='sigmoid')(x) # Embedding space
    return Model(input, x)

In [14]:
# Loss function
def contrastive_loss(y_true, y_pred):
    ''' Contrastive loss from Hadsell-et-al.'06 
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1.0
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def regularized_cross_entropy(y_true, y_pred):
    return K.mean(y_true * K.log(y_pred) + (1 - y_true) * K.log(1 - y_pred))

def kl_loss(y_true, y_pred):
    return -K.mean(y_true * K.log(y_pred))

# Metrics
def accuracy(y_true, y_pred):
    ''' Compute classification accuracy with a fixed threshold on distances.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))

In [15]:
# build create_base_network to use in each siamese 'leg'
lstm_network = create_base_network(input_shape=(max_length, 6))

print("lstm_network.input", lstm_network.input)
print("lstm_network.output", lstm_network.output)
print("input_seq_i", input_seq_i)
print("input_seq_j", input_seq_j)

# encode each of the two inputs into a vector with the convnet
encoded_i = lstm_network(input_seq_i)
encoded_j = lstm_network(input_seq_j)
print("encoded_i", encoded_i, "\nencoded_j", encoded_j)

distance = Lambda(st_euclidean_distance)([encoded_i, encoded_j, is_directed])
print("distance", distance)

siamese_net = Model(inputs=[input_seq_i, input_seq_j, is_directed], outputs=distance)

conv1d Tensor("conv1d_1/Relu:0", shape=(?, 475, 320), dtype=float32)
max pooling Tensor("max_pooling1d_1/Squeeze:0", shape=(?, 36, 320), dtype=float32)
brnn Tensor("bidirectional_1/concat:0", shape=(?, 640), dtype=float32)
lstm_network.input Tensor("input_1:0", shape=(?, 500, 6), dtype=float32)
lstm_network.output Tensor("dense_3/Sigmoid:0", shape=(?, 128), dtype=float32)
input_seq_i Tensor("input_seq_i:0", shape=(512, 500, 6), dtype=float32)
input_seq_j Tensor("input_seq_j:0", shape=(512, 500, 6), dtype=float32)
encoded_i Tensor("model_1/dense_3/Sigmoid:0", shape=(512, 128), dtype=float32) 
encoded_j Tensor("model_1_1/dense_3/Sigmoid:0", shape=(512, 128), dtype=float32)
distance Tensor("lambda_1/Sqrt:0", shape=(512, 1), dtype=float32)


In [16]:
#//TODO: get layerwise learning rates and momentum annealing scheme described in paperworking
siamese_net.compile(loss=contrastive_loss, 
                    optimizer=RMSprop(),
                    metrics=[accuracy])

siamese_net.count_params()


76978653

In [17]:
# Tensorboard
# tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

# Data Generator

In [18]:
from moge.network.data_generator import DataGenerator

generator = DataGenerator(network=network, get_training_data=False, negative_sampling_ratio=2.0,
                          maxlen=max_length, padding='post', truncating="post",
                          batch_size=batch_size, dim=input_shape, 
                          shuffle=True, seed=0)

X, y = generator.__getitem__(0)
print("X:", [(k, v.shape) for k, v in X.items()], "\ny:", y.shape)

Genes info columns: Index(['Transcript sequence', 'Disease association'], dtype='object')
Number of nodes without seq removed: 2572
num_words: None {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'U': 5}
Ed_count: 446712 , Eu_count: 828043 , En_count: 733188
Ens_count: 1816322
X: [('input_seq_j', (512, 500, 6)), ('input_seq_i', (512, 500, 6)), ('is_directed', (512, 1))] 
y: (512, 1)


# Train

In [None]:
siamese_net.fit_generator(generator, epochs=10, use_multiprocessing=True, workers=4)

Epoch 1/10
Epoch 2/10
Epoch 3/10

In [22]:
import h5py
lstm_network.save("lstm_network_contrastive_st-eucl_128.h5")

In [29]:
lstm_network.outputs

[<tf.Tensor 'dense_3/Sigmoid:0' shape=(?, 128) dtype=float32>]

# Inference

In [30]:
i = 0
# i += 1
X, y = generator.__getitem__(i)
print(lstm_network.predict(X), y)

ValueError: No data provided for "input_1". Need data for each key in: ['input_1']

In [None]:
X["input_seq_i"].shape

In [None]:
y.shape