In [2]:
#!/usr/bin/env python
#This script is used to train the model
import click as ck
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
import math
from collections import deque

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Dense, Embedding, Conv1D, Flatten, Concatenate,
    MaxPooling1D, Dropout, Maximum, Layer,LSTM, Dense,TimeDistributed,experimental,Bidirectional
)
# run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)
# from tensorflow.keras.layers.experimental import (RandomFourierFeatures)
# from tensorflow.keras.layers import (
#     Input, Dense, Embedding, Conv1D, Flatten, Concatenate,
#     MaxPooling1D, Dropout,LSTM, Dense,TimeDistributed,
# )
from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from tensorflow.python.ops import math_ops
from tensorflow.keras import regularizers

from sklearn.metrics import roc_curve, auc, matthews_corrcoef
from aminoacids import MAXLEN, to_onehot
from utils import Ontology, FUNC_DICT, is_exp_code

# from kerastuner.tuners import RandomSearch
# from kerastuner import HyperModel

logging.basicConfig(level=logging.DEBUG)
run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
print("GPU Available: ", tf.test.is_gpu_available())




Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


GPU Available:  False


In [18]:
ff=7000
n1=int (ff*0.7)
n2=int(ff*0.2)
n3=int(ff*0.1)


In [19]:
class HPOLayer(Layer):

    def __init__(self, nb_classes, **kwargs):
        self.nb_classes = nb_classes
        self.hpo_matrix = np.zeros((nb_classes, nb_classes), dtype=np.float32)
        super(HPOLayer, self).__init__(**kwargs)

    def set_hpo_matrix(self, hpo_matrix):
        self.hpo_matrix = hpo_matrix

    def get_config(self):
        config = super(HPOLayer, self).get_config()
        config['nb_classes'] = self.nb_classes
        return config
    
    def build(self, input_shape):
        assert input_shape[1] == self.nb_classes
        self.kernel = K.variable(
            self.hpo_matrix, name='{}_kernel'.format(self.name))
        self.non_trainable_weights.append(self.kernel)
        super(HPOLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, x):
        x = tf.keras.backend.repeat(x, self.nb_classes)
        return tf.math.multiply(x, self.kernel)

    def compute_output_shape(self, input_shape):
        return [input_shape[0], self.nb_classes, self.nb_classes] 


In [20]:





def main(hp_file, data_file, terms_file, gos_file, model_file,
         out_file, fold, batch_size, epochs, load, logger_file, threshold,
         device):
    gos_df = pd.read_pickle(gos_file)
#     gos_df = gos_df.iloc[:2000,:] 
    gos = gos_df['gos'].values.flatten()
    gos_dict = {v: i for i, v in enumerate(gos)}

    # cross validation settings
    # model_file = f'fold{fold}_' + model_file
    # out_file = f'fold{fold}_' + out_file
    params = {
        'input_shape': (len(gos),),
        'nb_layers': 1,
        'loss': 'binary_crossentropy',
        'rate': 0.3,
        'learning_rate': 0.001,
        'units': 1500, # 750
        'model_file': model_file
    }
    
    print('Params:', params)
    global hpo
    hpo = Ontology(hp_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms_df = terms_df.iloc[:2600,:] 
    global terms
    terms = terms_df['terms'].values.flatten()
    print('Phenotypes', len(terms))
    global term_set
    term_set = set(terms)
    train_df, valid_df, test_df = load_data(data_file, terms, fold)
    terms_dict = {v: i for i, v in enumerate(terms)}
    hpo_matrix = get_hpo_matrix(hpo, terms_dict)
    nb_classes = len(terms)
    params['nb_classes'] = nb_classes
    print(len(terms_dict))
    test_steps = int(math.ceil(len(test_df) / batch_size))
    test_generator = DFGenerator(test_df, gos_dict, terms_dict,
                                 batch_size)
    valid_steps = int(math.ceil(len(valid_df) / batch_size))
    train_steps = int(math.ceil(len(train_df) / batch_size))

    xy_generator = DFGenerator(train_df, gos_dict, terms_dict,
                                  len(train_df))
    x, y = xy_generator[0]
    val_generator = DFGenerator(valid_df, gos_dict, terms_dict,
                                  len(valid_df))
    val_x, val_y = val_generator[0]

    train_generator = DFGenerator(train_df, gos_dict, terms_dict,
                                  batch_size)
    valid_generator = DFGenerator(valid_df, gos_dict, terms_dict,
                                  batch_size)
    
    with tf.device(device):
        if load:
            print('Loading pretrained model')
            model = load_model(model_file, custom_objects={'HPOLayer': HPOLayer})
            flat_model = load_model(model_file + '_flat.h5')
        else:
            print('Creating a new model')
            flat_model = MyHyperModel(params)
            # flat_model = create_flat_model(params)

            print("Training data size: %d" % len(train_df))
            print("Validation data size: %d" % len(valid_df))
            checkpointer = ModelCheckpoint(
                filepath=model_file + '_flat.h5',
                verbose=1, save_best_only=True)
            earlystopper = EarlyStopping(monitor='val_loss', patience=6, verbose=1)
            logger = CSVLogger(logger_file)

            # print('Starting training the flat model')
            # flat_model.summary()
            # flat_model.fit(
            #     train_generator,
            #     steps_per_epoch=train_steps,
            #     epochs=epochs,
            #     validation_data=valid_generator,
            #     validation_steps=valid_steps,
            #     max_queue_size=batch_size,
            #     workers=12,
            #     callbacks=[checkpointer, earlystopper])

            tuner = RandomSearch(
                flat_model,
                objective='val_loss',
                max_trials=1,
                directory='data-cafa',
                project_name='pheno')
            tuner.search(
                x, y, epochs=3, validation_data=(val_x, val_y),
                callbacks=[earlystopper])
            tuner.results_summary()
            logging.info('Loading best model')
            flat_model = tuner.get_best_models(num_models=1)[0]
            flat_model.summary()
            loss = flat_model.evaluate(val_x, val_y)
            print('Valid loss %f' % loss)
            flat_model.save(model_file + '_flat.h5')

            model = create_model(params, hpo_matrix)

            checkpointer = ModelCheckpoint(
                filepath=model_file,
                verbose=1, save_best_only=True)
            model.summary()
            print('Starting training the flat model')
            model.fit(
                train_generator,
                steps_per_epoch=train_steps,
                epochs=epochs,
                validation_data=valid_generator,
                validation_steps=valid_steps,
                max_queue_size=batch_size,
                workers=12,
                callbacks=[logger, checkpointer, earlystopper])

            logging.info('Loading best model')
            model = load_model(model_file, custom_objects={'HPOLayer': HPOLayer})
            flat_model = load_model(model_file + '_flat.h5')
            
        logging.info('Evaluating model')
        loss = flat_model.evaluate(test_generator, steps=test_steps)
#         print('Flat Test loss %f' % loss)
        loss = model.evaluate(test_generator, steps=test_steps)
#         print('Test loss %f' % loss)

        logging.info('Predicting')
        preds = model.predict(test_generator, steps=test_steps, verbose=1)
        flat_preds = flat_model.predict(test_generator, steps=test_steps, verbose=1)
        all_terms_df = pd.read_pickle(terms_file)
        all_terms = all_terms_df['terms'].values
        all_terms_dict = {v:k for k,v in enumerate(all_terms)}
        all_labels = np.zeros((len(test_df), len(all_terms)), dtype=np.int32)
        for i, row in enumerate(test_df.itertuples()):
            for hp_id in row.hp_annotations:
                if hp_id in all_terms_dict:
                    all_labels[i, all_terms_dict[hp_id]] = 1
        
        all_preds = np.zeros((len(test_df), len(all_terms)), dtype=np.float32)
        all_flat_preds = np.zeros((len(test_df), len(all_terms)), dtype=np.float32)
        for i in range(len(test_df)):
            for j in range(nb_classes):
                all_preds[i, all_terms_dict[terms[j]]] = preds[i, j]
                all_flat_preds[i, all_terms_dict[terms[j]]] = flat_preds[i, j]
        logging.info('Computing performance:')
        roc_auc = compute_roc(all_labels, all_preds)
        print('ROC AUC: %.2f' % (roc_auc,))
        flat_roc_auc = compute_roc(all_labels, all_flat_preds)
        print('FLAT ROC AUC: %.2f' % (flat_roc_auc,))
        test_df['preds'] = list(preds)
        print(test_df)
        logging.info('Saving predictions')
        test_df.to_pickle(out_file)

        test_df['preds'] = list(flat_preds)
        test_df.to_pickle(out_file + '_flat.pkl')



In [21]:
def compute_roc(labels, preds):
    # Compute ROC curve and ROC area for each class
    fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
    roc_auc = auc(fpr, tpr)

    return roc_auc

def load_data(data_file, terms, fold=1):
    df = pd.read_pickle(data_file)
    # Split train/valid
    n = len(df)
    n1=int (n*0.5)
    n2=int(n*0.4)
    n3=int(n*0.1)
    index = np.arange(n)
    np.random.seed(seed=10)
    np.random.shuffle(index)
    index = list(index)
    train_index = []
    test_index = []
    print()
    # fn = n / 5
    # # 5 fold cross-validation
    # for i in range(1, 6):
    #     start = int((i - 1) * fn)
    #     end = int(i * fn)
    #     if i == fold:
    #         test_index += index[start:end]
    #     else:
    #         train_index += index[start:end]
    # assert n == len(test_index) + len(train_index)
    # train_df = df.iloc[train_index]
    # test_df = df.iloc[test_index]

    # valid_n = int(len(train_df) * 0.9)
    # valid_df = train_df.iloc[valid_n:]
    # train_df = train_df.iloc[:valid_n]
     
    # All Swissprot proteins
    train_n = int(n * 0.9)
#     train_df = df.iloc[index[:train_n]]
#     valid_df = df.iloc[index[train_n:]]
#     test_df=df.iloc[index[train_n:]]
    train_df = df.iloc[index[:n1]]
    valid_df = df.iloc[index[n1+1:n1+n2]]
    test_df=df.iloc[index[n1+n2+1:]]
    print(len(train_df),len(valid_df),len(test_df))
#     print(n1,n1+n2)
#     print(n1+n2,n1+n2+n3)
    # CAFA2 Test data
    # train_n = int(n * 0.9)
    # train_df = df.iloc[index[:train_n]]
    # valid_df = df.iloc[index[train_n:]]
    # test_df = pd.read_pickle('data-cafa/human_test.pkl')
    print(len(df), len(train_df), len(valid_df), len(test_df))
    return train_df, valid_df, test_df
    


In [22]:

# class MyHyperModel(HyperModel):

#     def __init__(self, params):
#         self.params = params

#     def build(self, hp):
#         inp = Input(shape=self.params['input_shape'], dtype=np.float32)
#         net = inp
#         for i in range(self.params['nb_layers']):
#             net = Dense(
#                 units=hp.Int(
#                     'units', min_value=250, max_value=2000, step=250),
#                 name=f'dense_{i}', activation='relu')(net)
#             net = Dropout(hp.Choice('rate', values=[0.3, 0.5]))(net)
#         output = Dense(
#             self.params['nb_classes'], activation='sigmoid',
#             name='dense_out')(net)

#         model = Model(inputs=inp, outputs=output)
#         model.summary()
#         model.compile(
#             optimizer=Adam(
#                 hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
#             loss=self.params['loss'])
#         return model


def get_hpo_matrix(hpo, terms_dict):
    nb_classes = len(terms_dict)
    res = np.zeros((nb_classes, nb_classes), dtype=np.float32)
    for hp_id, i in terms_dict.items():
        subs = hpo.get_term_set(hp_id)
        res[i, i] = 1
        for h_id in subs:
            if h_id in terms_dict:
                res[i, terms_dict[h_id]] = 1
    return res


def create_flat_model(params):
    inp = Input(shape=params['input_shape'], dtype=np.float32)
    net = inp
    for i in range(params['nb_layers']):
        net = Dense(
            units=params['units'], name=f'dense_{i}', activation='relu')(net)
        net = Dropout(rate=params['rate'])(net)
    net = Dense(
        params['nb_classes'], activation='sigmoid',
        name='dense_out')(net)
    output = Flatten()(net)
    model = Model(inputs=inp, outputs=output)
    model.summary()
    model.compile(
        optimizer=Adam(lr=params['learning_rate']),
        loss=params['loss'])
    logging.info('Compilation finished')

    return model





class DFGenerator(Sequence):                                                                                                               
                                                                                                                                         
    def __init__(self, df, gos_dict, terms_dict, batch_size):
        self.start = 0
        self.size = len(df)
        self.df = df
        self.batch_size = batch_size
        self.terms_dict = terms_dict
        self.gos_dict = gos_dict
                                                                                                                                         
    def __len__(self):                                                                                                                   
        return np.ceil(len(self.df) / float(self.batch_size)).astype(np.int32)                                                           
                                                                                                                                         
    def __getitem__(self, idx):                                                                                                          
        batch_index = np.arange(                                                                                                         
            idx * self.batch_size, min(self.size, (idx + 1) * self.batch_size))                                                          
        df = self.df.iloc[batch_index]                                                                                                   
        data_seq = np.zeros((len(df), MAXLEN, 21), dtype=np.float32)
        data_gos = np.zeros((len(df), len(self.gos_dict)), dtype=np.float32)
        labels = np.zeros((len(df), len(self.terms_dict)), dtype=np.int32)
        for i, row in enumerate(df.itertuples()):
            data_seq[i, :] = to_onehot(row.sequences)
            
            for item in row.deepgo_annotations:
                t_id, score = item.split('|')
                if t_id in self.gos_dict:
                    data_gos[i, self.gos_dict[t_id]] = float(score)

            for t_id in row.iea_annotations:
                if t_id in self.gos_dict:
                    data_gos[i, self.gos_dict[t_id]] = 1

            for t_id in row.go_annotations:
                if t_id in self.gos_dict:
                    data_gos[i, self.gos_dict[t_id]] = 1
                
            for t_id in row.hp_annotations:
                if t_id in self.terms_dict:
                    labels[i, self.terms_dict[t_id]] = 1
        return (data_gos, labels)
    


In [5]:
len(range (8,32,8))
max_kernel=32
Kernels=range (8,max_kernel,8)
for i in range (len(Kernels)):
    print(i)

0
1
2


## Model CNN only

In [6]:
def create_model(params, hpo_matrix):
    inp = Input(shape=params['input_shape'], dtype=np.float32)
    # Load flat model
    flat_model = load_model(params['model_file'] + '_flat.h5') 
    net = flat_model(inp) 
    hpo_layer = HPOLayer(params['nb_classes'])
    hpo_layer.trainable = False
    hpo_layer.set_hpo_matrix(hpo_matrix)
    net = hpo_layer(net)
    Matrix=[]
    max_kernel=32
    Kernels=range (8,max_kernel,8)
#     nb_filters=32
#     nb_filters=64
    nb_filters=20
    for i in range (len(Kernels)):
        conv = (Conv1D(filters=nb_filters,     
                       kernel_size=Kernels[i],     
                       padding='valid',kernel_initializer= 'glorot_normal'))(net)
        pool=MaxPooling1D(pool_size=16)(conv)
        
#         lstm=LSTM(Kernels[i], activation="tanh")(pool)
#         B_LSTM=(Bidirectional(LSTM(Kernels[i], return_sequences=True), input_shape=(10, 1)))(net)
        flat= Flatten()(pool)
        Matrix.append(flat)
    net=Concatenate(axis=1)(Matrix)
#     output = Flatten()(net)
#     output=RandomFourierFeatures( params['nb_classes'],kernel_initializer='laplacian',scale=None,trainable=False, name='Output_SVM')(net)
    
    #net=Dense(params['nb_classes'],activation='relu')(net)
    #output=Dense(params['nb_classes'], activation='sigmoid', name='dense_out')(net)
    output=Dense(params['nb_classes'], activation='sigmoid', name='dense_out')(net)

    model = Model(inputs=inp, outputs=output)
    
    model.summary()
    
#     model.compile( optimizer=Adam(lr=params['learning_rate']),   
    model.compile(
        optimizer=Adam(lr=params['learning_rate']),
        metrics=['accuracy'],
        loss=params['loss'])    
    logging.info('Compilation finished')

    return model



# CNN+LSTM

In [7]:
def create_model(params, hpo_matrix):
    inp = Input(shape=params['input_shape'], dtype=np.float32)
    # Load flat model
    flat_model = load_model(params['model_file'] + '_flat.h5') 
    net = flat_model(inp) 
    hpo_layer = HPOLayer(params['nb_classes'])
    hpo_layer.trainable = False
    hpo_layer.set_hpo_matrix(hpo_matrix)
    net = hpo_layer(net)
    Matrix=[]
    max_kernel=32
    Kernels=range (8,max_kernel,8)
#     nb_filters=32
#     nb_filters=64
    nb_filters=20
    for i in range (len(Kernels)):
        conv = (Conv1D(filters=nb_filters,     
                       kernel_size=Kernels[i],     
                       padding='valid',kernel_initializer= 'glorot_normal'))(net)
        pool=MaxPooling1D(pool_size=16)(conv)
        
        lstm=LSTM(Kernels[i], activation="tanh")(pool)
        flat= Flatten()(lstm)
        Matrix.append(flat)
    net=Concatenate(axis=1)(Matrix)
#     output = Flatten()(net)
#     output=RandomFourierFeatures( params['nb_classes'],kernel_initializer='laplacian',scale=None,trainable=False, name='Output_SVM')(net)
    
    #net=Dense(params['nb_classes'],activation='relu')(net)
    #output=Dense(params['nb_classes'], activation='sigmoid', name='dense_out')(net)
    output=Dense(params['nb_classes'], activation='sigmoid', name='dense_out')(net)

    model = Model(inputs=inp, outputs=output)
    
    model.summary()
    
#     model.compile( optimizer=Adam(lr=params['learning_rate']),   
    model.compile(
        optimizer=Adam(lr=params['learning_rate']),
        metrics=['accuracy'],
        loss=params['loss'])    
    logging.info('Compilation finished')

    return model



## Model LSTM only

In [None]:
def create_model(params, hpo_matrix):
    inp = Input(shape=params['input_shape'], dtype=np.float32)
    # Load flat model
    flat_model = load_model(params['model_file'] + '_flat.h5') 
    net = flat_model(inp) 
    hpo_layer = HPOLayer(params['nb_classes'])
    hpo_layer.trainable = False
    hpo_layer.set_hpo_matrix(hpo_matrix)
    net = hpo_layer(net)
    Matrix=[]
    max_kernel=32
    Kernels=range (8,max_kernel,8)
#     nb_filters=32
#     nb_filters=64
    nb_filters=20
    for i in range (len(Kernels)):
#         conv = (Conv1D(filters=nb_filters,     
#                        kernel_size=Kernels[i],     
#                        padding='valid',kernel_initializer= 'glorot_normal'))(net)
#         pool=MaxPooling1D(pool_size=16)(conv)
        
        lstm=LSTM(Kernels[i], activation="tanh")(net)
        flat= Flatten()(lstm)
        Matrix.append(flat)
    net=Concatenate(axis=1)(Matrix)
#     output = Flatten()(net)
#     output=RandomFourierFeatures( params['nb_classes'],kernel_initializer='laplacian',scale=None,trainable=False, name='Output_SVM')(net)
    
    #net=Dense(params['nb_classes'],activation='relu')(net)
    #output=Dense(params['nb_classes'], activation='sigmoid', name='dense_out')(net)
    output=Dense(params['nb_classes'], activation='sigmoid', name='dense_out')(net)

    model = Model(inputs=inp, outputs=output)
    
    model.summary()
    
#     model.compile( optimizer=Adam(lr=params['learning_rate']),   
    model.compile(
        optimizer=Adam(lr=params['learning_rate']),
        metrics=['accuracy'],
        loss=params['loss'])    
    logging.info('Compilation finished')

    return model



# Starting Trials

# Trial_7 --> CNN+LSTM

In [1]:
# main(hp_file, data_file, terms_file, gos_file, model_file,out_file, fold, batch_size, epochs, load, logger_file, threshold,device):

main('data/hp.obo',
     
     'data/My_Implementations/human.pkl',
     
     'data/all_terms.pkl',
     
     'data/My_Implementations/gos.pkl',
     
     'data/My_Implementations/Trial_7/model_mohamed.h5',
     
     'data/My_Implementations/Trial_7/predictions.pkl',
     
     1,
#batch_size
     10,
#Number of epochs     
     1024,
     
     False,
                                   
     'data/My_Implementations/Trial_7/training.csv',
     
     0.5,
    
     'GPU:0')

NameError: name 'main' is not defined

In [10]:
# main(hp_file, data_file, terms_file, gos_file, model_file,out_file, fold, batch_size, epochs, load, logger_file, threshold,device):

main('data/hp.obo',
     
     'data/My_Implementations/human.pkl',
     
     'data/all_terms.pkl',
     
     'data/My_Implementations/gos.pkl',
     
     'data/My_Implementations/Trial_7/model_mohamed.h5',
     
     'data/My_Implementations/Trial_7/predictions.pkl',
     
     1,
#batch_size
     10,
#Number of epochs     
     1024,
     
     'load',
                                   
     'data/My_Implementations/Trial_7/training.csv',
     
     0.5,
    
     'GPU:0')

Params: {'input_shape': (24274,), 'nb_layers': 1, 'loss': 'binary_crossentropy', 'rate': 0.3, 'learning_rate': 0.001, 'units': 1500, 'model_file': 'data/My_Implementations/Trial_7/model_mohamed.h5'}
Phenotypes 2600
3933 3539 394 394
2600
Loading pretrained model


INFO:root:Evaluating model


  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']




INFO:root:Predicting




INFO:root:Computing performance:


ROC AUC: 0.90
FLAT ROC AUC: 0.90
       genes                                     hp_annotations  \
3800   65109  {HP:0100490, HP:0000274, HP:0003549, HP:001182...   
2698  284111  {HP:0001098, HP:0030669, HP:0002063, HP:000736...   
3532    7045  {HP:0000549, HP:0007827, HP:0000495, HP:000049...   
58      8289  {HP:0002086, HP:0030669, HP:0005108, HP:000123...   
3689    7355  {HP:0001098, HP:0030669, HP:0007369, HP:000057...   
2       8195  {HP:0000512, HP:0000008, HP:0004383, HP:000079...   
208      353  {HP:0002719, HP:0012622, HP:0003774, HP:000079...   
3883   57167  {HP:0000008, HP:0030669, HP:0000544, HP:000115...   
2647   30009  {HP:0002086, HP:0000433, HP:0000005, HP:010058...   
938     1644  {HP:0001266, HP:0002019, HP:0000366, HP:000245...   
251   254394  {HP:0000008, HP:0000858, HP:0010460, HP:000081...   
391     8893  {HP:0008193, HP:0000008, HP:0001098, HP:000235...   
1299   10342  {HP:0001098, HP:0006802, HP:0030188, HP:000254...   
1372    2253  {HP:0000008, HP

[394 rows x 8 columns]


INFO:root:Saving predictions


# Trial_evaluating_onTrial_7_with_40% from the dataset=testset_12-8-2021

In [23]:
# main(hp_file, data_file, terms_file, gos_file, model_file,out_file, fold, batch_size, epochs, load, logger_file, threshold,device):

main('data/hp.obo',
     
     'data/My_Implementations/human.pkl',
     
     'data/all_terms.pkl',
     
     'data/My_Implementations/gos.pkl',
     
     'data/My_Implementations/Trial_7/model_mohamed.h5',
     
     'data/My_Implementations/Trial_7/predictions_40_perc.pkl',
     
     1,
#batch_size
     10,
#Number of epochs     
     1024,
     
     'load',
                                   
     'data/My_Implementations/Trial_7/training.csv',
     
     0.5,
    
     'GPU:0')

Params: {'input_shape': (24274,), 'nb_layers': 1, 'loss': 'binary_crossentropy', 'rate': 0.3, 'learning_rate': 0.001, 'units': 1500, 'model_file': 'data/My_Implementations/Trial_7/model_mohamed.h5'}
Phenotypes 2600

1966 1572 393
3933 1966 1572 393
2600
Loading pretrained model


DEBUG:h5py._conv:Creating converter from 3 to 5




INFO:root:Evaluating model




INFO:root:Predicting




INFO:root:Computing performance:


ROC AUC: 0.90


INFO:root:Saving predictions


FLAT ROC AUC: 0.90
       genes                                     hp_annotations  \
2698  284111  {HP:0100547, HP:0002355, HP:0012444, HP:000629...   
3532    7045  {HP:0000118, HP:0000005, HP:0007802, HP:000354...   
58      8289  {HP:0002817, HP:0100547, HP:0009115, HP:000199...   
3689    7355  {HP:0100547, HP:0012444, HP:0000574, HP:000000...   
2       8195  {HP:0010460, HP:0002817, HP:0010438, HP:000866...   
...      ...                                                ...   
3441    6911  {HP:0010460, HP:0100547, HP:0009115, HP:000339...   
1344   10397  {HP:0002817, HP:0006916, HP:0003474, HP:001101...   
527      974  {HP:0011821, HP:0011017, HP:0002721, HP:000000...   
3197   63925  {HP:0011821, HP:0100547, HP:0009115, HP:001244...   
1289    2137  {HP:0002817, HP:0100547, HP:0011017, HP:003031...   

                                         go_annotations  \
2698  {GO:0005623, GO:0015849, GO:0005886, GO:004442...   
3532  {GO:0005623, GO:0032502, GO:0050789, GO:005087...   

# Trial_6 --> CNN only

In [7]:
# main(hp_file, data_file, terms_file, gos_file, model_file,out_file, fold, batch_size, epochs, load, logger_file, threshold,device):

main('data/hp.obo',
     
     'data/My_Implementations/human.pkl',
     
     'data/all_terms.pkl',
     
     'data/My_Implementations/gos.pkl',
     
     'data/My_Implementations/Trial_6/model_mohamed.h5',
     
     'data/My_Implementations/Trial_6/predictions.pkl',
     
     1,
#batch_size
     10,
#Number of epochs     
     1024,
     
     False,
                                   
     'data/My_Implementations/Trial_6/training.csv',
     
     0.5,
    
     'GPU:0')

Params: {'input_shape': (24274,), 'nb_layers': 1, 'loss': 'binary_crossentropy', 'rate': 0.3, 'learning_rate': 0.001, 'units': 1500, 'model_file': 'data/My_Implementations/Trial_6/model_mohamed.h5'}
Phenotypes 2600

2753 785 393
3933 2753 785 393
2600
Creating a new model
Training data size: 2753
Validation data size: 785
INFO:tensorflow:Reloading Oracle from data-cafa\pheno\oracle.json


INFO:tensorflow:Reloading Oracle from data-cafa\pheno\oracle.json


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24274)]           0         
_________________________________________________________________
dense_0 (Dense)              (None, 250)               6068750   
_________________________________________________________________
dropout (Dropout)            (None, 250)               0         
_________________________________________________________________
dense_out (Dense)            (None, 2600)              652600    
Total params: 6,721,350
Trainable params: 6,721,350
Non-trainable params: 0
_________________________________________________________________
INFO:tensorflow:Reloading Tuner from data-cafa\pheno\tuner0.json


INFO:tensorflow:Reloading Tuner from data-cafa\pheno\tuner0.json


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24274)]           0         
_________________________________________________________________
dense_0 (Dense)              (None, 750)               18206250  
_________________________________________________________________
dropout (Dropout)            (None, 750)               0         
_________________________________________________________________
dense_out (Dense)            (None, 2600)              1952600   
Total params: 20,158,850
Trainable params: 20,158,850
Non-trainable params: 0
_________________________________________________________________
Train on 2753 samples, validate on 785 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


INFO:root:Loading best model


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24274)]           0         
_________________________________________________________________
dense_0 (Dense)              (None, 750)               18206250  
_________________________________________________________________
dropout (Dropout)            (None, 750)               0         
_________________________________________________________________
dense_out (Dense)            (None, 2600)              1952600   
Total params: 20,158,850
Trainable params: 20,158,850
Non-trainable params: 0
_________________________________________________________________
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24274)]           0         
__________________________________



Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 24274)]      0                                            
__________________________________________________________________________________________________
model (Model)                   (None, 2600)         20158850    input_2[0][0]                    
__________________________________________________________________________________________________
hpo_layer (HPOLayer)            (None, 2600, 2600)   6760000     model[1][0]                      
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 2593, 20)     416020      hpo_layer[0][0]                  
____________________________________________________________________________________________

INFO:root:Compilation finished


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 24274)]      0                                            
__________________________________________________________________________________________________
model (Model)                   (None, 2600)         20158850    input_2[0][0]                    
__________________________________________________________________________________________________
hpo_layer (HPOLayer)            (None, 2600, 2600)   6760000     model[1][0]                      
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 2593, 20)     416020      hpo_layer[0][0]                  
____________________________________________________________________________________________

  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


Train for 276 steps, validate for 79 steps
Epoch 1/1024
Epoch 00001: val_loss improved from inf to 0.13382, saving model to data/My_Implementations/Trial_6/model_mohamed.h5
Epoch 2/1024




Epoch 00002: val_loss improved from 0.13382 to 0.12971, saving model to data/My_Implementations/Trial_6/model_mohamed.h5
Epoch 3/1024
Epoch 00003: val_loss improved from 0.12971 to 0.12897, saving model to data/My_Implementations/Trial_6/model_mohamed.h5
Epoch 4/1024




Epoch 00004: val_loss improved from 0.12897 to 0.12622, saving model to data/My_Implementations/Trial_6/model_mohamed.h5
Epoch 5/1024
Epoch 00005: val_loss did not improve from 0.12622
Epoch 6/1024




Epoch 00006: val_loss did not improve from 0.12622
Epoch 7/1024
Epoch 00007: val_loss did not improve from 0.12622
Epoch 8/1024




Epoch 00008: val_loss did not improve from 0.12622
Epoch 9/1024
Epoch 00009: val_loss did not improve from 0.12622
Epoch 10/1024




Epoch 00010: val_loss did not improve from 0.12622
Epoch 00010: early stopping


INFO:root:Loading best model




INFO:root:Evaluating model


  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']




INFO:root:Predicting




INFO:root:Computing performance:


ROC AUC: 0.90
FLAT ROC AUC: 0.89
























       genes                                     hp_annotations  \
2698  284111  {HP:0000163, HP:0004302, HP:0000929, HP:000066...   
3532    7045  {HP:0000504, HP:0007809, HP:0007802, HP:000769...   
58      8289  {HP:0000163, HP:0001167, HP:0009810, HP:000092...   
3689    7355  {HP:0000163, HP:0000929, HP:0007367, HP:001097...   
2       8195  {HP:0000163, HP:0001167, HP:0045017, HP:000139...   
208      353  {HP:0012622, HP:0000790, HP:0010978, HP:000007...   
3883   57167  {HP:0011452, HP:0045060, HP:0001507, HP:000058...   
2647   30009  {HP:0002795, HP:0002086, HP:0012042, HP:000000...   
938     1644  {HP:0001276, HP:0002493, HP:0011024, HP:000061...   
251   254394  {HP:0000924, HP:0000858, HP:0004322, HP:000150...   
391     8893  {HP:0000929, HP:0000256, HP:0001276, HP:000249...   
1299   10342  {HP:0004302, HP:0007367, HP:0001276, HP:004029...   
1372    2253  {HP:0000163, HP:0003187, HP:0000929, HP:000017...   
1735   27445  {HP:0000163, HP:0000929, HP:0007367, HP:000127..

[393 rows x 8 columns]


INFO:root:Saving predictions


# Trial_8 --> LSTM only

In [24]:
# main(hp_file, data_file, terms_file, gos_file, model_file,out_file, fold, batch_size, epochs, load, logger_file, threshold,device):

main('data/hp.obo',
     
     'data/My_Implementations/human.pkl',
     
     'data/all_terms.pkl',
     
     'data/My_Implementations/gos.pkl',
     
     'data/My_Implementations/Trial_8/model_mohamed.h5',
     
     'data/My_Implementations/Trial_8/predictions.pkl',
     
     1,
#batch_size
     5,
#Number of epochs     
     1024,
     
     'load',
                                   
     'data/My_Implementations/Trial_8/training.csv',
     
     0.5,
    
     'GPU:0')

Params: {'input_shape': (24274,), 'nb_layers': 1, 'loss': 'binary_crossentropy', 'rate': 0.3, 'learning_rate': 0.001, 'units': 1500, 'model_file': 'data/My_Implementations/Trial_8/model_mohamed.h5'}
Phenotypes 2600

2753 785 393
3933 2753 785 393
2600
Loading pretrained model


OSError: SavedModel file does not exist at: data/My_Implementations/Trial_8/model_mohamed.h5\{saved_model.pbtxt|saved_model.pb}

In [7]:
# main(hp_file, data_file, terms_file, gos_file, model_file,out_file, fold, batch_size, epochs, load, logger_file, threshold,device):

main('data/hp.obo',
     
     'data/My_Implementations/human.pkl',
     
     'data/all_terms.pkl',
     
     'data/My_Implementations/gos.pkl',
     
     'data/My_Implementations/Trial_8/model_mohamed.h5',
     
     'data/My_Implementations/Trial_8/predictions.pkl',
     
     1,
#batch_size
     5,
#Number of epochs     
     1024,
     
     False,
                                   
     'data/My_Implementations/Trial_8/training.csv',
     
     0.5,
    
     'GPU:0')

Params: {'input_shape': (24274,), 'nb_layers': 1, 'loss': 'binary_crossentropy', 'rate': 0.3, 'learning_rate': 0.001, 'units': 1500, 'model_file': 'data/My_Implementations/Trial_8/model_mohamed.h5'}
Phenotypes 2600

2753 785 393
3933 2753 785 393
2600
Creating a new model
Training data size: 2753
Validation data size: 785
INFO:tensorflow:Reloading Oracle from data-cafa\pheno\oracle.json


INFO:tensorflow:Reloading Oracle from data-cafa\pheno\oracle.json


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24274)]           0         
_________________________________________________________________
dense_0 (Dense)              (None, 250)               6068750   
_________________________________________________________________
dropout (Dropout)            (None, 250)               0         
_________________________________________________________________
dense_out (Dense)            (None, 2600)              652600    
Total params: 6,721,350
Trainable params: 6,721,350
Non-trainable params: 0
_________________________________________________________________
INFO:tensorflow:Reloading Tuner from data-cafa\pheno\tuner0.json


INFO:tensorflow:Reloading Tuner from data-cafa\pheno\tuner0.json


INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


INFO:root:Loading best model


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24274)]           0         
_________________________________________________________________
dense_0 (Dense)              (None, 1000)              24275000  
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_out (Dense)            (None, 2600)              2602600   
Total params: 26,877,600
Trainable params: 26,877,600
Non-trainable params: 0
_________________________________________________________________
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24274)]           0         
__________________________________



Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 24274)]      0                                            
__________________________________________________________________________________________________
model (Model)                   (None, 2600)         26877600    input_2[0][0]                    
__________________________________________________________________________________________________
hpo_layer (HPOLayer)            (None, 2600, 2600)   6760000     model[1][0]                      
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 8)            83488       hpo_layer[0][0]                  
____________________________________________________________________________________________

INFO:root:Compilation finished


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 24274)]      0                                            
__________________________________________________________________________________________________
model (Model)                   (None, 2600)         26877600    input_2[0][0]                    
__________________________________________________________________________________________________
hpo_layer (HPOLayer)            (None, 2600, 2600)   6760000     model[1][0]                      
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 8)            83488       hpo_layer[0][0]                  
____________________________________________________________________________________________

  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


Train for 551 steps, validate for 157 steps
Epoch 1/1024








InternalError:  [_Derived_]  Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 2600, 8, 1, 2600, 5, 8] 
	 [[{{node gradients/CudnnRNN_grad/CudnnRNNBackprop}}]]
	 [[StatefulPartitionedCall]] [Op:__inference_distributed_function_9236]

Function call stack:
distributed_function -> distributed_function -> distributed_function
