In [1]:
import tensorflow as tf
import numpy as np
import os
from tqdm import tqdm
import argparse
from utils.utils import create_tfr_files, prob_to_secondary_structure
from utils.FastaMLtoSL import FastaMLtoSL
import time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Show the installed TensorFlow version; the cells below go through the tf.compat.v1 API.
print(tf.__version__)

1.14.0


In [3]:
# Quieten TensorFlow: native-side log level via the environment, Python-side via tf logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# FASTA file to predict on. Judging by its printed log, FastaMLtoSL rewrites a
# multi-line FASTA as single-line records and writes the result to file.
inputs = '/home/gwsuh/ksj/test.fasta'
FastaMLtoSL(inputs)

# Directory that holds this notebook; the model and tfrecords paths below are built from it.
base_path = os.path.dirname(os.path.realpath('/home/gwsuh/ksj/SPOT-RNA/spotrna.ipynb'))
input_file = os.path.basename(inputs)

# Build the tfrecords input for the networks (project helper from utils.utils).
create_tfr_files(inputs, base_path, input_file)

# Keep only the non-blank lines, each stripped of surrounding whitespace.
with open(inputs) as fasta_handle:
    stripped_lines = (raw.strip() for raw in fasta_handle.read().splitlines())
    input_data = [text for text in stripped_lines if text]

# Records are read as (header, sequence) line pairs, so there are len // 2 of them.
count = len(input_data) // 2

# Headers are the even-indexed lines; remove every '>' to get the record id.
ids = [header.replace(">", "") for header in input_data[:2 * count:2]]
sequences = {}

>> Opening FASTA file...
>> Converting FASTA file from multiline to single line and writing to file.
>> Done!

Preparing tfr records file for SPOT-RNA:


100%|██████████| 1/1 [00:00<00:00, 17.06it/s]


In [3]:
# Sanity check: parsed record ids, resolved base directory and FASTA basename, one per line.
print(ids, base_path, input_file, sep="\n")

['AAAB01008933.1001579.ct']
/home/gwsuh/ksj/SPOT-RNA
test.fasta


In [4]:
# Each record's sequence is the line right after its header, i.e. the odd-indexed lines.
# Normalise to upper-case RNA: drop inner spaces and turn every T into U.
for rna_id, raw_sequence in zip(ids, input_data[1::2]):
    sequences[rna_id] = raw_sequence.replace(" ", "").upper().replace("T", "U")

# Restrict TensorFlow to one GPU (hard-coded device index).
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Number of checkpoints (model0 .. model{NUM_MODELS-1}) loaded by the prediction loop.
NUM_MODELS = 5

# Single-element list with the tfrecords file fed to the dataset initializer.
test_loc = [os.path.join(base_path, 'input_tfr_files', input_file + '.tfrecords')]

# Filled by the prediction loop, both keyed by RNA name:
# outputs -> list of per-model probability arrays, mask -> label mask.
outputs = {}
mask = {}
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` in extended precision.

    Parameters
    ----------
    x : scalar or array-like
        Raw scores (logits) to squash into the open interval (0, 1).

    Returns
    -------
    numpy.longdouble scalar or numpy.ndarray of dtype numpy.longdouble
        Same shape as ``x``.
    """
    # np.longdouble is the portable name of the platform's widest float. Where
    # np.float128 exists it is the very same type, but np.float128 is missing on
    # some NumPy builds, which made the original line fail with AttributeError.
    logits = np.array(x, dtype=np.longdouble)
    return 1 / (1 + np.exp(-logits))

In [5]:
# Sanity check: tfrecords location, then the length of the example record's sequence.
print(test_loc, len(sequences['AAAB01008933.1001579.ct']), sep="\n")

['/home/gwsuh/ksj/SPOT-RNA/input_tfr_files/test.fasta.tfrecords']
141


In [5]:
# Run every SPOT-RNA checkpoint over the tfrecords file and collect, per RNA name,
# the list of per-model sigmoid probabilities (outputs) and the label mask (mask).
for MODEL in range(NUM_MODELS):
    config = tf.compat.v1.ConfigProto()
    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.allow_growth = True

    # Checkpoint prefix shared by the '.meta' graph definition and the weights.
    model_prefix = os.path.join(base_path, 'SPOT-RNA-models', 'model' + str(MODEL))

    print('\nPredicting for SPOT-RNA model '+str(MODEL))
    try:
        with tf.compat.v1.Session(config=config) as sess:
            saver = tf.compat.v1.train.import_meta_graph(model_prefix + '.meta')
            saver.restore(sess, model_prefix)
            graph = tf.compat.v1.get_default_graph()

            # Handles into the imported graph, looked up by their saved names.
            init_test = graph.get_operation_by_name('make_initializer_2')
            tmp_out = graph.get_tensor_by_name('output_FC/fully_connected/BiasAdd:0')
            name_tensor = graph.get_tensor_by_name('tensors_2/component_0:0')
            RNA_name = graph.get_tensor_by_name('IteratorGetNext:0')
            label_mask = graph.get_tensor_by_name('IteratorGetNext:4')

            # Point the dataset iterator at our tfrecords file.
            sess.run(init_test, feed_dict={name_tensor: test_loc})

            # The context manager closes the bar even if a batch fails part-way.
            with tqdm(total=count) as pbar:
                while True:
                    # Only sess.run can signal the end of the dataset, so it is the
                    # only statement guarded by the OutOfRangeError handler.
                    try:
                        out = sess.run([tmp_out, RNA_name, label_mask], feed_dict={'dropout:0': 1})
                    except tf.errors.OutOfRangeError:
                        break

                    logits, rna_id, pair_mask = out[0], out[1].decode(), out[2]
                    mask[rna_id] = pair_mask

                    # The first model starts a fresh list (so re-running the cell does
                    # not accumulate stale results); later models append to it.
                    if MODEL == 0:
                        outputs[rna_id] = [sigmoid(logits)]
                    else:
                        outputs[rna_id].append(sigmoid(logits))
                    pbar.update(1)
    finally:
        # Always drop the imported graph, including on errors or interrupts, so the
        # next import_meta_graph starts from an empty default graph. The session is
        # already closed at this point.
        tf.compat.v1.reset_default_graph()


Predicting for SPOT-RNA model 0


100%|██████████| 1/1 [00:00<00:00,  1.29it/s]



Predicting for SPOT-RNA model 1


100%|██████████| 1/1 [00:00<00:00,  1.09it/s]



Predicting for SPOT-RNA model 2


100%|██████████| 1/1 [00:01<00:00,  1.26s/it]



Predicting for SPOT-RNA model 3


100%|██████████| 1/1 [00:01<00:00,  1.83s/it]



Predicting for SPOT-RNA model 4


100%|██████████| 1/1 [00:02<00:00,  2.30s/it]


In [7]:
# Shape of the stacked per-model predictions for the example record.
np.shape(outputs['AAAB01008933.1001579.ct'])

(5, 9730, 1)

In [6]:
# Names of every RNA that received predictions, in the dict's key order.
RNA_ids = list(outputs)
# Will hold the model-averaged probabilities per RNA name.
ensemble_outputs = {}

In [7]:
def output_mask(seq, NC=True):
    """Build a square 0/1 matrix marking which position pairs may form a base pair.

    Entry ``[i, j]`` is 1 when the two-letter string ``str(seq[i]) + str(seq[j])``
    is an allowed pair and 0 otherwise.

    Parameters
    ----------
    seq : sequence of str
        Nucleotide symbols; matching is case-sensitive against upper-case pairs.
    NC : bool, default True
        When False only AU/UA/GC/CG/GU/UG are allowed. When True ten further
        pairs are added (NC presumably stands for "non-canonical"), which makes
        every combination of A, C, G and U allowed.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(seq), len(seq))``.
    """
    allowed = {'AU', 'UA', 'GC', 'CG', 'GU', 'UG'}
    if NC:
        allowed |= {'CC', 'GG', 'AG', 'CA', 'AC', 'UU', 'AA', 'CU', 'GA', 'UC'}

    size = len(seq)
    symbols = [str(base) for base in seq]
    grid = np.zeros((size, size))
    # Fill one row at a time; booleans are stored as 1.0 / 0.0 in the float grid.
    for row, first in enumerate(symbols):
        grid[row] = [first + second in allowed for second in symbols]
    return grid

In [8]:
# Not applied anywhere in this cell; retained so the name stays defined as before.
Threshold = 0.335

# Average the models' predictions per RNA and scatter them back into an L x L matrix.
# The loop variable is deliberately not called `i`: the original inner index loop
# reused `i` and overwrote the RNA id.
for rna_id in RNA_ids:
    # Ensemble = mean over axis 0, the axis that indexes the individual models.
    ensemble_outputs[rna_id] = np.mean(outputs[rna_id], 0)

    test_output = ensemble_outputs[rna_id]
    label_mask = mask[rna_id]
    seq = sequences[rna_id]

    # 0/1 matrix of nucleotide pairs that are allowed to pair at all.
    mask_post = output_mask(seq)

    # Positions flagged in the label mask; the k-th prediction belongs to the
    # k-th flagged position in np.where's (row-major) order.
    inds = np.where(label_mask == 1)
    y_pred = np.zeros(label_mask.shape)

    # Vectorised scatter. Flattening first hands NumPy plain scalars instead of
    # the size-1 rows the per-element loop used to assign one by one.
    pair_probs = np.reshape(test_output, -1)
    y_pred[inds[0][:pair_probs.size], inds[1][:pair_probs.size]] = pair_probs

    # Zero out every position whose nucleotide pair is not allowed.
    y_pred = np.multiply(y_pred, mask_post)

    print(y_pred.shape)

    # prob_to_secondary_structure(ensemble_outputs[rna_id], mask[rna_id], sequences[rna_id], rna_id, args, base_path)

(141, 141)


In [None]:
# Display the masked pair-probability matrix left by the post-processing loop
# (the value from the last RNA that loop handled).
y_pred

array([[0.00000000e+00, 0.00000000e+00, 4.70652880e-04, ...,
        4.39165109e-05, 1.71809056e-04, 5.40575038e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        7.29930203e-03, 6.76447263e-03, 4.58156017e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        5.56251952e-03, 1.37271040e-04, 1.69373845e-04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 6.68556936e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [10]:
# Persist the masked probability matrix from the post-processing loop with np.save.
# NOTE(review): in the run shown above input_file is 'test.fasta', which contains no
# '.ct', so the replace() below leaves the name unchanged. Confirm whether the RNA id
# (which does end in '.ct') was meant to name the file instead.
# NOTE(review): y_pred is reassigned for every RNA in the loop that builds it, so only
# the last RNA's matrix reaches this call when the FASTA holds several records.
np.save(os.path.join('/home/gwsuh/ksj/SPOT-RNA/', input_file.replace('.ct', '_raw_predict_.npy')), y_pred)