In [1]:
import tensorflow as tf
from tensorflow import keras
from scipy import stats
import numpy as np
import h5py
from pathlib import Path

import helper

In [2]:
# Ask TensorFlow which physical GPU devices it can see and echo the list.
gpus = tf.config.list_physical_devices(device_type='GPU')
print(gpus)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [11]:
# NOTE: disabled configuration cell, kept for reference only. Every line below
# is commented out, so running this cell defines no variables and creates no
# directories.
# normalization = 'log_norm'   # 'log_norm' or 'clip_norm'
# ss_type = 'seq'                  # 'seq', 'pu', or 'struct'

# data_path = Path('./data/rnacompete2013.h5')
# results_path = Path('./results/rnacomplete_2013')
# save_path = results_path / f'{normalization}_{ss_type}'

# results_path.mkdir(parents=False, exist_ok=True)
# save_path.mkdir(parents=False, exist_ok=True)

In [25]:
# RNAcompete 2013: list the available experiments, pick one by position,
# and load the three objects returned for it (bound to train/valid/test).
data_path = Path('./data/rnacompete2013.h5')
experiments = helper.get_experiment_names(data_path)
print(f'The number of experiments: {len(experiments)}')

rbp_index = 0
print(f'Loading: {experiments[rbp_index]} (index = {rbp_index})')

train, valid, test = helper.load_rnacompete_data(
    data_path,
    ss_type='seq',
    normalization='log_norm',
    rbp_index=rbp_index,
)

print(train['inputs'].shape)

The number of experiments: 244
Loading: RNCMPT00100 (index = 0)
(108227, 41, 4)


In [23]:
# RNAcompete 2009: load the data for one RNA-binding protein, selected by name.
data_path = Path('./data/rnacompete2009.h5')

# Each protein is listed once. 'VTS1' stays first so index 0 keeps selecting it.
rbp_names = ['VTS1', 'Fusip', 'HuR', 'PTB', 'RBM4', 'SF2', 'SLM2', 'U1A', 'YB1']
rbp_name = rbp_names[0]

# Defined in this cell so it runs on a fresh kernel (Restart & Run All) rather
# than relying on names left over from an earlier interactive session.
ss_type = 'seq'                # 'seq', 'pu', or 'struct'
normalization = 'log_norm'     # 'log_norm' or 'clip_norm'

print(f'Loading: {rbp_name}')
train, valid, test = helper.load_rnacompete_data(
    data_path,
    ss_type=ss_type,
    normalization=normalization,
    dataset_name=rbp_name,
)

print(train['inputs'].shape)

Loading: VTS1
(37187, 39, 4)


In [28]:
def convert_onehot2seq(onehot, alphabet='ACGU'):
    """Decode a batch of zero-padded one-hot sequences into strings.

    Each sequence is expected to hold one contiguous run of non-zero rows
    (the actual sequence), optionally surrounded by all-zero padding rows.
    Padding is stripped and every remaining row is mapped to the alphabet
    symbol at the index of its largest channel.

    Parameters
    ----------
    onehot : array-like of shape (num_sequences, length, num_channels)
        Batch of encoded sequences. Anything `np.asarray` accepts works.
    alphabet : str, default 'ACGU'
        Symbol for each channel, in channel order. It must provide at least
        `num_channels` symbols.

    Returns
    -------
    list of str
        One decoded string per input sequence. A sequence made only of
        padding decodes to the empty string.

    Raises
    ------
    ValueError
        If `onehot` is not 3-dimensional, if it has no channels or more
        channels than `alphabet` has symbols, or if a sequence has padding
        rows between non-zero rows.
    """
    onehot = np.asarray(onehot)
    if onehot.ndim != 3:
        raise ValueError(
            f'Expected a 3-D array (num_sequences, length, num_channels), '
            f'got shape {onehot.shape}'
        )

    num_channels = onehot.shape[2]
    base_map = list(alphabet)
    if num_channels < 1 or num_channels > len(base_map):
        raise ValueError(
            f'Cannot decode {num_channels} channel(s) with an alphabet of '
            f'{len(base_map)} symbol(s)'
        )

    # Computed once for the whole batch instead of once per sequence.
    nonzero_mask = np.any(onehot != 0, axis=2)   # (num_sequences, length)
    base_indices = np.argmax(onehot, axis=2)     # (num_sequences, length)

    sequences = []
    for i, (mask, idxs) in enumerate(zip(nonzero_mask, base_indices)):
        positions = np.flatnonzero(mask)

        # A row block with no non-zero entries is pure padding.
        if positions.size == 0:
            sequences.append('')
            continue

        start, end = positions[0], positions[-1] + 1

        # Ensure contiguous non-zero block: padding inside the sequence would
        # otherwise be silently decoded as the first alphabet symbol.
        if not mask[start:end].all():
            raise ValueError(f'Non-contiguous non-zero block in sequence {i}')

        sequences.append(''.join(base_map[j] for j in idxs[start:end]))

    return sequences

In [29]:
# Decode the one-hot training inputs back into strings and preview the first
# ten as the cell's displayed value.
onehot = train['inputs']
sequences = convert_onehot2seq(onehot)
sequences[:10]

['AGAUGUCUCGCCGUCCCUGAGCAGCAAAACCUCUC',
 'AGACCAAGUUGUGAAAUAGUCGAGGGAGUCGAAGUGCC',
 'AGAUAUAACAUGUCUAAACAAAUUCUAUACUAGCUGAG',
 'AGGAGUUAACGAUGAACUUUAGGCGCUGUGGAAAUCCC',
 'AGGGACCGAACGAAAACAACAUUCAGAUGUGGCUGCCG',
 'AGGGAGAUGGGCUGUGUAGAACUGAUGAGAGCGAAUUG',
 'AGACGCUUCUGUUUGUGGAAAUUAGCCUUAAUUUU',
 'AGAACGAUCUCAGGUCCACCGAUAAUCCAUGCCAUUCG',
 'AGAGUUGAGAGUAGCCUCUGCCUUCCAGGAUGUAAAAA',
 'AGACCUGAUCCUCUCAUCUGGCCAAAAUGG']