In [1]:
import os
import sys
import argparse
import itertools
import subprocess
import pandas as pd
import multiprocessing
from itertools import cycle
from libs import functions
from datetime import datetime
from multiprocessing import Pool
from subprocess import run, PIPE
from IPython.core.display import display, HTML
# Inject a CSS rule so the notebook's `.container` element uses the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
def valid_file(param):
    """Argparse ``type=`` validator for sequence file paths.

    The extension of ``param`` is compared case-insensitively against
    ``.csv``, ``.fasta`` and ``.fa``.

    Args:
        param: path string supplied on the command line.

    Returns:
        ``param`` unchanged when its extension is accepted.

    Raises:
        argparse.ArgumentTypeError: when the extension is anything else
            (including a missing extension).
    """
    accepted_extensions = ('.csv', '.fasta', '.fa')
    extension = os.path.splitext(param)[1].lower()
    if extension in accepted_extensions:
        return param
    raise argparse.ArgumentTypeError('File must have a csv or fasta extension')


def check_arg(args=None):
    """Parse the command-line options of the avoidance calculator.

    Args:
        args: list of argument strings to parse. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        Tuple ``(mrna, ncrna, length, output, processes)``:
        ``mrna`` / ``ncrna`` are the validated input paths, ``length`` and
        ``processes`` are ints, ``output`` is the output file name.

    Notes:
        ``--length`` and ``--processes`` now really default to 30 and 16,
        as their help texts have always advertised (previously both were
        ``None`` when omitted), and ``--length`` is parsed as an int.
        argparse exits the process (``SystemExit``) on invalid or missing
        required arguments.
    """
    parser = argparse.ArgumentParser(description='Avoidance calculator script')
    parser.add_argument('-m', '--mrna',
                        type=valid_file,
                        help='mrna in csv or fasta format',
                        required=True)  # a real bool, not the truthy string 'True'
    parser.add_argument('-n', '--ncrna',
                        type=valid_file,
                        help='ncrna in csv or fasta format',
                        required=True)
    parser.add_argument('-l', '--length',
                        type=int,
                        default=30,
                        help='length to calculate interactions for. default = 30 nt')
    parser.add_argument('-o', '--output',
                        help='Output file name.',
                        default='avoidance')
    parser.add_argument('-p', '--processes',
                        type=int,
                        default=16,
                        help='number of process to spawn. Default = 16')

    results = parser.parse_args(args)
    return (results.mrna,
            results.ncrna,
            results.length,
            results.output,
            results.processes)


def interaction_calc(seq):
    """Run ``RNAup`` on ``seq`` and return its stdout as a single-line string.

    The command executed is ``RNAup -b -o --interaction_first``; ``seq`` is
    written to its stdin and its stderr is discarded.

    Args:
        seq: text to feed to RNAup on stdin (str).

    Returns:
        RNAup's stdout decoded to ``str`` with every newline replaced by a
        single space.

    Raises:
        FileNotFoundError: propagated from ``subprocess.run`` when the
            ``RNAup`` executable cannot be found.

    Notes:
        The output is decoded properly instead of post-processing
        ``str(bytes)``: the old approach left the closing quote of the bytes
        repr in the result and stripped any genuine ``b'`` sequence (e.g. an
        output ending in ``b``).
    """
    proc = run(['RNAup', '-b', '-o', '--interaction_first'],
               stdout=PIPE,
               stderr=subprocess.DEVNULL,
               input=seq.encode())  # run() is in binary mode, so stdin must be bytes
    # errors='replace' keeps this as tolerant of odd bytes as the old repr-based code.
    return proc.stdout.decode(errors='replace').replace('\n', ' ')

In [9]:
# %%timeit
# Build, for every mrna record, one text block containing that mrna followed by
# all ncrna records; these blocks are what interaction_calc receives later on.
#
# Load both inputs through the project helper.
# NOTE(review): judging by how records are rebuilt below, column 1 presumably
# holds the FASTA header/identifier and column 0 the sequence — confirm against
# functions.fasta_to_dataframe.
mrna = functions.fasta_to_dataframe("test.fa")
ncrna = functions.fasta_to_dataframe("test_ncrna.fa")
# Rebuild FASTA-style records: '>' + column 1, newline, column 0, newline.
# The mrna side keeps only the first 30 characters of column 0.
mrna['mrna_seq'] = '>' + mrna[1].map(str) + '\n' + mrna[0].map(str).str[:30] + '\n'
ncrna['ncrna_seq'] = '>' + ncrna[1].map(str) + '\n' + ncrna[0].map(str) + '\n'
# Pull the record strings out into plain Python lists.
mrna_seq = [rows['mrna_seq'] for index,rows in mrna.iterrows()]   
ncrna_seq = [rows['ncrna_seq'] for index,rows in ncrna.iterrows()]
# Cartesian product: every mrna record paired with every ncrna record.
index = pd.MultiIndex.from_product([mrna_seq , ncrna_seq], names = ['mrna', 'ncrna'])
sequence_df = pd.DataFrame(index = index).reset_index()
# Reshape to one row per mrna record and one column per ncrna record; each cell
# holds the ncrna record text itself.
# NOTE(review): pivot needs unique (mrna, ncrna) pairs and orders its labels, so
# duplicate records would presumably fail and record order may differ from the
# input files — confirm.
df = sequence_df.pivot(index='mrna',columns='ncrna',values='ncrna')
# reset_index() brings the mrna record back as the first column; summing the
# string values row-wise concatenates mrna + all ncrna records into one string.
df['interaction_first'] = df.reset_index().values.sum(axis=1)

In [10]:
# Run interaction_calc over every combined record with a pool of worker
# processes, collecting the raw output strings in `interactions`.
n_workers = 4
total_pairs = df.shape[0]
# imap_unordered rejects a chunksize below 1, which int(total_pairs/4) produced
# whenever there were fewer than 4 rows; clamp so small inputs still run.
chunk_size = max(1, total_pairs // n_workers)
interactions = []
my_pool = Pool(n_workers)
try:
    functions.progress(0, total_pairs)
    # Results are yielded as they complete, not in input order.
    for result in my_pool.imap_unordered(interaction_calc, df['interaction_first'],
                                         chunksize=chunk_size):
        interactions.append(result)
        functions.progress(len(interactions), total_pairs)
except BaseException:
    # A task (or an interrupt) failed: stop the workers instead of leaking them.
    my_pool.terminate()
    raise
else:
    my_pool.close()
finally:
    my_pool.join()



In [11]:
# Parse identifiers and binding energies out of the raw interaction strings.
# extractall() returns one row per regex hit, indexed by
# (position in `interactions`, hit number); the second level is named 'match'.
#
# The replace() calls strip a literal character, so regex=False is used: with
# regex=True a lone '(' is an invalid pattern on pandas versions that no longer
# treat single-character patterns as literals.
seq_id = pd.Series(interactions).str.extractall(r'(>[\S]+)')[0].str.replace('>', '', regex=False).to_frame()
# Hits 1.. of each interaction string; presumably the ncrna ids, since the mrna
# record is placed first in every input — verify against the RNAup output.
ncrna_id = (seq_id.loc[pd.IndexSlice[:, 1:], :]).reset_index().set_index('level_0')
# Hit 0 of each interaction string; presumably the mrna id.
mrna_id = (seq_id.loc[pd.IndexSlice[:, 0], :]).reset_index().set_index('level_0')
# Negative numbers that directly follow an opening parenthesis, e.g. "(-3.20".
binding_energy = (
    pd.Series(interactions)
    .str.extractall(r'(\(-[0-9]+\.[0-9]+)')[0]
    .str.replace('(', '', regex=False)
    .to_frame()
    .reset_index()
    .set_index('level_0')
)
# Column 1 of each frame is the extracted id (column 0 is the 'match' number).
# Both frames are indexed by interaction number ('level_0').
d = pd.concat([mrna_id.iloc[:,[1]], ncrna_id.iloc[:,[1]]], axis=1)
# Number the rows 0..len(ncrna)-1 repeatedly so they can be joined to the
# 'match' number of binding_energy.
# NOTE(review): assumes rows of `d` are grouped per interaction with ncrna ids
# in the same order as the binding-energy hits — confirm.
match = cycle(list(range(len(ncrna))))
d['match'] = [next(match) for i in range(len(d))]

In [12]:
# Join the id table `d` with the binding energies on (interaction number, match number).
# Move the 'level_0' index back into a column and make sure it is an int key.
# NOTE(review): this cell reassigns `d` and `binding_energy` in place, so running
# it twice without re-running the previous cell will not give the same result.
d = d.reset_index()
d['level_0']=d['level_0'].astype(int)

binding_energy = binding_energy.reset_index()
binding_energy['level_0']=binding_energy['level_0'].astype(int)
binding_energy['match']=binding_energy['match'].astype(int)
# Both frames are reset_index()-ed once more here, which adds an extra positional
# 'index' column to each side of the merge; the positional column selection
# (iloc[:, [5, 6, 3]]) applied to the merged frame afterwards presumably depends
# on this exact layout — do not drop these calls without re-checking positions.
d = pd.merge(binding_energy.reset_index(), d.reset_index(), on=['level_0','match'])

In [13]:
# Keep three columns of the merged frame, selected by position (5, 6, 3 in that
# order), and give them their final report names.
selected_positions = [5, 6, 3]
report_columns = ['Accession', 'ncRNA', 'Binding_energy']
d = d.iloc[:, selected_positions]
d.columns = report_columns