# Common

In [1]:
import math
import numpy as np
import pandas as pd
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import glob
import os
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [2]:
def fasta_to_df(path):
    """Read a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    path : str
        Location of the FASTA file.

    Returns
    -------
    pandas.DataFrame
        Column ``tag`` holds each header line without its leading ``>``;
        column ``data`` holds the record's sequence lines joined together.
        Empty lines are skipped, and any text before the first header is dropped.
    """
    with open(path, 'r') as handle:
        raw = handle.read()

    headers = []
    sequences = []
    chunks = None  # stays None until the first header line has been seen
    for row in raw.split('\n'):
        if not row:
            continue
        if row[0] == '>':
            if chunks is not None:
                sequences.append(''.join(chunks))
            headers.append(row[1:])
            chunks = []
        elif chunks is not None:
            chunks.append(row)
    if chunks is not None:
        # close the record that was still open at end of file
        sequences.append(''.join(chunks))

    return pd.DataFrame({'tag': headers, 'data': sequences})

In [3]:
def df_to_fasta(df, path):
    """Write a DataFrame to `path` in FASTA layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``tag`` and ``data``.
    path : str
        Output file; it is overwritten if it already exists.

    Each row becomes two lines: ``>`` followed by ``tag``, then ``data``.
    """
    # Walk the two columns directly instead of using DataFrame.apply for its
    # side effects: no per-row Series is built and values keep their own dtype.
    records = [f">{tag}\n{data}\n" for tag, data in zip(df['tag'], df['data'])]
    with open(path, 'w') as file:
        file.write(''.join(records))

In [4]:
def reformat(path):
    """Return `path` with '(', ')' and ':' turned into '_' and every '.' removed."""
    substitutions = str.maketrans({'(': '_', ')': '_', ':': '_', '.': None})
    return path.translate(substitutions)

In [5]:
def reformatCT(path):
    """Load a CT file and normalise its whitespace.

    Parameters
    ----------
    path : str
        Location of the CT file.

    Returns
    -------
    str
        The file content with tabs treated as spaces, runs of spaces collapsed
        to a single space, leading/trailing spaces removed from every line and
        blank lines dropped. Lines are joined with ``\\n``.

    Lines that contain only spaces/tabs are treated as blank, and an empty
    file yields an empty string (both cases used to raise IndexError).
    """
    with open(path, 'r') as file:
        text = file.read()
    cleaned = []
    for raw_line in text.split('\n'):
        # Split on single spaces and drop the empty pieces: this collapses
        # repeated spaces and trims both ends in one step.
        fields = [field for field in raw_line.replace('\t', ' ').split(' ') if field]
        if fields:
            cleaned.append(' '.join(fields))
    return '\n'.join(cleaned)

In [6]:
def get_ct_data(ct):
    """Parse the body of a CT text and pick three of its columns.

    Parameters
    ----------
    ct : str
        CT content whose first line is a header and whose remaining lines are
        single-space separated (the form `reformatCT` returns).

    Returns
    -------
    list of pandas.Series
        ``[column 1, column 5, column 4]`` (0-based positions) of the parsed
        table, in that order. Callers unpack them as nucleotide, index and
        values.
    """
    # NOTE(review): column 4 is presumably the pairing partner (0 = unpaired) - confirm against the CT spec.
    _header, _newline, body = ct.partition('\n')
    table = pd.read_csv(StringIO(body), sep=" ", header=None)
    return [table.iloc[:, position] for position in (1, 5, 4)]

# Download data from miRBase

In [138]:
# Local folder for the miRBase files; the download cell fills it and later cells read from it.
directory = 'miRBase_driven_data'

In [None]:
base = "https://www.mirbase.org/ftp/CURRENT"        
!rm -r {directory}
!mkdir -p {directory}
!wget {base}/aliases.txt.gz -P ./{directory}/       ; gzip -d ./{directory}/aliases.txt.gz 
!wget {base}/hairpin.fa.gz -P ./{directory}/           ; gzip -d ./{directory}/hairpin.fa.gz 
!wget {base}/hairpin_high_conf.fa.gz -P ./{directory}/ ; gzip -d ./{directory}/hairpin_high_conf.fa.gz 
!wget {base}/mature.fa.gz -P ./{directory}/            ; gzip -d ./{directory}/mature.fa.gz 
!wget {base}/mature_high_conf.fa.gz -P ./{directory}/  ; gzip -d ./{directory}/mature_high_conf.fa.gz
!wget {base}/miRNA.str.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.str.gz 
!wget {base}/miRNA.xls.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.xls.gz 
!wget {base}/organisms.txt.gz -P ./{directory}/        ; gzip -d ./{directory}/organisms.txt.gz

In [139]:
# High-confidence hairpins, one row per FASTA record.
hdf = fasta_to_df(f'./{directory}/hairpin_high_conf.fa')
# The organism code is the part of the tag before the first '-', the same rule
# get_data applies to the structure header. A fixed [:3] slice would truncate
# any code longer than three characters.
hdf['organism'] = hdf['tag'].str.split('-').str[0]
hdf['length'] = hdf['data'].str.len()
print(hdf.shape)
hdf.head(2)

(3320, 4)


Unnamed: 0,tag,data,organism,length
0,cel-let-7 MI0000001 Caenorhabditis elegans let...,UACACUGUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAU...,cel,99
1,cel-lin-4 MI0000002 Caenorhabditis elegans lin...,AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAUU...,cel,94


In [140]:
# Every hairpin in the release, one row per FASTA record.
alldf = fasta_to_df(f'./{directory}/hairpin.fa')
# The organism code is the part of the tag before the first '-', the same rule
# get_data applies to the structure header. A fixed [:3] slice would truncate
# any code longer than three characters, and those rows would then fail the
# isin() match against organisms.txt further down.
alldf['organism'] = alldf['tag'].str.split('-').str[0]
alldf['length'] = alldf['data'].str.len()
alldf.head(2)

Unnamed: 0,tag,data,organism,length
0,cel-let-7 MI0000001 Caenorhabditis elegans let...,UACACUGUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAU...,cel,99
1,cel-lin-4 MI0000002 Caenorhabditis elegans lin...,AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAUU...,cel,94


In [141]:
# Keep only the hairpins whose tag does not appear in the high-confidence set.
in_high_conf = alldf['tag'].isin(hdf['tag'])
nhdf = alldf[~in_high_conf]

In [142]:
# Working set for the cells below: the hairpins that are not in the high-confidence file.
df = nhdf

In [143]:
# organisms.txt is tab-separated; strip the '#' from its column names while loading.
organism = (
    pd.read_csv(f'./{directory}/organisms.txt', sep='\t')
    .rename(columns=lambda label: label.replace('#', ''))
)
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [144]:
# Distinct taxonomy strings, shortest first.
items = sorted(organism['tree'].unique(), key=len)

In [145]:
# Keep the organisms whose taxonomy string does not mention "Viridiplantae;".
outside_viridiplantae = ["Viridiplantae;" not in lineage for lineage in organism['tree']]
selectedTree = organism[outside_viridiplantae]
print(selectedTree.shape)
selectedTree.head(2)

(199, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [146]:
# Restrict the working set to hairpins whose organism code is listed in selectedTree.
organism_is_selected = df['organism'].isin(selectedTree['organism'])
selected = df[organism_is_selected]
print(selected.shape)
selected.head(2)

(26610, 4)


Unnamed: 0,tag,data,organism,length
32,cel-mir-62 MI0000033 Caenorhabditis elegans mi...,GUGAGUUAGAUCUCAUAUCCUUCCGCAAAAUGGAAAUGAUAUGUAA...,cel,58
46,cel-mir-78 MI0000049 Caenorhabditis elegans mi...,AAUAAAAUAUAUUGUUUCAUAGUGUCCGUAAAAUAACUAGAUUUAU...,cel,96


In [147]:
# Extract from miRNA.str the structure records that belong to the selected hairpins.
# The file is read as consecutive 8-line records whose first line is the header.
tags = list(selected['tag'].apply(lambda x : x.split(' ')[0]))
with open(f'./{directory}/miRNA.str', 'r') as file:
    text = file.read()
text = text.split('\n')

# A set gives constant-time membership tests; scanning the list once per
# record made this loop quadratic.
tag_lookup = set(tags)
records = []
for i in range(0, len(text), 8):
    # header looks like ">name ..." -> compare the name without the leading '>'
    if text[i].split(' ')[0][1:] in tag_lookup:
        records.append('\n'.join(text[i:i+8]) + "\n")
result = ''.join(records)
# NOTE(review): the file name says "high_conf", but `selected` is derived from the
# non-high-confidence hairpins; name kept unchanged - confirm intent.
with open(f'./high_conf_hairpin.str', 'w') as file:
    file.write(result)

In [148]:
# Turn the extracted text into a list of lines. The guard keeps the cell
# re-runnable: on a second run `result` is already a list and has no .split().
if isinstance(result, str):
    result = result.split('\n')

In [149]:
def get_path(header):
    """Build candidate CT file stems for one structure header.

    Parameters
    ----------
    header : str
        Header line of the form ``>name ... [label:range] [label:range]``.

    Returns
    -------
    list of str
        For every bracketed entry, three stems built from the text after the
        entry's last ':' - one for ``name``, one for ``name-3p`` and one for
        ``name-5p`` - each shaped like ``<name>|+|1-*|<range>``. The ``*`` is
        left in place for the caller's glob lookup.
    """
    name = header.split(' ')[0][1:]  # first token without its leading '>'
    hits = [
        chunk.split(']')[0].split(":")[-1]
        for chunk in header.split('[')
        if ']' in chunk
    ]
    return [
        f'{name}{suffix}|+|1-*|{hit}'
        for hit in hits
        for suffix in ('', '-3p', '-5p')
    ]

In [150]:
extra = 30  # positions kept on each side of the hit start and of the hit end
server_url = "http://jupyter.sysmanager.ir/tree/plant_microRNA_prediction"


def _padded_window(seq, center, flank):
    """Slice `seq` around the 1-based position `center`, padding with '*' where the window leaves the sequence."""
    total = len(seq)
    core = seq[max(center - flank - 1, 0) : min(center + flank, total)]
    pad_before = max(1 - (center - flank), 0)
    pad_after = max((center + flank) - total, 0)
    return ['*'] * pad_before + [*core] + ['*'] * pad_after


def get_data(ct_path, str_header):
    """Build the two-row feature frame for one CT file.

    Parameters
    ----------
    ct_path : str
        Path of the CT file; its first line must end in ``|<start>-<end>``.
    str_header : str
        Header line of the matching structure record (``>name ...``).

    Returns
    -------
    pandas.DataFrame
        Row 0: a spreadsheet hyperlink to the CT file, the tag, the organism
        name looked up in ``selectedTree``, the sequence length, hit start,
        hit end, followed by the column-4 CT values in a window of ``extra``
        positions on each side of the hit start and of the hit end.
        Row 1: six empty cells followed by the nucleotides in the same windows.
        Positions outside the sequence hold ``'*'``. All cells are strings
        because both rows pass through a single NumPy array.
    """
    ct = reformatCT(ct_path)
    header = ct.split('\n')[0]
    start_text, end_text = header.split('|')[-1].split('-')
    hs, he = int(start_text), int(end_text)
    nucleotide, _index, values = get_ct_data(ct)
    values = list(values)

    # window around the hit start followed by the window around the hit end
    feature = _padded_window(values, hs, extra) + _padded_window(values, he, extra)
    nuc_feature = _padded_window(nucleotide, hs, extra) + _padded_window(nucleotide, he, extra)

    columns = []
    for anchor in ('hit_start', 'hit_end'):
        columns.extend(f'-{k} {anchor}' for k in range(extra, 0, -1))
        columns.append(anchor)
        columns.extend(f'+{k} {anchor}' for k in range(1, extra + 1))

    tag = str_header.split(' ')[0]
    organism_code = tag.split('-')[0][1:]  # text before the first '-', minus the leading '>'
    organism_name = selectedTree[selectedTree['organism'] == organism_code]['name'].iloc[0]

    feature = [f'=HYPERLINK("{server_url}/{ct_path}","ct")', tag, organism_name, len(values), hs, he, *feature]
    nuc_feature = ['', '', '', '', '', '', *nuc_feature]
    columns = ['ct', 'tag', 'organism name', 'seq_length', 'mir_start', 'mir_end', *columns]
    return pd.DataFrame(np.array([feature, nuc_feature]), columns=columns)

In [151]:
def get_df(header):
    """Collect the feature frames for every CT file that matches `header`.

    Each stem from `get_path(header)` is looked up as ``./CT/<stem>.ct`` with
    glob. Stems without a match are skipped; when several files match, the
    list is printed and the first one is used.

    Returns
    -------
    list of pandas.DataFrame
        One `get_data` result per stem that matched a file (possibly empty).
    """
    frames = []
    for stem in get_path(header):
        matches = glob.glob(f'./CT/{stem}.ct')
        if not matches:
            continue
        if len(matches) > 1:
            print(matches, "****")
        frames.append(get_data(matches[0], header))
    return frames

In [152]:
# Every 8th line of `result` is a record header; build the feature frames in parallel.
headers = [result[i] for i in range(0, len(result), 8)]
# Leave one core free, but never ask for zero workers (single-core machines).
n_workers = max(1, mp.cpu_count() - 1)
dfs = []
for d in process_map(get_df, headers, tqdm_class=tqdm, max_workers=n_workers, chunksize=5):
    dfs.extend(d)  # extend in place instead of rebuilding the list on every iteration
df_result = pd.concat(dfs, axis=0)

  0%|          | 0/26611 [00:00<?, ?it/s]

In [153]:
# Per-column share of '0' versus non-'0' values over the value rows (every even row
# of df_result); '*' padding cells are excluded from the denominator.
temp = df_result.iloc[::2,:]
col = temp.columns
out0 = {}
out1 = {}
for i in range(len(col)):
    name = col[i]
    if i < 5:
        # leading metadata columns stay blank
        out0[name] = ""
        out1[name] = ""
    elif i == 5:
        # `elif`, not `if`: otherwise the `else` below also runs for i < 5 and
        # overwrites the blanks with ratios computed on the metadata columns
        out0[name] = "not connected"
        out1[name] = "connected"
    else:
        known = (temp[name] != '*').sum()
        zeros = (temp[name] == '0').sum()
        out0[name] = round(zeros / known, 3)
        out1[name] = round((known - zeros) / known, 3)
freq = pd.DataFrame([out0, out1])

In [154]:
# Per-column frequency of each nucleotide code over the nucleotide rows (every odd
# row of df_result), case-insensitive; '*' padding cells are excluded from the denominator.
temp = df_result.iloc[1::2,:]
col = temp.columns
# the denominators do not depend on the letter, so compute them once
known_counts = {name: (temp[name] != '*').sum() for name in col[3:]}
profile_rows = []
for c in tqdm(['A','C','G','T','U','R','Y','K','M','S','W','B','D','H','V','N']):
    out = {}
    for i in range(len(col)):
        name = col[i]
        if i < 2:
            out[name] = ""
        elif i == 2:
            # `elif`, not `if`: otherwise the `else` below also runs for i < 2
            # and overwrites the blanks with 0.0
            out[name] = c
        else:
            hits = (temp[name] == c).sum() + (temp[name] == c.lower()).sum()
            out[name] = round(hits / known_counts[name], 3)
    profile_rows.append(out)
# build the frame once instead of concatenating inside the loop
profile = pd.DataFrame(profile_rows)

  0%|          | 0/16 [00:00<?, ?it/s]

In [155]:
# Final table: the value rows (every even row), then the nucleotide profile, then the
# 0 / non-0 frequencies. NOTE: df_result is overwritten here, so re-running this cell
# alone would slice the already combined table.
export_parts = [df_result.iloc[::2, :], profile, freq]
df_result = pd.concat(export_parts, axis=0)
df_result.to_csv('non_high_confidence_non_viridiplantae.csv', index=False)