# Common

In [1]:
import math
import numpy as np
import pandas as pd
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import glob
import os
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [2]:
def fasta_to_df(path):
    """Load a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    path : str
        Location of the FASTA file.

    Returns
    -------
    pandas.DataFrame
        Column ``tag`` holds each header line without its leading ``>``;
        column ``data`` holds the record's sequence lines joined together.
        Blank lines and any text before the first header are ignored.
    """
    with open(path, 'r') as handle:
        rows = [row for row in handle.read().split('\n') if row]

    headers = []
    sequences = []
    for row in rows:
        if row.startswith('>'):
            # A header opens a new record with an (initially empty) sequence.
            headers.append(row)
            sequences.append('')
        elif sequences:
            # Sequence line belonging to the most recent header.
            sequences[-1] += row

    frame = pd.DataFrame({'tag': headers, 'data': sequences})
    frame['tag'] = frame['tag'].apply(lambda header: header[1:])
    return frame

In [3]:
def df_to_fasta(df, path):
    """Write the ``tag`` and ``data`` columns of ``df`` to ``path`` as FASTA.

    Each row becomes a ``>tag`` header line followed by one ``data`` line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``tag`` and ``data``.
    path : str
        Destination file; it is overwritten.
    """
    # Iterate over the two columns directly instead of a row-wise apply():
    # no per-row Series is built and values keep their own type (a row-wise
    # apply upcasts mixed numeric columns, e.g. an int tag would print as 1.0).
    records = [f">{tag}\n{data}\n" for tag, data in zip(df['tag'], df['data'])]
    with open(path, 'w') as file:
        file.write(''.join(records))

In [4]:
def reformat(path):
    """Return ``path`` with ``(``, ``)`` and ``:`` turned into ``_`` and every ``.`` removed."""
    substitutions = str.maketrans({'(': '_', ')': '_', ':': '_', '.': None})
    return path.translate(substitutions)

In [5]:
def reformatCT(path):
    """Read a CT file and return its text with whitespace normalised.

    On every line, tabs are treated as spaces, runs of spaces collapse to a
    single space, and leading/trailing spaces are removed. Lines that are
    empty or contain only spaces/tabs are dropped.

    Parameters
    ----------
    path : str
        Location of the CT file.

    Returns
    -------
    str
        The cleaned lines joined with ``\\n`` (an empty string when the file
        has no content lines).
    """
    with open(path, 'r') as file:
        raw_lines = file.read().split('\n')

    cleaned = []
    for raw in raw_lines:
        # Splitting on single spaces and discarding empty pieces collapses
        # space runs and trims both ends in one step.
        fields = [field for field in raw.replace('\t', ' ').split(' ') if field]
        # Whitespace-only lines yield no fields; skipping them avoids indexing
        # into an empty string, which previously raised IndexError.
        if fields:
            cleaned.append(' '.join(fields))
    return '\n'.join(cleaned)

In [6]:
def get_ct_data(ct):
    """Parse the body of a space-separated CT text.

    The first line of ``ct`` is skipped; the remaining lines are read as a
    header-less, single-space-delimited table.

    Returns
    -------
    list
        ``[nucleotide, index, values]`` taken from table columns 1, 5 and 4
        (0-based) respectively, each as a pandas Series.
    """
    _, _, body = ct.partition('\n')
    table = pd.read_csv(StringIO(body), sep=" ", header=None)
    return [table.iloc[:, 1], table.iloc[:, 5], table.iloc[:, 4]]

# Download data from miRBase

In [9]:
# Local folder that the download cell fills and the later cells read from.
directory = 'miRBase_driven_data'

In [10]:
base = "https://www.mirbase.org/ftp/CURRENT"        
!rm -r {directory}
!mkdir -p {directory}
!wget {base}/aliases.txt.gz -P ./{directory}/       ; gzip -d ./{directory}/aliases.txt.gz 
!wget {base}/hairpin.fa.gz -P ./{directory}/           ; gzip -d ./{directory}/hairpin.fa.gz 
!wget {base}/hairpin_high_conf.fa.gz -P ./{directory}/ ; gzip -d ./{directory}/hairpin_high_conf.fa.gz 
!wget {base}/mature.fa.gz -P ./{directory}/            ; gzip -d ./{directory}/mature.fa.gz 
!wget {base}/mature_high_conf.fa.gz -P ./{directory}/  ; gzip -d ./{directory}/mature_high_conf.fa.gz
!wget {base}/miRNA.str.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.str.gz 
!wget {base}/miRNA.xls.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.xls.gz 
!wget {base}/organisms.txt.gz -P ./{directory}/        ; gzip -d ./{directory}/organisms.txt.gz

--2021-12-01 19:38:13--  https://www.mirbase.org/ftp/CURRENT/aliases.txt.gz
Resolving www.mirbase.org (www.mirbase.org)... 130.88.97.249
Connecting to www.mirbase.org (www.mirbase.org)|130.88.97.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480536 (469K) [application/x-gzip]
Saving to: ‘./miRBase_driven_data/aliases.txt.gz’


2021-12-01 19:38:15 (393 KB/s) - ‘./miRBase_driven_data/aliases.txt.gz’ saved [480536/480536]

--2021-12-01 19:38:15--  https://www.mirbase.org/ftp/CURRENT/hairpin.fa.gz
Resolving www.mirbase.org (www.mirbase.org)... 130.88.97.249
Connecting to www.mirbase.org (www.mirbase.org)|130.88.97.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1547350 (1.5M) [application/x-gzip]
Saving to: ‘./miRBase_driven_data/hairpin.fa.gz’


2021-12-01 19:38:17 (951 KB/s) - ‘./miRBase_driven_data/hairpin.fa.gz’ saved [1547350/1547350]

--2021-12-01 19:38:17--  https://www.mirbase.org/ftp/CURRENT/hairpin_high_conf.fa.gz
Resol

In [230]:
# High-confidence hairpin records.
hdf = fasta_to_df(f'./{directory}/hairpin_high_conf.fa')
# Organism code = text before the first '-' of the record name, the same rule
# get_data() applies; a fixed three-character slice would truncate any longer code.
hdf['organism'] = hdf['tag'].apply(lambda x: x.split('-')[0])
hdf['length'] = hdf['data'].apply(len)
print(hdf.shape)
hdf.head(2)

(3320, 4)


Unnamed: 0,tag,data,organism,length
0,cel-let-7 MI0000001 Caenorhabditis elegans let...,UACACUGUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAU...,cel,99
1,cel-lin-4 MI0000002 Caenorhabditis elegans lin...,AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAUU...,cel,94


In [231]:
# All hairpin records (high-confidence and otherwise).
alldf = fasta_to_df(f'./{directory}/hairpin.fa')
# Organism code = text before the first '-' of the record name, the same rule
# get_data() applies; a fixed three-character slice would truncate any longer code.
alldf['organism'] = alldf['tag'].apply(lambda x: x.split('-')[0])
alldf['length'] = alldf['data'].apply(len)
alldf.head(2)

Unnamed: 0,tag,data,organism,length
0,cel-let-7 MI0000001 Caenorhabditis elegans let...,UACACUGUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAU...,cel,99
1,cel-lin-4 MI0000002 Caenorhabditis elegans lin...,AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAUU...,cel,94


In [232]:
# Hairpins whose header does not occur in the high-confidence file.
nhdf = alldf.loc[~alldf['tag'].isin(hdf['tag'])]

In [233]:
# `df` is the frame the organism filter below works on; here it is the non-high-confidence set.
df = nhdf

In [247]:
# Organism table from the download folder; '#' characters are stripped from the column names.
organism = (
    pd.read_csv(f'./{directory}/organisms.txt', sep='\t')
    .rename(columns=lambda name: name.replace('#', ''))
)
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [248]:
# Distinct values of the 'tree' column, shortest string first.
items = sorted(organism['tree'].unique(), key=len)

In [249]:
# Organisms whose 'tree' string contains "Viridiplantae;".
selectedTree = organism.loc[["Viridiplantae;" in lineage for lineage in organism['tree']]]
print(selectedTree.shape)
selectedTree.head(2)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352


In [250]:
# Rows of `df` whose organism code appears in the selected organism table.
selected = df.loc[df['organism'].isin(selectedTree['organism'])]
print(selected.shape)
selected.head(2)

(8455, 4)


Unnamed: 0,tag,data,organism,length
168,ath-MIR156a MI0000178 Arabidopsis thaliana miR...,CAAGAGAAACGCAAAGAAACUGACAGAAGAGAGUGAGCACACAAAG...,ath,123
169,ath-MIR156b MI0000179 Arabidopsis thaliana miR...,GCUAGAAGAGGGAGAGAUGGUGAUUGAGGAAUGCAACAGAGAAAAC...,ath,183


In [251]:
# Extract from miRNA.str the records of the hairpins in `selected`.
# NOTE(review): the output file is named high_conf_hairpin.str although `selected`
# is derived from the non-high-confidence frame — confirm the name is intended.
tags = list(selected['tag'].apply(lambda x : x.split(' ')[0]))
wanted = set(tags)  # constant-time membership test instead of scanning the list per record
with open(f'./{directory}/miRNA.str', 'r') as file:
    text = file.read()
text = text.split('\n')

# The file is consumed in fixed 8-line records; the first line of each record
# starts with a one-character marker followed by the hairpin name.
kept = []
for i in range(0, len(text), 8):
    if text[i].split(' ')[0][1:] in wanted:
        kept.append('\n'.join(text[i:i+8]) + "\n")
result = ''.join(kept)  # single join instead of repeated string concatenation
with open(f'./high_conf_hairpin.str', 'w') as file:
    file.write(result)

In [252]:
# Turn the extracted text into a list of lines. The guard keeps this cell
# re-runnable: once `result` is already a list there is nothing left to split.
if isinstance(result, str):
    result = result.split('\n')

In [253]:
def get_path(header):
    """Build candidate CT file stems for one structure-record header.

    The record name is the first space-delimited token of ``header`` without
    its first character. For every ``[...]`` group in the header, the text
    after the group's last ``:`` is taken as the hit range, and three stems
    are produced: the plain name, the name with ``-3p`` and with ``-5p``,
    each formatted as ``<name>|+|1-*|<hit>``.

    Returns
    -------
    list of str
        Stems in header order, three per bracketed group.
    """
    name = header.split(' ')[0][1:]
    bracketed = [chunk.partition(']')[0] for chunk in header.split('[') if ']' in chunk]
    hits = [entry.rpartition(':')[2] for entry in bracketed]
    return [
        f'{name}{suffix}|+|1-*|{hit}'
        for hit in hits
        for suffix in ('', '-3p', '-5p')
    ]

In [254]:
# Number of positions taken on each side of a hit boundary by get_data().
extra = 5
# Base URL placed in front of the CT file path inside the HYPERLINK cell built by get_data().
server_url = "http://jupyter.sysmanager.ir/tree/plant_microRNA_prediction"

def _boundary_window(items, position, radius, pad='*'):
    """Return the ``2 * radius + 1`` entries of ``items`` centred on ``position``.

    ``position`` is treated as 1-based. Slots that fall before the first or
    after the last entry are filled with ``pad``, so the result always has
    the same length.
    """
    centre = position - 1
    return [
        items[i] if 0 <= i < len(items) else pad
        for i in range(centre - radius, centre + radius + 1)
    ]


def get_data(ct_path, str_header):
    """Build the two-row feature frame for one CT file.

    The hit range ``start-end`` is read from the last ``|``-separated field
    of the CT file's first line. Around each of the two boundaries a window
    of ``extra`` positions on either side is taken from the CT table;
    positions outside the sequence are filled with ``'*'``.

    Parameters
    ----------
    ct_path : str
        Path of the CT file; it is also embedded in the HYPERLINK cell.
    str_header : str
        Structure-record header; its first space-delimited token is stored
        verbatim in the ``tag`` column and its organism prefix is looked up
        in ``selectedTree``.

    Returns
    -------
    pandas.DataFrame
        Row 0 holds the link, tag, organism name and the values from the
        third array returned by ``get_ct_data``; row 1 holds three empty
        cells followed by the nucleotides at the same positions. All cells
        are strings because both rows pass through one NumPy array.

    Raises
    ------
    IndexError
        If the organism prefix of ``str_header`` is not in ``selectedTree``.
    """
    ct = reformatCT(ct_path)
    header = ct.split('\n')[0]
    hs, he = (int(bound) for bound in header.split('|')[-1].split('-'))
    nucleotide, _, values = get_ct_data(ct)
    values = list(values)
    nucleotide = list(nucleotide)

    # One helper covers every case (window inside the sequence, overrunning the
    # left edge, overrunning the right edge), so each row always has the same
    # number of cells and a negative slice start can no longer wrap around.
    feature = [*_boundary_window(values, hs, extra), *_boundary_window(values, he, extra)]
    nuc_feature = [*_boundary_window(nucleotide, hs, extra), *_boundary_window(nucleotide, he, extra)]

    columns = []
    for side in ('hit_start', 'hit_end'):
        columns.extend(f'-{i} {side}' for i in range(extra, 0, -1))
        columns.append(side)
        columns.extend(f'+{i} {side}' for i in range(1, extra + 1))

    tag = str_header.split(' ')[0]
    # Organism code: text before the first '-' of the tag, minus the tag's first character.
    organism_name = selectedTree[selectedTree['organism'] == tag.split('-')[0][1:]]['name'].iloc[0]
    feature = [f'=HYPERLINK("{server_url}/{ct_path}","ct")', tag, organism_name, *feature]
    nuc_feature = ['', '', '', *nuc_feature]
    columns = ['ct', 'tag', 'organism name', *columns]
    df = pd.DataFrame(np.array([feature, nuc_feature]), columns=columns)
    return df

In [255]:
def get_df(header):
    """Collect the feature frames for every CT file matching one record header.

    Each stem from ``get_path(header)`` is globbed under ``./CT/``. Stems
    without a match are skipped; when several files match, the matches are
    printed and the first one is used.

    Returns
    -------
    list of pandas.DataFrame
        One ``get_data`` result per stem that matched a file.
    """
    frames = []
    for stem in get_path(header):
        matches = glob.glob(f'./CT/{stem}.ct')
        if len(matches) > 1:
            print(matches, "****")
        if not matches:
            continue
        frames.append(get_data(matches[0], header))
    return frames

In [256]:
# Build the feature frames in parallel, one task per record header
# (every 8th line of `result`).
dfs = []
# Leave one core free, but never request zero workers: cpu_count() - 1 is 0 on a
# single-core machine and the process pool rejects that.
for d in process_map(get_df, result[::8], tqdm_class=tqdm, max_workers=max(1, mp.cpu_count() - 1), chunksize=5):
    dfs.extend(d)  # grow in place instead of rebuilding the whole list each time
df_result = pd.concat(dfs, axis=0)

  0%|          | 0/8456 [00:00<?, ?it/s]

In [257]:
# Share of "not connected" ('0') versus "connected" entries per feature column,
# computed over the even rows of df_result and ignoring '*' padding cells.
temp = df_result.iloc[::2,:]
col = temp.columns
out0 = {}
out1 = {}
for i, name in enumerate(col):
    if i < 2:
        out0[name] = ""
        out1[name] = ""
    elif i == 2:
        # elif, not a second `if`: otherwise the else-branch also ran for the
        # first two columns and overwrote their blanks with 0.0 / 1.0.
        out0[name] = "not connected"
        out1[name] = "connected"
    else:
        column = temp[name]
        valid = int((column != '*').sum())
        unpaired = int((column == '0').sum())
        out0[name] = round(unpaired / valid, 3)
        # Everything that is neither '0' nor '*' counts as connected.
        out1[name] = round((valid - unpaired) / valid, 3)
# Two single-row frames joined once; no concat onto an empty DataFrame.
freq = pd.concat([pd.Series(out0).to_frame().T, pd.Series(out1).to_frame().T])

In [258]:
# Per-column frequency of each nucleotide code (case-insensitive), computed over
# the odd rows of df_result and ignoring '*' padding cells.
temp = df_result.iloc[1::2,:]
col = temp.columns
profile_rows = []
for c in tqdm(['A','C','G','T','U','R','Y','K','M','S','W','B','D','H','V','N']):
    out = {}
    for i, name in enumerate(col):
        if i < 2:
            out[name] = ""
        elif i == 2:
            # elif, not a second `if`: otherwise the else-branch also ran for the
            # first two columns and overwrote their blanks with a computed ratio.
            out[name] = c
        else:
            column = temp[name]
            valid = int((column != '*').sum())
            hits = int((column == c).sum()) + int((column == c.lower()).sum())
            out[name] = round(hits / valid, 3)
    profile_rows.append(pd.Series(out).to_frame().T)
# Concatenate once instead of growing the frame inside the loop.
profile = pd.concat(profile_rows)

  0%|          | 0/16 [00:00<?, ?it/s]

In [259]:
# Final table: the even rows of df_result followed by the nucleotide profile
# and the connected / not-connected frequencies, written without the index.
df_result = pd.concat((df_result.iloc[::2], profile, freq))
df_result.to_csv('non_high_confidence_viridiplantae.csv', index=False)