# Download data from Mirbase

In [48]:
'''
base = "https://www.mirbase.org/ftp/CURRENT"        
!wget {base}/miRNA.str.gz -P ./Data/          
!gzip -d ./Data/miRNA.str.gz 
'''

'\nbase = "https://www.mirbase.org/ftp/CURRENT"        \n!wget {base}/miRNA.str.gz -P ./Data/          \n!gzip -d ./Data/miRNA.str.gz \n'

# Convertor

In [72]:
import numpy as np
from tqdm import tqdm, trange  # !pip install tqdm
import pandas as pd
from tqdm.contrib.concurrent import process_map
import multiprocessing as mp

In [50]:
def fasta_to_df(path):
    with open(path, 'r') as file:
        text = file.read()
    lines = [line for line in text.split('\n') if len(line) > 0]
    s = ''
    tags = []
    data = []
    for l in lines:
        if(l[0]=='>'):
            tags.append(l)        
            data.append(s)
            s = ''
        else:
            s += l    
    data.append(s)
    df = pd.DataFrame({'tag': tags, 'data': data[1:]})
    df['tag'] = df['tag'].apply(lambda x: x[1:])    
    return df

In [51]:
def isChar(c):
    return not c in [' ', '-', '|']

def adjust(text,n=7):
    text = str(text)    
    return " " * (n - len(text)) + text

def isMature(c1, c2):
    if(c1.isupper() and isChar(c1)):
        return True
    if(c2.isupper() and isChar(c2)):
        return True
    return False

In [52]:
def get_number(strText):
    number = 0 
    for i in range(len(strText[2])):
        if(isChar(strText[2][i]) or isChar(strText[3][i])):
            number += 1    
        if(isChar(strText[4][i])):
            number += 1    
        if(isChar(strText[5][i]) or isChar(strText[6][i])):
            number += 1     
    return number

In [53]:
def get5pInfo(strText):
    has_5p = False
    s_5p = 0
    f_5p = 0
    counter = 0
    for i in range(len(strText[2])):
        if(isChar(strText[2][i]) or isChar(strText[3][i])):
            counter += 1            
            if(isMature(strText[2][i] ,strText[3][i])):
                if(not has_5p):
                    has_5p = True
                    s_5p = counter
                else:
                    continue
            elif(has_5p):
                f_5p = counter - 1 
                break
    return [has_5p, s_5p, f_5p]

In [54]:
def get3pInfo(strText, number):
    has_3p = False
    s_3p = 0
    f_3p = 0
    counter = 0
    for i in range(len(strText[6])):
        if(isChar(strText[6][i]) or isChar(strText[5][i])):
            counter += 1            
            if(isMature(strText[6][i] ,strText[5][i])):
                if(not has_3p):
                    has_3p = True
                    s_3p = counter
                else:
                    continue
            elif(has_3p):
                f_3p = counter - 1 
                break        
    return [has_3p, number - f_3p + 1, number - s_3p + 1]    

In [55]:
#2 [FO:Forward_out]  u    --          a   --    ucua     uu   ----a  u        -             a      gcuu          - a 
#3 [FI:Forward_in ]   uguc  uugagagggg aga  gauc    ugggu  ugg     gg cUGACAGA AGAGAGUGAGCAC cacggu    ucuuagcaug c a
#4 [Mid:Middle    ]   ||||  |||||||||| |||  ||||    |||||  |||     || |||||||| ||||||||||||| ||||||    |||||||||| |  
#5 [RI:Reverse_in ]   acag  gacucucuuc ucu  uuag    acccg  acc     cC GACUGUCU UCUCUCACUCGug gugucg    agggucguac g g
#6 [RO:Reverse_out]  g    uu          c   aa    uaua     -u   acuug  -        A             c      ----          c a 

def convertor(strText, output_path = "CT"):
    !mkdir -p {output_path}
    index  = []
    values = []
    nucludid = []
    def add(i, v, n):        
        index.append(i)
        values.append(v)
        nucludid.append(n)
            
    f_counter = 1
    r_counter = 0
                    
    number = get_number(strText)    
    [FO, FI, Mid, RI, RO] = strText[2:]
    # main loop    
    for i in range(len(Mid)):           
        
        if(isChar(FO[i]) and FI[i] == " " and Mid[i] == " " and RI[i] == " "  and RO[i] == "-" ):
            add(f_counter, 0, FO[i])                
            f_counter += 1                      
                
        elif(FO[i] == "-" and FI[i] == " " and Mid[i] == " " and RI[i] == " "  and isChar(RO[i])):                                            
            add(number - r_counter, 0, RO[i])                
            r_counter += 1                      
        
        elif(isChar(FO[i]) and not isChar(FI[i]) and not isChar(Mid[i]) and not isChar(RI[i]) and isChar(RO[i])):
            add(f_counter, 0, FO[i])                    
            add(number - r_counter, 0, RO[i])                                
            f_counter += 1        
            r_counter += 1                              
            
        elif(not isChar(FO[i]) and isChar(FI[i]) and Mid[i] == "|" and isChar(RI[i]) and not isChar(RO[i])):        
            add(f_counter, number - r_counter, FI[i])                        
            add(number - r_counter, f_counter, RI[i])                
            f_counter += 1        
            r_counter += 1                  
            
        elif(not isChar(FO[i]) and isChar(FI[i]) and isChar(Mid[i]) and isChar(RI[i]) and not isChar(RO[i])):
            add(f_counter, 0, FI[i])
            add(f_counter + 1, 0, Mid[i])
            add(number - r_counter, 0, RI[i])
            f_counter += 2
            r_counter += 1
            
        elif(not isChar(FO[i]) and isChar(FI[i]) and Mid[i] == " " and isChar(RI[i]) and not isChar(RO[i])):                
            add(f_counter, 0, FI[i])                    
            add(number - r_counter, 0, RI[i])                    
            f_counter += 1        
            r_counter += 1                  
            
        elif(not isChar(FO[i]) and not isChar(FI[i]) and isChar(Mid[i]) and not isChar(RI[i]) and not isChar(RO[i])):                
            add(f_counter, 0, Mid[i])                                
            f_counter += 1                    

    # sort indexs and other base on indexes
    inds = np.array(index).argsort()
    index = np.array(index)[inds]
    values = np.array(values)[inds]
    nucludid = np.array(nucludid)[inds]   
    # body
    text = ""    
    for i in range(len(index)):
        text += f"{adjust(index[i],6)} {nucludid[i]} {adjust(i,6)} {adjust((i+2)%(number+1),6)} {adjust(values[i],6)} {adjust(index[i],7)}\n"
    # 5p
    [has_5p, s_5p, f_5p] = get5pInfo(strText)
    # 3p
    [has_3p, s_3p, f_3p] = get3pInfo(strText, number)
    
    # common info
    deltaG = strText[0].split('(')[1].split(')')[0]    
    accession = strText[0].split(' ')[0][1:]
    
    common_header = f"{adjust(number,6)} dG ={adjust(deltaG,10)} {accession}"
    if(has_3p and has_5p):                
        with open(f"{output_path}/{accession}-5p|+|1-{number}|{s_5p}-{f_5p}.ct", 'w') as file:            
            file.write(f"{common_header}-5p|+|1-{number}|{s_5p}-{f_5p}\n" + text)
        with open(f"{output_path}/{accession}-3p|+|1-{number}|{s_3p}-{f_3p}.ct", 'w') as file:            
            file.write(f"{common_header}-3p|+|1-{number}|{s_3p}-{f_3p}\n" + text)
            
    elif(has_5p):        
        with open(f"{output_path}/{accession}|+|1-{number}|{s_5p}-{f_5p}.ct", 'w') as file:
            file.write(f"{common_header}|+|1-{number}|{s_5p}-{f_5p}\n" + text)
            
    elif(has_3p):        
        with open(f"{output_path}/{accession}|+|1-{number}|{s_3p}-{f_3p}.ct", 'w') as file:
            file.write(f"{common_header}|+|1-{number}|{s_3p}-{f_3p}\n" + text)            

In [56]:
strText = lines[0:(0+7)]           
print('\n'.join(strText))
convertor(strText,"CT_high_viridi")

>osa-MIR156b (-94.20)   [osa-miR156b-5p:43-62] [osa-miR156b-3p:110-130]

u    --          a   --    ucua     uu   ----a  u        -             a      gcuu          - a 
 uguc  uugagagggg aga  gauc    ugggu  ugg     gg cUGACAGA AGAGAGUGAGCAC cacggu    ucuuagcaug c a
 ||||  |||||||||| |||  ||||    |||||  |||     || |||||||| ||||||||||||| ||||||    |||||||||| |  
 acag  gacucucuuc ucu  uuag    acccg  acc     cC GACUGUCU UCUCUCACUCGug gugucg    agggucguac g g
g    uu          c   aa    uaua     -u   acuug  -        A             c      ----          c a 


#  Select miRNA.str

In [57]:
directory = 'miRBase_driven_data'

In [58]:
df = fasta_to_df(f'./{directory}/hairpin_high_conf.fa')
df['organism'] = df['tag'].apply(lambda x: x[:3])
df['length'] = df['data'].apply(lambda x: len(x))
print(df.shape)
df.head(2)

(3320, 4)


Unnamed: 0,tag,data,organism,length
0,cel-let-7 MI0000001 Caenorhabditis elegans let...,UACACUGUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAU...,cel,99
1,cel-lin-4 MI0000002 Caenorhabditis elegans lin...,AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAUU...,cel,94


In [59]:
organism = pd.read_csv(f'./{directory}/organisms.txt',sep='\t')
organism.columns = [c.replace('#','') for c in organism.columns] # remove sharp from columns
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [60]:
selectedTree = organism[organism['tree'].apply(lambda x: "Viridiplantae;" in x)]
print(selectedTree.shape)
selectedTree.head(2)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352


In [61]:
selected = df[df['organism'].isin(selectedTree['organism'])]
print(selected.shape)
selected.head(2)

(160, 4)


Unnamed: 0,tag,data,organism,length
460,osa-MIR156b MI0000654 Oryza sativa miR156b ste...,UUGUCUUGAGAGGGGAAGAGAUCUCUAUGGGUUUUGGAGGUCUGAC...,osa,176
461,osa-MIR156c MI0000655 Oryza sativa miR156c ste...,GGAGGAAGAGAGGGGUGAGAGGUGAGGCUGACAGAAGAGAGUGAGC...,osa,149


In [62]:
tags = list(selected['tag'].apply(lambda x : x.split(' ')[0]))
with open(f'./{directory}/miRNA.str', 'r') as file:
    text = file.read()
text = text.split('\n')

result = ''
for i in range(0,len(text),8):
    if(text[i].split(' ')[0][1:] in tags):
        result += '\n'.join(text[i:i+8]) + "\n"        
with open(f'./high_conf_hairpin.str', 'w') as file:
    file.write(result)

In [63]:
# for all str
'''
with open('./Data/miRNA.str','r') as file:
    text = file.read()
    lines = text.split('\n')
''';

In [64]:
# for high confidence 
with open('./high_conf_hairpin.str','r') as file:
    text = file.read()
    lines = text.split('\n')

#  Run convertor

In [76]:
def run(i):                   
    strText = lines[i:(i+7)]
    if(len(strText) == 7):
        convertor(strText, "CT_high_viridi")    
    else:
        print(strText, "not str structure")
process_map(run , range(0,len(lines),8), tqdm_class=tqdm, max_workers= mp.cpu_count() - 1, chunksize=5);

  1%|          | 1/161 [00:00<01:42,  1.56it/s]

[''] not str structure


100%|██████████| 161/161 [00:01<00:00, 128.33it/s]


In [22]:
!zip -r ct_high_viridi.zip ./CT_high_viridi

  adding: CT_high_viridi/ (stored 0%)
  adding: CT_high_viridi/aly-MIR824-3p|+|1-654|620-640.ct (deflated 76%)
  adding: CT_high_viridi/osa-MIR156f-5p|+|1-186|4-23.ct (deflated 74%)
  adding: CT_high_viridi/zma-MIR156e-5p|+|1-126|20-39.ct (deflated 72%)
  adding: CT_high_viridi/zma-MIR396e-3p|+|1-166|145-165.ct (deflated 75%)
  adding: CT_high_viridi/tae-MIR398|+|1-120|85-105.ct (deflated 73%)
  adding: CT_high_viridi/aly-MIR172a-5p|+|1-149|26-46.ct (deflated 73%)
  adding: CT_high_viridi/tae-MIR9670|+|1-78|49-69.ct (deflated 72%)
  adding: CT_high_viridi/aly-MIR173b-3p|+|1-100|69-89.ct (deflated 73%)
  adding: CT_high_viridi/osa-MIR166l-3p|+|1-117|87-107.ct (deflated 72%)
  adding: CT_high_viridi/osa-MIR167a-5p|+|1-141|14-34.ct (deflated 72%)
  adding: CT_high_viridi/zma-MIR156f-5p|+|1-167|21-40.ct (deflated 73%)
  adding: CT_high_viridi/osa-MIR3980a-5p|+|1-192|31-51.ct (deflated 74%)
  adding: CT_high_viridi/zma-MIR166d-3p|+|1-104|84-103.ct (deflated 73%)
  adding: CT_high_viridi/vvi

  adding: CT_high_viridi/tae-MIR5048|+|1-343|13-34.ct (deflated 73%)
  adding: CT_high_viridi/tae-MIR9655|+|1-122|82-102.ct (deflated 73%)
  adding: CT_high_viridi/osa-MIR156j-3p|+|1-165|108-129.ct (deflated 72%)
  adding: CT_high_viridi/tae-MIR9652-3p|+|1-124|84-103.ct (deflated 72%)
  adding: CT_high_viridi/zma-MIR172c-5p|+|1-123|4-23.ct (deflated 74%)
  adding: CT_high_viridi/osa-MIR166c-3p|+|1-125|91-111.ct (deflated 73%)
  adding: CT_high_viridi/aly-MIR173a-3p|+|1-100|69-89.ct (deflated 73%)
  adding: CT_high_viridi/zma-MIR399b-5p|+|1-122|23-43.ct (deflated 73%)
  adding: CT_high_viridi/zma-MIR159a-3p|+|1-246|207-227.ct (deflated 73%)
  adding: CT_high_viridi/osa-MIR396c-3p|+|1-141|113-133.ct (deflated 72%)
  adding: CT_high_viridi/osa-MIR408-5p|+|1-213|31-51.ct (deflated 75%)
  adding: CT_high_viridi/osa-MIR171c-3p|+|1-99|69-89.ct (deflated 73%)
  adding: CT_high_viridi/osa-MIR1429-3p|+|1-120|99-119.ct (deflated 72%)
  adding: CT_high_viridi/vvi-MIR2950-3p|+|1-107|66