In [1]:
# Folder that holds the miRBase files read by the cells below.
directory = "./Temp/miRBase_driven_data"

# Download data from miRBase

In [45]:
base = "https://www.mirbase.org/ftp/CURRENT"         
!wget {base}/miRNA.str.gz -P ./{directory}/       ; gzip -d ./{directory}/miRNA.str.gz 

--2022-04-29 22:15:30--  https://www.mirbase.org/ftp/CURRENT/miRNA.str.gz
Resolving www.mirbase.org (www.mirbase.org)... 130.88.97.249
Connecting to www.mirbase.org (www.mirbase.org)|130.88.97.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3373763 (3.2M) [application/x-gzip]
Saving to: ‘././Temp/miRBase_driven_data/miRNA.str.gz’

miRNA.str.gz          7%[>                   ] 248.00K  62.7KB/s    eta 50s    ^C


# Convertor

In [2]:
import multiprocessing as mp
import os

import numpy as np
import pandas as pd
from tqdm import tqdm, trange  # !pip install tqdm
from tqdm.contrib.concurrent import process_map

In [3]:
def fasta_to_df(path):
    """Read a FASTA-style file into a two-column DataFrame.

    Every non-empty line starting with '>' opens a new record; the following
    non-empty lines are concatenated into that record's sequence. Text that
    precedes the first '>' line is discarded.

    Parameters
    ----------
    path : str
        File to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``tag`` (header line without the leading '>') and ``data``
        (the joined sequence lines), one row per record.
    """
    with open(path, 'r') as handle:
        content = handle.read()

    headers = []
    sequences = []
    pieces = []
    for row in content.split('\n'):
        if len(row) == 0:
            continue
        if row[0] == '>':
            headers.append(row[1:])
            # Close the previous record; the very first entry is a placeholder.
            sequences.append(''.join(pieces))
            pieces = []
        else:
            pieces.append(row)
    sequences.append(''.join(pieces))

    # sequences[0] holds whatever came before the first header, so skip it.
    return pd.DataFrame({'tag': headers, 'data': sequences[1:]})

In [4]:
def isChar(c):
    """Return True unless ``c`` is one of the layout symbols ' ', '-' or '|'."""
    return c not in (' ', '-', '|')

def adjust(text,n=7):
    """Right-align ``str(text)`` in a field ``n`` characters wide.

    Values already longer than ``n`` are returned unchanged (never truncated).
    """
    return str(text).rjust(n)

def isMature(c1, c2):
    """Return True when at least one of the two symbols is an upper-case nucleotide.

    ``c1`` is tested first; ``c2`` is only looked at when ``c1`` does not qualify.
    """
    return any(symbol.isupper() and isChar(symbol) for symbol in (c1, c2))

In [5]:
def get_number(strText):
    """Count the nucleotides drawn in the five structure rows of one record.

    Rows ``strText[2]``..``strText[6]`` are read column by column. Each column
    adds one for the forward pair of rows (2/3), one for the middle row (4)
    and one for the reverse pair of rows (5/6) when they hold a nucleotide.
    """
    total = 0
    for col in range(len(strText[2])):
        forward = isChar(strText[2][col]) or isChar(strText[3][col])
        middle = isChar(strText[4][col])
        reverse = isChar(strText[5][col]) or isChar(strText[6][col])
        total += int(forward) + int(middle) + int(reverse)
    return total

In [6]:
def get5pInfo(strText):
    """Find the first upper-case run on the forward strand (rows 2 and 3).

    Columns are scanned left to right; every column that holds a nucleotide
    in row 2 or row 3 advances a 1-based counter. The run starts at the first
    column flagged by ``isMature`` and ends right before the next nucleotide
    column that is not flagged.

    Parameters
    ----------
    strText : list of str
        One structure record; only ``strText[2]`` and ``strText[3]`` are read.

    Returns
    -------
    list
        ``[has_5p, s_5p, f_5p]``: whether a run was found, and its first and
        last counter value (both 0 when there is no run).
    """
    has_5p = False
    s_5p = 0
    f_5p = 0
    counter = 0
    for i in range(len(strText[2])):
        if not (isChar(strText[2][i]) or isChar(strText[3][i])):
            continue  # layout-only column, not a nucleotide
        counter += 1
        if isMature(strText[2][i], strText[3][i]):
            if not has_5p:
                has_5p = True
                s_5p = counter
        elif has_5p:
            # First non-mature nucleotide after the run: the run ended one earlier.
            f_5p = counter - 1
            break
    else:
        # No break: the run (if any) lasted until the last nucleotide of the
        # strand. Previously the end was left at 0 in this case.
        if has_5p:
            f_5p = counter
    return [has_5p, s_5p, f_5p]

In [7]:
def get3pInfo(strText, number):
    """Find the first upper-case run on the reverse strand (rows 5 and 6).

    Columns are scanned left to right with a 1-based counter over the
    nucleotide columns of rows 5/6. A counter value ``c`` is converted to a
    sequence position as ``number - c + 1``, so the left-most reverse-strand
    nucleotide maps to ``number``.

    Parameters
    ----------
    strText : list of str
        One structure record; only ``strText[5]`` and ``strText[6]`` are read.
    number : int
        Total nucleotide count of the record (see ``get_number``).

    Returns
    -------
    list
        ``[has_3p, start, end]`` with ``start <= end`` as sequence positions.
        When no run is found both positions are ``number + 1``.
    """
    has_3p = False
    s_3p = 0
    f_3p = 0
    counter = 0
    for i in range(len(strText[6])):
        if not (isChar(strText[6][i]) or isChar(strText[5][i])):
            continue  # layout-only column, not a nucleotide
        counter += 1
        if isMature(strText[6][i], strText[5][i]):
            if not has_3p:
                has_3p = True
                s_3p = counter
        elif has_3p:
            # First non-mature nucleotide after the run: the run ended one earlier.
            f_3p = counter - 1
            break
    else:
        # No break: the run (if any) lasted until the last nucleotide of the
        # strand. Previously f_3p stayed 0, giving a start of number + 1.
        if has_3p:
            f_3p = counter
    return [has_3p, number - f_3p + 1, number - s_3p + 1]

In [8]:
#2 [FO:Forward_out]  u    --          a   --    ucua     uu   ----a  u        -             a      gcuu          - a 
#3 [FI:Forward_in ]   uguc  uugagagggg aga  gauc    ugggu  ugg     gg cUGACAGA AGAGAGUGAGCAC cacggu    ucuuagcaug c a
#4 [Mid:Middle    ]   ||||  |||||||||| |||  ||||    |||||  |||     || |||||||| ||||||||||||| ||||||    |||||||||| |  
#5 [RI:Reverse_in ]   acag  gacucucuuc ucu  uuag    acccg  acc     cC GACUGUCU UCUCUCACUCGug gugucg    agggucguac g g
#6 [RO:Reverse_out]  g    uu          c   aa    uaua     -u   acuug  -        A             c      ----          c a 

def convertor(strText, output_path = "CT"):
    """Convert one ``miRNA.str`` record into ``.ct`` file(s) under ``output_path``.

    Parameters
    ----------
    strText : list of str
        Exactly seven lines: the header (``>name (dG) ...``), one line that is
        not read here, and the five drawing rows FO, FI, Mid, RI, RO
        (forward-out, forward-in, middle, reverse-in, reverse-out).
    output_path : str
        Directory the files are written to; created when missing.

    Notes
    -----
    * Forward-strand nucleotides are numbered upwards from 1, reverse-strand
      nucleotides downwards from the total returned by ``get_number``.
    * A column that matches none of the patterns below is skipped silently.
    * One file is written per upper-case run found by ``get5pInfo`` /
      ``get3pInfo``; when both exist the names carry ``-5p`` / ``-3p``.
      Nothing is written when neither run is found.
    """
    # Pure-Python replacement for `!mkdir -p`: no shell is spawned per record
    # and the function no longer depends on IPython.
    os.makedirs(output_path, exist_ok=True)

    index  = []
    values = []
    nucludid = []
    def add(i, v, n):
        # i: position of the nucleotide, v: position of its partner (0 = none), n: the symbol
        index.append(i)
        values.append(v)
        nucludid.append(n)

    f_counter = 1   # next position on the forward strand
    r_counter = 0   # nucleotides already consumed on the reverse strand

    number = get_number(strText)
    [FO, FI, Mid, RI, RO] = strText[2:]
    # main loop: classify every drawing column
    for i in range(len(Mid)):

        # nucleotide only on the outer forward row, '-' on the outer reverse row
        if(isChar(FO[i]) and FI[i] == " " and Mid[i] == " " and RI[i] == " "  and RO[i] == "-" ):
            add(f_counter, 0, FO[i])
            f_counter += 1

        # '-' on the outer forward row, nucleotide only on the outer reverse row
        elif(FO[i] == "-" and FI[i] == " " and Mid[i] == " " and RI[i] == " "  and isChar(RO[i])):
            add(number - r_counter, 0, RO[i])
            r_counter += 1

        # nucleotides on both outer rows, nothing in between: unpaired on each strand
        elif(isChar(FO[i]) and not isChar(FI[i]) and not isChar(Mid[i]) and not isChar(RI[i]) and isChar(RO[i])):
            add(f_counter, 0, FO[i])
            add(number - r_counter, 0, RO[i])
            f_counter += 1
            r_counter += 1

        # inner rows joined by '|': a pair, each side records the other's position
        elif(not isChar(FO[i]) and isChar(FI[i]) and Mid[i] == "|" and isChar(RI[i]) and not isChar(RO[i])):
            add(f_counter, number - r_counter, FI[i])
            add(number - r_counter, f_counter, RI[i])
            f_counter += 1
            r_counter += 1

        # nucleotides on FI, Mid and RI: the Mid symbol takes the forward position after FI
        elif(not isChar(FO[i]) and isChar(FI[i]) and isChar(Mid[i]) and isChar(RI[i]) and not isChar(RO[i])):
            add(f_counter, 0, FI[i])
            add(f_counter + 1, 0, Mid[i])
            add(number - r_counter, 0, RI[i])
            f_counter += 2
            r_counter += 1

        # inner rows populated but no '|' between them: unpaired on each strand
        elif(not isChar(FO[i]) and isChar(FI[i]) and Mid[i] == " " and isChar(RI[i]) and not isChar(RO[i])):
            add(f_counter, 0, FI[i])
            add(number - r_counter, 0, RI[i])
            f_counter += 1
            r_counter += 1

        # nucleotide only on the middle row: numbered on the forward counter
        elif(not isChar(FO[i]) and not isChar(FI[i]) and isChar(Mid[i]) and not isChar(RI[i]) and not isChar(RO[i])):
            add(f_counter, 0, Mid[i])
            f_counter += 1

    # order all three lists by position
    inds = np.array(index).argsort()
    index = np.array(index)[inds]
    values = np.array(values)[inds]
    nucludid = np.array(nucludid)[inds]

    # body: one line per nucleotide
    rows = []
    for i in range(len(index)):
        rows.append(f"{adjust(index[i],6)} {nucludid[i]} {adjust(i,6)} {adjust((i+2)%(number+1),6)} {adjust(values[i],6)} {adjust(index[i],7)}\n")
    text = "".join(rows)

    # 5p
    [has_5p, s_5p, f_5p] = get5pInfo(strText)
    # 3p
    [has_3p, s_3p, f_3p] = get3pInfo(strText, number)

    # common info: "(...)" in the header holds dG, the first token (minus '>') the name
    deltaG = strText[0].split('(')[1].split(')')[0]
    accession = strText[0].split(' ')[0][1:]

    common_header = f"{adjust(number,6)} dG ={adjust(deltaG,10)} {accession}"

    def write_ct(label):
        # The same label is appended to the file name and to the first line of the file.
        with open(f"{output_path}/{accession}{label}.ct", 'w') as file:
            file.write(f"{common_header}{label}\n" + text)

    span_5p = f"|+|1-{number}|{s_5p}-{f_5p}"
    span_3p = f"|+|1-{number}|{s_3p}-{f_3p}"
    if(has_3p and has_5p):
        write_ct("-5p" + span_5p)
        write_ct("-3p" + span_3p)
    elif(has_5p):
        write_ct(span_5p)
    elif(has_3p):
        write_ct(span_3p)

In [9]:
# Smoke test: show the first seven lines of miRNA.str and convert that record.
with open(f'{directory}/miRNA.str','r') as file:
    lines = file.read().split('\n')
strText = lines[:7]
print(*strText, sep='\n')
convertor(strText,"CT_high_viridi")

>cel-let-7 (-42.90)   [cel-let-7-5p:17-38] [cel-let-7-3p:60-81]

------uaca    gga             U              ---  aaua 
          cugu   uccggUGAGGUAG AGGUUGUAUAGUUu   gg    u
          ||||   ||||||||||||| ||||||||||||||   ||     
          gaca   aggCCAUUCCAUC UUUAACGUAUCaag   cc    u
agcuucucaa    --g             U              ugg  acca 


# Select miRNA.str records (Viridiplantae hairpins that are not high-confidence)

In [19]:
# Hairpin name = first space-separated token of the FASTA tag.
# NOTE(review): needs `df`, which is assigned in the hairpin-loading cell further down; run that cell first.
temp = df['tag'].str.split(' ').str[0]

In [23]:
# Display the extracted hairpin names.
temp

32           cel-mir-62
46           cel-mir-78
92           hsa-mir-95
154       mmu-mir-138-2
156         mmu-mir-141
              ...      
38582     smc-mir-12459
38583     smc-mir-12460
38584     smc-mir-12461
38585    hsa-mir-9902-2
38586     gga-mir-1784b
Name: tag, Length: 35269, dtype: object

In [22]:
# Look up one hairpin by name.
# NOTE(review): the saved output has no rows for this name; possibly 'mdm-MIR156l' was meant — confirm.
df.loc[temp == 'dm-MIR156l']

Unnamed: 0,tag,data,organism,length


In [10]:
# Load all hairpins and the high-confidence subset.
hairpin = fasta_to_df(f'./{directory}/hairpin.fa')
print(hairpin.shape)
high_hairpin = fasta_to_df(f'./{directory}/hairpin_high_conf.fa')
print(high_hairpin.shape)
# Keep the hairpins that are NOT in the high-confidence file. `.copy()` gives an
# independent frame, so adding columns below does not write into a slice of the
# original (avoids SettingWithCopyWarning).
hairpin = hairpin[~hairpin['tag'].isin(high_hairpin['tag'])].copy()
# Organism code = text before the first '-' of the tag. A fixed `[:3]` slice
# truncates codes longer than three characters, which then no longer match the
# `organism` column of organisms.txt.
hairpin['organism'] = hairpin['tag'].str.split('-').str[0]
hairpin['length'] = hairpin['data'].str.len()
print(hairpin.shape)
df = hairpin
hairpin.head(2)

(38589, 2)
(3320, 2)
(35269, 4)


Unnamed: 0,tag,data,organism,length
32,cel-mir-62 MI0000033 Caenorhabditis elegans mi...,GUGAGUUAGAUCUCAUAUCCUUCCGCAAAAUGGAAAUGAUAUGUAA...,cel,58
46,cel-mir-78 MI0000049 Caenorhabditis elegans mi...,AAUAAAAUAUAUUGUUUCAUAGUGUCCGUAAAAUAACUAGAUUUAU...,cel,96


In [11]:
# Load the tab-separated organism table and strip '#' from its column names.
organism = pd.read_csv(f'./{directory}/organisms.txt', sep='\t')
organism = organism.rename(columns=lambda name: name.replace('#', ''))
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [12]:
# Keep the organisms whose `tree` string contains "Viridiplantae;".
is_viridiplantae = organism['tree'].map(lambda lineage: "Viridiplantae;" in lineage)
selectedTree = organism.loc[is_viridiplantae]
print(selectedTree.shape)
selectedTree.head(2)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352


In [13]:
# Restrict the hairpins to the organisms selected above.
in_selected_tree = df['organism'].isin(selectedTree['organism'])
selected = df.loc[in_selected_tree]
print(selected.shape)
selected.head(2)

(8455, 4)


Unnamed: 0,tag,data,organism,length
168,ath-MIR156a MI0000178 Arabidopsis thaliana miR...,CAAGAGAAACGCAAAGAAACUGACAGAAGAGAGUGAGCACACAAAG...,ath,123
169,ath-MIR156b MI0000179 Arabidopsis thaliana miR...,GCUAGAAGAGGGAGAGAUGGUGAUUGAGGAAUGCAACAGAGAAAAC...,ath,183


In [14]:
# Names (first token of the FASTA tag) of the selected hairpins.
tags = list(selected['tag'].apply(lambda x : x.split(' ')[0]))
# Set for constant-time membership tests; the list is kept for later inspection.
tag_set = set(tags)
with open(f'./{directory}/miRNA.str', 'r') as file:
    text = file.read().split('\n')

# Walk the file in 8-line steps; the first line of each step is the record
# header, whose first token without the leading '>' is the hairpin name.
kept = []
for i in range(0,len(text),8):
    if(text[i].split(' ')[0][1:] in tag_set):
        kept.append('\n'.join(text[i:i+8]) + "\n")
result = ''.join(kept)
with open('./not_high_conf_hairpin.str', 'w') as file:
    file.write(result)

In [15]:
# Read the filtered (not high-confidence) records back as a list of lines;
# the worker function below slices this global list.
with open('./not_high_conf_hairpin.str','r') as handle:
    lines = handle.read().split('\n')

#  Run convertor

In [16]:
def run(i):
    """Convert the record that starts at line ``i`` of the global ``lines`` list.

    A slice shorter than seven lines (e.g. at the end of the file) is not
    converted; it is printed together with a short notice instead.
    """
    record = lines[i:(i+7)]
    if len(record) != 7:
        print(record, "not str structure")
        return
    convertor(record, "CT_not_high_viridi")
# Leave one core free, but never ask for fewer than one worker:
# cpu_count() - 1 is 0 on a single-core machine, which is not a valid pool size.
n_workers = max(1, mp.cpu_count() - 1)
process_map(run , range(0,len(lines),8), tqdm_class=tqdm, max_workers=n_workers, chunksize=5);

 98%|█████████▊| 8316/8456 [00:44<00:00, 185.01it/s]

[''] not str structure


100%|██████████| 8456/8456 [00:45<00:00, 186.99it/s]


In [None]:
# NOTE(review): unfinished cell — `zip` is called without an archive name or inputs.
# Presumably meant to archive the generated CT directories; TODO complete or remove.
!zip 