# Common

In [1]:
import numpy as np
import pandas as pd
from tqdm.contrib.concurrent import process_map
#from tqdm import tqdm  # !pip install tqdm
from tqdm.notebook import tqdm
tqdm.pandas()
#from tqdm import trange
import multiprocessing as mp
import shutil
import glob
import os
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [2]:
!mkdir -p Temp

In [3]:
def bracket_row(row):    
    s = row['data']
    index = min(s.find('.'), s.find('('))
    data = row['data']
    row['data'] = data[0:index]
    row['bracket'] = data[index:]
    return row

In [4]:
def adjust(text,n=7):
    text = str(text)    
    return " " * (n - len(text)) + text

In [5]:
def bracket_to_ct(tag, data, bracket, deltaG, negative_deltaG=True):    
    deltaG = deltaG.replace('(','').replace(')','')
    deltaG = float(deltaG)
    if(deltaG > 0 and negative_deltaG ): # negetive?!
        deltaG = -1 * deltaG
    stack = []
    index = np.zeros((len(bracket)), dtype = int)
    values = np.zeros((len(bracket)), dtype = int)
    for i in range(len(bracket)):
        index[i] = i + 1
        if(bracket[i] == '.'):
            values[i] = 0
        elif(bracket[i] == '('):
            stack.append(i)
        elif(bracket[i] == ')'):
            if(len(stack) == 0 ):
                print('structure error!')
            values[stack[-1]] = i + 1
            values[i]  = stack[-1] + 1
            stack.pop()
        else:
            print('structure error!')
    if(len(stack) != 0 ):
        print('structure error!')
    # body    
    ct = f"{adjust(len(data),6)} dG ={adjust(deltaG,10)} {tag}\n"   
    for i in range(len(bracket)):
        ct += f"{adjust(index[i],6)} {data[i]} {adjust(i,6)} {adjust((i+2)%(len(data)+1),6)} {adjust(values[i],6)} {adjust(index[i],7)}\n"
    return ct

In [6]:
def fasta_to_df(path):
    with open(path, 'r') as file:
        text = file.read()
    lines = [line for line in text.split('\n') if len(line) > 0]
    s = ''
    tags = []
    data = []
    for l in lines:
        if(l[0]=='>'):
            tags.append(l)        
            data.append(s)
            s = ''
        else:
            s += l    
    data.append(s)
    df = pd.DataFrame(
            {
                'tag': tags,
                'data': data[1:]
            })
    df['tag'] = df['tag'].apply(lambda x: x[1:])    
    return df

In [7]:
def df_to_fasta(df, path):
    lines = []
    df.apply(lambda row: lines.append(f">{row['tag']}\n{row['data']}\n"),axis=1)
    with open(path,'w') as file:
        file.write(''.join(lines))

In [8]:
def reformat(path):
    return path.replace('(','_').replace(')','_').replace('.','').replace(':','_')

In [9]:
def reformatCT(path):
    with open(path, 'r') as file:
        text = file.read()
    text = [l for l in text.split('\n') if len(l) > 0 ] # remove blank lines
    text = '\n'.join(text)
    text = text.replace("\t"," ")
    while("  " in text):
        text = text.replace("  ", " ")
    lines = [l for l in text.split('\n')]
    for i in range(len(lines)):
        if(lines[i][0] == " "):
            lines[i] = lines[i][1:]
        if(lines[i][-1] == " "):
            lines[i] = lines[i][:-1]
    text = '\n'.join(lines)
    return text

In [10]:
def get_ct_data(ct):
    ct = "\n".join(ct.split('\n')[1:])
    df = pd.read_csv(StringIO(ct), sep=" ", header=None)               
    nucleotide = df.iloc[:,1]
    index = df.iloc[:,5]
    values = df.iloc[:,4]
    return [nucleotide, index, values]

In [11]:
def ct2dot_bracket(path):
    [nucleotide, index, values] = get_ct_data(reformatCT(path))
    text = ''.join(nucleotide) + "\n"
    watch = []
    for i, v in zip(index,values):
        if(v == 0):
            text += '.'
        else:
            if( v not in watch):
                text += '('
                watch.append(i)
            if( v in watch):
                text += ')'
    return text

In [12]:
def is_nested(index, values):
    max_value = max(index) + 10 # inf
    for i, v in zip(index, values):
        if(v < max_value and v != 0):
            max_value  = v
        if(i >= max_value):
            max_value = max(index) + 10 # inf
        if(v > max_value):
            return False               
    return True

In [13]:
'''ct = reformatCT('./secondary_structure/spot_rna/AMWY020598281_2832-3256_+_/AMWY020598281_2832-3256_+_.ct')
[nucleotide, index, values] = get_ct_data(ct)
print(is_nested( index,  values))
''';

### rename tag of input genome to new tag id

# Download dataset

In [13]:
'''
from Bio import Entrez
Entrez.email = "abolhasani.eliya@gmail.com"     
with Entrez.esearch(db='nucleotide', term="Arabidopsis thaliana") as handle:
    result = Entrez.read(handle)

print(result)
genome_ids = result['IdList']

for genome_id in genome_ids:
    print(genome_id)
    record = Entrez.efetch(db="nucleotide", id=genome_id, rettype="fasta", retmode="text")        
    with open(f'{genome_id}.fasta', 'w') as f:
        f.write(record.read())
    break
''';
'''
from Bio import Entrez
Entrez.email = "abolhasani.eliya@gmail.com"     
record = Entrez.efetch(db="nucleotide", id="NC_054143.4", rettype="fasta", retmode="text")        
with open(f'data.fasta', 'w') as f:
    f.write(record.read())
''';

In [14]:
!wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/439/995/GCA_000439995.3_AzaInd2.1/GCA_000439995.3_AzaInd2.1_genomic.fna.gz

--2021-11-14 18:10:04--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/439/995/GCA_000439995.3_AzaInd2.1/GCA_000439995.3_AzaInd2.1_genomic.fna.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 130.14.250.7, 2607:f220:41e:250::11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85647577 (82M) [application/x-gzip]
Saving to: ‘GCA_000439995.3_AzaInd2.1_genomic.fna.gz’

CA_000439995.3_AzaI  27%[====>               ]  22.82M  7.26MB/s    eta 8s     ^C


In [263]:
!gzip -d ./GCA_000439995.3_AzaInd2.1_genomic.fna.gz

# Download data from Mirbase

In [28]:
directory = 'miRBase_driven_data'

In [18]:
base = "https://www.mirbase.org/ftp/CURRENT"        
!rm -r {directory}
!mkdir -p {directory}
!wget {base}/aliases.txt.gz -P ./{directory}/       ; gzip -d ./{directory}/aliases.txt.gz 
!wget {base}/hairpin.fa.gz -P ./{directory}/           ; gzip -d ./{directory}/hairpin.fa.gz 
!wget {base}/hairpin_high_conf.fa.gz -P ./{directory}/ ; gzip -d ./{directory}/hairpin_high_conf.fa.gz 
!wget {base}/mature.fa.gz -P ./{directory}/            ; gzip -d ./{directory}/mature.fa.gz 
!wget {base}/mature_high_conf.fa.gz -P ./{directory}/  ; gzip -d ./{directory}/mature_high_conf.fa.gz
!wget {base}/miRNA.str.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.str.gz 
!wget {base}/miRNA.xls.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.xls.gz 
!wget {base}/organisms.txt.gz -P ./{directory}/        ; gzip -d ./{directory}/organisms.txt.gz

--2021-11-14 19:08:31--  https://www.mirbase.org/ftp/CURRENT/aliases.txt.gz
Resolving www.mirbase.org (www.mirbase.org)... 130.88.97.249
Connecting to www.mirbase.org (www.mirbase.org)|130.88.97.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480536 (469K) [application/x-gzip]
Saving to: ‘./miRBase_driven_data/aliases.txt.gz’


2021-11-14 19:08:32 (469 KB/s) - ‘./miRBase_driven_data/aliases.txt.gz’ saved [480536/480536]

--2021-11-14 19:08:33--  https://www.mirbase.org/ftp/CURRENT/hairpin.fa.gz
Resolving www.mirbase.org (www.mirbase.org)... 130.88.97.249
Connecting to www.mirbase.org (www.mirbase.org)|130.88.97.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1547350 (1.5M) [application/x-gzip]
Saving to: ‘./miRBase_driven_data/hairpin.fa.gz’


2021-11-14 19:08:34 (995 KB/s) - ‘./miRBase_driven_data/hairpin.fa.gz’ saved [1547350/1547350]

--2021-11-14 19:08:35--  https://www.mirbase.org/ftp/CURRENT/hairpin_high_conf.fa.gz
Resol

In [29]:
df = fasta_to_df(f'./{directory}/mature.fa')
#df = fasta_to_df('./Data/mature_high_conf.fa')
df['organism'] = df['tag'].apply(lambda x: x[:3])
print(df.shape)
df.head(2)

(48885, 3)


Unnamed: 0,tag,data,organism
0,cel-let-7-5p MIMAT0000001 Caenorhabditis elega...,UGAGGUAGUAGGUUGUAUAGUU,cel
1,cel-let-7-3p MIMAT0015091 Caenorhabditis elega...,CUAUGCAAUUUUCUACCUUACC,cel


In [30]:
organism = pd.read_csv(f'./{directory}/organisms.txt',sep='\t')
organism.columns = [c.replace('#','') for c in organism.columns] # remove sharp from columns
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [31]:
items = list(organism['tree'].unique())
items.sort(key=len)
items

['Viruses;',
 'Mycetozoa;',
 'Alveolata;',
 'Metazoa;Porifera;',
 'Metazoa;Cnidaria;',
 'Viridiplantae;Chlorophyta;',
 'Viridiplantae;Embryophyta;',
 'Viridiplantae;Coniferophyta;',
 'Viridiplantae;Magnoliophyta;',
 'Metazoa;Bilateria;Deuterostoma;',
 'Chromalveolata;Heterokontophyta;',
 'Metazoa;Bilateria;Ecdysozoa;Nematoda;',
 'Metazoa;Bilateria;Lophotrochozoa;Annelida;',
 'Metazoa;Bilateria;Lophotrochozoa;Nemertea;',
 'Metazoa;Bilateria;Lophotrochozoa;Mollusca;',
 'Viridiplantae;Magnoliophyta;monocotyledons;',
 'Metazoa;Bilateria;Deuterostoma;Hemichordata;',
 'Metazoa;Bilateria;Deuterostoma;Echinodermata;',
 'Metazoa;Bilateria;Lophotrochozoa;Brachiopoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Hexapoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Crustacea;',
 'Metazoa;Bilateria;Lophotrochozoa;Platyhelminthes;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Chelicerata;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Mandibulata;',
 'Viridiplantae;Magnoliophyta;eudicotyledons;Poaceae;',
 'M

In [32]:
selectedTree = organism[organism['tree'].apply(lambda x: "Viridiplantae;" in x)]
print(selectedTree.shape)
selectedTree.head(5)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352
68,ppt,PPT,Physcomitrella patens,Viridiplantae;Embryophyta;,3218
69,smo,SMO,Selaginella moellendorffii,Viridiplantae;Embryophyta;,88036
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702


In [33]:
selected = df[df['organism'].isin(selectedTree['organism'])]
print(selected.shape)
selected.head()

(10414, 3)


Unnamed: 0,tag,data,organism
316,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,ath
317,ath-miR156a-3p MIMAT0031865 Arabidopsis thalia...,GCUCACUGCUCUUUCUGUCAGA,ath
318,ath-miR156b-5p MIMAT0000167 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,ath
319,ath-miR156b-3p MIMAT0031866 Arabidopsis thalia...,UGCUCACCUCUCUUUCUGUCAGU,ath
320,ath-miR156c-5p MIMAT0000168 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,ath


In [34]:
df_to_fasta(selected,'./Temp/mature_microRNA_queries.fasta')

# Remove redundant

## cdhit-est

In [41]:
!cdhit/cd-hit-est -i ./Temp/mature_microRNA_queries.fasta  -o ./Temp/NR_mature_microRNA_queries.fasta \
    -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS 0 \
    -AS 99999999 -s 0 -S 0 

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 23 2021, 21:45:39
Command: cdhit/cd-hit-est -i
         ./Temp/mature_microRNA_queries.fasta -o
         ./Temp/NR_mature_microRNA_queries.fasta -c 1 -r 0 -G 1
         -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS 0 -AS 99999999
         -s 0 -S 0

Started: Sun Nov 14 19:23:14 2021
                            Output                              
----------------------------------------------------------------
total seq: 10414
longest and shortest : 28 and 17
Total letters: 222978
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 1 X 12M = 12M
Table           : 1 X 16M = 16M
Miscellaneous   : 0M
Total           : 30M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 96149440

comparing sequences from          0  to      10414
..........    10000  finished       5817  clusters

    10414  finished       6028  clusters

Approximate

## reformat

In [42]:
with open('./Temp/NR_mature_microRNA_queries.fasta.clstr','r') as file:
    text = file.read()
lines = [line for line in text.split('\n') if len(line) > 0]
cluster = []
seqid = []
last_cluster = ""
for l in lines:
    if(l[0]=='>'):        
        last_cluster = l.replace('>Cluster ',"C")
    else:        
        cluster.append(last_cluster)
        seqid.append(l.split(', >')[1].split('...')[0])                
seq2cluster = pd.DataFrame({'seqid': seqid,'cluster': cluster})
print(seq2cluster.shape)
seq2cluster.head(2)    

(10414, 2)


Unnamed: 0,seqid,cluster
0,cst-miR11332,C0
1,stu-miR7994b-5p,C1


In [43]:
df = fasta_to_df("./Temp/mature_microRNA_queries.fasta")
df['accession'] = df['tag'].apply(lambda x : x.split(' ')[0])
seq2cluster = pd.merge(df,seq2cluster,how="inner",left_on='accession',right_on="seqid")[['cluster','seqid','tag']]
print(seq2cluster.shape)
display(seq2cluster.head(2))
seq2cluster.to_csv('./Temp/seq2cluster.csv',index=False)

(10414, 3)


Unnamed: 0,cluster,seqid,tag
0,C5495,ath-miR156a-5p,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...
1,C1199,ath-miR156a-3p,ath-miR156a-3p MIMAT0031865 Arabidopsis thalia...


In [44]:
# todo: sorted first by cluster then by seqid
seq2cluster.sort_values("cluster").head(2)

Unnamed: 0,cluster,seqid,tag
9422,C0,cst-miR11332,cst-miR11332 MIMAT0044622 Cucumis sativus miR1...
7002,C1,stu-miR7994b-5p,stu-miR7994b-5p MIMAT0031188 Solanum tuberosum...


In [45]:
df = fasta_to_df("./Temp/NR_mature_microRNA_queries.fasta")
df['tag'] = df['tag'].apply(lambda x : x.split(' ')[0])
df = pd.merge(df,seq2cluster,how="inner",left_on='tag',right_on="seqid")[['cluster','data']]

lines = []
df.apply(lambda row: lines.append(f">{row['cluster']}\n{row['data']}\n"),axis=1)
print(df.shape)
with open('./Temp/BLASTn_queries.fasta','w') as file:
    file.write(''.join(lines))

(6028, 2)


# BlastN

!sudo apt-get install ncbi-blast+


In [46]:
!makeblastdb -in input_genome.fna \
             -dbtype nucl \
             -out ./Temp/blastn_database



Building a new DB, current time: 11/14/2021 19:23:46
New DB name:   /home/jupyter/plant_microRNA_prediction/Temp/blastn_database
New DB title:  input_genome.fna
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 126142 sequences in 9.84965 seconds.


In [17]:
header = 'qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'

In [18]:
!blastn -query ./Temp/BLASTn_queries.fasta \
        -out ./Temp/BLASTn_result \
        -num_threads {mp.cpu_count()} \
        -db ./Temp/blastn_database \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [19]:
df_blastn = pd.read_csv('./Temp/BLASTn_result', sep='\t',header=None)
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(326849, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C5495,AMWY02099822.1,1,20,1769,1750,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,0,100.0,1/-1,1,-1,minus,100,100,20,3308
1,C5495,AMWY02082313.1,1,20,5954,5973,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,8471


In [20]:
threshold = 4
df_blastn['Nonconformity'] = df_blastn['qlen'] - (abs(df_blastn['qend'] - df_blastn['qstart']) + 1) + df_blastn['gaps'] + df_blastn['mismatch']
df_blastn = df_blastn[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(80217, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C5495,AMWY02099822.1,1,20,1769,1750,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,100.0,1/-1,1,-1,minus,100,100,20,3308,0
1,C5495,AMWY02082313.1,1,20,5954,5973,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,100.0,1/1,1,1,plus,100,100,20,8471,0


In [21]:
# remore redundancy and hold best one base of Nonconformity value
df_blastn = df_blastn.sort_values(["Nonconformity", "evalue"], ascending = (True, True))
df_blastn = df_blastn.drop_duplicates(subset=['sseqid','sstart', 'send','sstrand'], keep='first')
df_blastn.to_csv('./Temp/filtered_out_blastn.csv')
print(df_blastn.shape)

(66445, 28)


# Result of the blastn to bed file

In [22]:
flanking_value = 200
df = df_blastn[['sseqid', 'sstart', 'send', 'sstrand','slen']]
df['ones'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ones'] = 1


In [23]:
def switch(row):
    if(row['sstart'] > row['send']):        
        temp = row['sstart']
        row['sstart'] = row['send']
        row['send'] = temp
    return row
df = df.apply(lambda row: switch(row), axis=1)

In [24]:
def convert(inp):
    if(inp == "plus"):
        return "forward"
    if(inp == "minus"):
        return "reverse"
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['strand'] = df['sstrand'].apply(lambda x: convert(x))

In [25]:
def convert2sign(inp):
    if(inp == "plus"):
        return "+"
    if(inp == "minus"):
        return "-"
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['sign'] = df['sstrand'].apply(lambda x: convert2sign(x))

In [26]:
df['hit_length'] = df.apply(lambda row: abs(row['send'] - row['sstart']) + 1 ,axis=1)

## convert sstart and send from location to index (range)

In [27]:
df['sstart'] = df['sstart'].apply(lambda x: x - 1)

In [28]:
df['downstream_flanking'] = df['sstart'].apply(lambda x:  flanking_value if x > flanking_value else x)

In [29]:
df['upstream_flanking'] = df.apply(lambda row:  flanking_value if (row['send']+flanking_value) <= row['slen'] else row['slen'] - row['send'],axis=1)

In [30]:
df['hit_start'] = df.apply(lambda row: row['downstream_flanking'] if row['sign'] == "+" else row['upstream_flanking'],axis=1)

In [31]:
df['hit_end'] = df.apply(lambda row: row['downstream_flanking'] + row['hit_length'] if row['sign'] == "+" else row['upstream_flanking'] + row['hit_length'],axis=1)

In [32]:
df['sstart'] = df['sstart'].apply(lambda x: max(x - flanking_value, 0))
df['send'] = df.apply(lambda row: min(row['send'] + flanking_value , row['slen']),axis=1)

In [33]:
df['tag'] = df.apply(lambda row: f">{row['sseqid']}:{row['sstart']}-{row['send']}({row['sign']})",axis=1)
df['reformated_tag'] = df['tag'].apply(lambda t: reformat(t))
df[['tag', 'reformated_tag', 'hit_start', 'hit_end']].to_csv('./Temp/hit_index_info.csv')#, index=False)

In [34]:
df['location_tag'] = df.apply(lambda row: f">{row['sseqid']}|{row['sign']}|{row['sstart'] + 1}-{row['send']}|{row['hit_start']+1}-{row['hit_end']}",axis=1)
df[['location_tag']].to_csv('./Temp/pipe_seprated_location_list.csv',index=False)

In [35]:
df[['sseqid','sstart','send','strand','ones', 'sign']].to_csv('./Temp/extension_index.bed', 
        index=False, header=False, sep="\t")

# Extention


In [36]:
# !sudo apt-get install bedtools

In [37]:
!bedtools getfasta -fi ./input_genome.fna -fo ./Temp/extended_original.txt -s -bed ./Temp/extension_index.bed
!rm input_genome.fna.fai

index file ./input_genome.fna.fai not found, generating...


In [34]:
# todo: remove duplicated
'''
df = fasta_to_df("./Temp/extended.txt")
df = df.drop_duplicates(subset=['tag'], keep='first')
df_to_fasta(df,"./Temp/extended.txt")
len(df['tag'].unique())
''';

# Convert hit region to upper case and other region to lower case

In [38]:
ext = fasta_to_df('./Temp/extended_original.txt')
info = pd.read_csv('./Temp/hit_index_info.csv')
info['tag'] = info['tag'].apply(lambda x: x[1:])
print(info.shape)
info.head(2)

(66445, 5)


Unnamed: 0.1,Unnamed: 0,tag,reformated_tag,hit_start,hit_end
0,132836,AMWY02059828.1:2832-3256(+),>AMWY020598281_2832-3256_+_,200,224
1,300170,AMWY02004761.1:1853-2277(+),>AMWY020047611_1853-2277_+_,200,224


In [39]:
ext = ext.sort_values(by=['tag']).reset_index()
ext['help_tag'] = ext.apply(lambda r: r['tag']+ str(r.name),axis=1)
del ext['tag']

info = info.sort_values(by=['tag']).reset_index()
info['help_tag'] = info.apply(lambda row: row['tag']+ str(row.name),axis=1)
def redefined_tag(row):
    tag = row['tag']
    [sstart, send] = tag.split(':')[-1].split('(')[0].split('-')
    sstart = int(sstart) + 1
    sign = tag.split('(')[-1].split(')')[0]    
    return f"{tag.split(':')[0]}|{sstart}-{send}|{row['hit_start']+1}-{row['hit_end']}|{sign}"
info['tag'] = info.apply(lambda row: redefined_tag(row),axis=1)
ext = pd.merge(ext,info,how='inner', on='help_tag')

def emphasis_hit(row):
    seq = list(row['data'].lower())            
    s = row['hit_start']
    e = row['hit_end']
    seq[s:e] = list(''.join(seq[s:e]).upper())    
    return ''.join(seq)
    
ext['data'] = ext.apply(lambda row: emphasis_hit(row),axis=1)
df_to_fasta(ext[['tag','data']],"./Temp/extended_modified.txt")

# Extended validation

In [None]:
df_blastn['hit'] = df_blastn['sseq'].apply(lambda x: x.replace('-', ''))
info = pd.read_csv('./Temp/hit_index_info.csv')
ext = fasta_to_df('./Temp/extended2.txt')

counter = 0
for index in df_blastn.index:
    hit = df_blastn['hit'][index]
    row = info[info['Unnamed: 0']== index].reset_index()
    tag = row['tag'][0][1:]
    hs = row['hit_start'][0]
    he = row['hit_end'][0]        
    tag = tag.replace('(+)',f"|{hs}-{he}(+)")
    tag = tag.replace('(-)',f"|{hs}-{he}(-)")            
    seq = ext[ext['tag']==tag]['data'].iloc[0]
    seq = ext[ext['tag']==tag]['data'].iloc[0]    
    if(seq[hs:he] != hit):
        print(tag, df_blastn['slen'][index])
        print(seq[hs:he])
        print(hit)
        print('\n\n')
        counter += 1                

In [178]:
df = pd.read_csv('check1.csv')
df.columns = ['tag', 'data']
def do(x):
    x = x[1:]
    x = x.split('|')
    s = int(x[2].split('-')[0]) - 1
    e = x[2].split('-')[1]
    return f"{x[0]}:{s}-{e}({x[1]})"
df['tag'] = df['tag'].apply(lambda x: do(x))
df.head(2)

Unnamed: 0,tag,data
0,AMWY02000003.1:4084-4506(+),ATACAATTGTCACATagtttacattattaatttccgcttaatttat...
1,AMWY02000003.1:4391-4810(+),ttctaattgcaaatttatgttatatttttaaatagaaaggGAGATT...


In [176]:
for t in df['tag']:
    check_seq = df[df['tag']==t]['data'].iloc[0]
    ext_seq = ext[ext['tag']==t]['data'].iloc[0]
    if(check_seq != ext_seq):
        print('error')        

In [167]:
ext = fasta_to_df('./Temp/extended.txt')
ext.head(2)

Unnamed: 0,tag,data
0,AMWY02059828.1:2832-3256(+),AAAGAATCAGCAATGGAAAAATAACCGGTTCTTAATTCAGcataac...
1,AMWY02004761.1:1853-2277(+),actaataatgCATGGCCATATATATCAAATCTACCATATgccattt...


# RNA 2d prediction

## Mfold

In [14]:
'''
# installation
!wget http://www.unafold.org/download/mfold-3.6.tar.gz
!tar -xvf ./mfold-3.6.tar.gz; rm ./mfold-3.6.tar.gz
%cd ./mfold-3.6
!./configure
!make
!make install
%cd ..
!sudo apt install texlive-font-utils
''';

In [15]:
#todo : add all hyperparameter(options) to GUI

In [59]:
counter = 0
base = "./secondary_structure/mfold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df('./Temp/extended_modified.txt')

for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/SEQ.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1    
    if(counter >= 2000):
        break

In [60]:
%%capture
remove_lock = False
def run_mfold(tag):
    tag = reformat(tag)
    %cd {base + tag}
    !mfold  SEQ="SEQ.FASTA" T=20 MAX=2    
    if(not remove_lock):
        !find . -not -name "*.ct" -not -name "*.pdf" -not -name "*SEQ.FASTA" -not -type d -delete
    %cd ../../..

if __name__ == '__main__':        
    pool = mp.Pool(mp.cpu_count())  
    pool.map(run_mfold, df['tag'].iloc[:2000])

In [61]:
'''
base = "secondary_structure/mfold/"
for directory in glob.glob(f"{base}*"):    
    tag = directory[len(base):]
    ct_files = glob.glob(f'{directory}/*.ct')        
    try:
        ct_files.remove(f'{base}{tag}/SEQ.ct')
    except:
        print(directory)
        print(ct_files)
        print("*****************")
    for file in ct_files:        
        shutil.copy(file, './1.ct')
        #dot = ct2dot_bracket('./1.ct')
        #dot = dot.split('\n')
        #with open('./2.ct', 'w') as stream:
            #stream.write(bracket_to_ct(tag, dot[0] , dot[1] , "(0)"))        
        #ct1 = '\n'.join(reformatCT('./1.ct').split('\n')[1:])
        #ct2 = '\n'.join(reformatCT('./2.ct').split('\n')[1:])
        #if(ct1 != ct2):
            #print(file)
        ct = reformatCT('./1.ct')
        [nucleotide, index, values] = get_ct_data(ct)        
        #print(is_nested( index,  values))
        if(not is_nested( index,  values)):
            print("************")             
'''

## Mxfold2

In [57]:
#!wget https://github.com/keio-bioinformatics/mxfold2/releases/download/v0.1.1/mxfold2-0.1.1.tar.gz
#!pip3 install mxfold2-0.1.1.tar.gz
#!rm mxfold2-0.1.1.tar.gz

In [58]:
!mxfold2 predict ./extended.txt > secondary_structure/mxfold2_result.txt

>AMWY02059828.1:2832-3256(+)
AAAGAATCAGCAATGGAAAAATAACCGGTTCTTAATTCAGcataacaaattattcaattataatatagcTGTAAAAGAAATCTAAGTCTATTTGATATAGATCGGAATTTacgcaaattaaaaatttccaaataaGCAGTTCCGACCTGAGATCTGAACCGAAAACGCAAGATCCATCTAAACTCTCACCTCGGTCTCCGATTCAGTTGATGCAAGGCGGGATCCAATTCGCCTTTTCATTCAATTACATTCACCAATAACAGCTCGCCATCtggcttttaataaaaagttgcCAATCGGTTCCCGACCTGCACCAAGCGAATTAGAGACCGCCGGTAACTGAATCATTCTACATTAATCCCCGACTCCTCCTTTTACACATAGCAACTTCGCCCAAGAagactaaaaagaaaaggaagctAAC
........................((((...........((..........................))..........................................................................................................................(((...(((...............((((..........)))).................................(((....))).....................)))...)))..........................))))........................................................................................ (35.0)
>AMWY02004761.1:1853-2277(+)
actaataatgCATGGCCATATATATCAAATCTACCATATgccatttaataattttccttttttcttcttctttctttttctctct

>AMWY02002487.1:2516-2939(+)
tctctcttctttttcttccttaacTTTTCATCCAGCTTCAACCTCCATTTAGATCaaagttattgaatttttttttcatcttatttatgtaaatatatattgtttccTGCGGAAACGAATCCATGAACAACAGTCAATCAGTCATTGTTTGCTGATGCAGCGTCATCAAGATTCGCATGCTGATGGGTCGAGCAAAGCAGTGAGAATCTTGATGATGCTGCATCGGCCATAATTGACTATAtctcgtcatcatcatcatcatcatccagtTTCAACCTCCATGTAAATCaagttattgaattatttggtAAATAGATACTGATTCCCGCAGAATTGAATCAATGAACAACAGTCAATCAGTCATTGTTTGCTGATGCAGCATCATCAAGATTCACATGCGAATGGGTCGAACCAAAGCAGTGA
............................................................................................................((((....(....)............................((....)).(((...........))).........((.((.((...((..................)).))..)).)).................................................................................................))))........................................((....))....................((....((......))...))..... (30.2)
>AMWY02089812.1:0-419(-)
TCtgtctatatttattttcttctcattcACTGTAGTAATTTAAGCCTATACAGTTCTGAGTTGACCNatttctttatataaagtTNATTTC

>AMWY02001968.1:1243-1665(+)
CTACCTAAACTCCATGCATGGCTCGTGCTAGCTTTctggtttcttcttttttctttaagggcttattataaatttgcaGCAAGCCTAAACCCTTCTTAATTTCAAGATCTCTCTTcatttgattctttctttctttttctagggattcttcttcttcttcttcttcttgtttgctGCTGGTGTATGTTGGTTTGAGAGATTGAAGCTGCCAGCATGATCTGGTAATATGGAAcctaatattatacatatacatatctatatctatatatatagatagatagttttagatttactctttaattatattaattcctCCACCTATAGTTTTAGAGttactctttaattatattaattcttccACCTCTTTGTATAGATAGATAGAAACATATAGAAGGTCTTAGATTTCCTTTGCTTTTAGATCC
...((....(.((..((...((....))...............................(((....................)))..........................................................................................)).)).)......))........................................................................................................................................................................................................................................ (27.0)
>AMWY02039981.1:1214-1636(-)
atttatgttttctattttataattaaaaaataaaaaaaaaataataaggtaatctctctctctttttcttttatattcatatgaGGTCT

In [143]:
df = fasta_to_df('secondary_structure/mxfold2_result.txt')
df = df.apply(lambda row: bracket_row(row) , axis=1)
df.head(2)

Unnamed: 0,tag,data,bracket
0,AMWY02059828.1:2832-3256(+),AAAGAATCAGCAATGGAAAAATAACCGGTTCTTAATTCAGcataac...,........................((((...........((........
1,AMWY02004761.1:1853-2277(+),actaataatgCATGGCCATATATATCAAATCTACCATATgccattt...,.................................................


In [145]:
base = "./secondary_structure/mxfold2/"
!rm -r {base}
!mkdir -p {base}
for index, row in df.iterrows():    
    if(not os.path.exists(base + reformat(row['tag']))):
        os.makedirs(base + reformat(row['tag']))        
    tag = reformat(row['tag'])
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        bracket = row['bracket'].split(' ')[0]
        deltaG = row['bracket'].split(' ')[1]
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG)
        file.write(ct)    

## SPOT-RNA

In [8]:
#!git clone https://github.com/jaswindersingh2/SPOT-RNA.git
#%cd SPOT-RNA
#!wget 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1'
#!tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz
#!sudo apt-get install python3.6
#!python3.6 -m pip install tensorflow==1.14.0 # or for gpu: tensorflow-gpu==1.14.0
#! python3.6 -m pip install -r requirements.txt

In [23]:
base = "./secondary_structure/spot_rna/"
!rm -r {base}
!mkdir -p {base}

In [29]:
!python3.6 ./SPOT-RNA/SPOT-RNA.py  --inputs ./extended.txt  --outputs '{base}'  --cpu 32 --plots True

>> Opening FASTA file...
>> Converting FASTA file from multiline to single line and writing to file.
>> Done!

Preparing tfr records file for SPOT-RNA:
100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.21s/it]

Predicting for SPOT-RNA model 0
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.23s/it]

Predicting for SPOT-RNA model 1
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.51s/it]

Predicting for SPOT-RNA model 2
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.10s/it]

Predicting for SPOT-RNA model 3
100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.44s/it]

Predicting for SPOT-RNA model 4
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.18s/it]

Post Processing and Saving Output

Finished!

Processsing Time 203.32813096046448 seconds


In [48]:
!rm {base}/*.bpseq
!rm {base}/*.prob
for file in glob.glob(f"{base}*.ct"):    
    f = file[len(base):-3] # .ct        
    f = reformat(f)        
    if(not os.path.exists(base + f)):
        os.makedirs(base + f)  
    header = reformatCT(file).split("\n")[0]    
    with open(f"{base}{f}.dot", 'w') as stream:        
        stream.write(ct2dot_bracket(file))
    !RNAeval "{base}{f}.dot" -T 20 -v 
    #shutil.move(file, f"{base}{f}/{f}.ct")    

rm: cannot remove './RNA_secondary_structure/spot_rna//*.bpseq': No such file or directory
rm: cannot remove './RNA_secondary_structure/spot_rna//*.prob': No such file or directory
[36mExternal loop[0m                           : [32m -371[0m
[36mInterior loop[0m (  3, 32) [1mAU[0m; (  4, 31) [1mGC[0m: [32m -256[0m
[36mInterior loop[0m (  4, 31) [1mGC[0m; (  5, 30) [1mAU[0m: [32m -294[0m
[36mInterior loop[0m (  5, 30) [1mAU[0m; (  6, 29) [1mAU[0m: [32m -122[0m
[36mHairpin  loop[0m (  6, 29) [1mAU[0m              : [32m  716[0m
[36mInterior loop[0m ( 38, 71) [1mCG[0m; ( 39, 70) [1mAU[0m: [32m -255[0m
[36mInterior loop[0m ( 39, 70) [1mAU[0m; ( 40, 69) [1mGC[0m: [32m -256[0m
[36mInterior loop[0m ( 40, 69) [1mGC[0m; ( 41, 68) [1mCG[0m: [32m -403[0m
[36mInterior loop[0m ( 41, 68) [1mCG[0m; ( 49, 64) [1mAU[0m: [32m  618[0m
[36mInterior loop[0m ( 49, 64) [1mAU[0m; ( 50, 63) [1mUA[0m: [32m -155[0m
[36mInterior loop[0m 

In [68]:
df = fasta_to_df('./secondary_structure/spot_rna/AMWY02059828.1:2832-3256(+).dot')
df = df.apply(lambda row: bracket_row(row) , axis=1)
bracket = df['bracket'][0].split(' ')[0]
ct = bracket_to_ct(df['tag'][0], df['data'][0], bracket, "(0)")
print(ct)

   424 dG =       0.0 AMWY02059828.1:2832-3256(+)
     1 A      0      2      0       1
     2 A      1      3      0       2
     3 A      2      4     32       3
     4 G      3      5     31       4
     5 A      4      6     30       5
     6 A      5      7     29       6
     7 U      6      8      0       7
     8 C      7      9      0       8
     9 A      8     10      0       9
    10 G      9     11      0      10
    11 C     10     12      0      11
    12 A     11     13      0      12
    13 A     12     14      0      13
    14 U     13     15      0      14
    15 G     14     16      0      15
    16 G     15     17      0      16
    17 A     16     18      0      17
    18 A     17     19      0      18
    19 A     18     20      0      19
    20 A     19     21      0      20
    21 A     20     22      0      21
    22 U     21     23      0      22
    23 A     22     24      0      23
    24 A     23     25      0      24
    25 C     24     26      0      25


## Vienna package

In [92]:
#!wget https://www.tbi.univie.ac.at/RNA/download/ubuntu/ubuntu_20_04/viennarna_2.4.18-1_amd64.deb -O viennarna.deb
#!sudo dpkg -i ./viennarna.deb
#!sudo apt-get -f install
#!rm viennarna.deb

In [131]:
base = "./secondary_structure/viennarna/"
!rm -r {base}
!rm ./secondary_structure/viennarna_result.txt
!mkdir -p {base}

rm: cannot remove './secondary_structure/viennarna_result.txt': No such file or directory


In [132]:
%cd {base}
!RNAfold --jobs=0 --infile ../../Temp/extended_modified.txt  --noPS -T 20 > ../viennarna_result.txt
%cd ../../

/home/jupyter/plant_microRNA_prediction/secondary_structure/viennarna
/home/jupyter/plant_microRNA_prediction


In [95]:
df = fasta_to_df('secondary_structure/viennarna_result.txt')
df = df.apply(lambda row: bracket_row(row) , axis=1)
print(df.shape)
df.head(2)

(21225, 3)


Unnamed: 0,tag,data,bracket
0,AMWY02000003.1:4084-4506|200-222(+),auacaauugucacauaguuuacauuauuaauuuccgcuuaauuuau...,....((((((...))))))................((((((...((...
1,AMWY02000003.1:4391-4810|200-219(+),uucuaauugcaaauuuauguuauauuuuuaaauagaaagggagauu...,.....((.(((......))).))......(((((((.(((((((.....


In [125]:
for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)      
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        bracket = row['bracket'].split(' ')[0]
        deltaG = row['bracket'].split(' ')[1]
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG, False)
        file.write(ct)    

In [126]:
import glob
for file in glob.glob(f"{base}*.ps"):    
    f = file[len(base):-6] # _ss.ps 
    f = reformat(f)        
    shutil.move(file, f"{base}{f}/{f}.ps")    

## ContraFold

In [39]:
#!wget http://contra.stanford.edu/contrafold/contrafold_v2_02.tar.gz
#!tar -xvzf contrafold_v2_02.tar.gz && rm contrafold_v2_02.tar.gz
#%cd contrafold/src
#!make clean
#!make 
# to file must changed to be complieable # utility.hpp and optimization.c++ files

In [42]:
counter = 0
base = "./secondary_structure/contrafold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df('./Temp/extended.txt')

for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/{tag}.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1    
    if(counter >= 10):
        break

In [43]:
def run_contrafold(tag):
    tag = reformat(tag)    
    %cd contrafold/src
    !./contrafold predict ../..{base[1:]}{tag}/{tag}.FASTA > ../..{base[1:]}{tag}/{tag}.dot
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'r') as file:
        text = file.read()
    text = [l for l in text.split("\n") if l[:len(">structure")] != ">structure"]    
    header = text[0]
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'w') as file:
        file.write('\n'.join(text[1:]))    
    !RNAeval  ../..{base[1:]}{tag}/{tag}.dot -T 20 > ../..{base[1:]}{tag}/{tag}.dotdg    
    with open(f"../..{base[1:]}{tag}/{tag}.dotdg", 'r') as file:
        text = file.read()
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'w') as file:
        file.write(header + "\n" + text)    
    
    df = fasta_to_df(f'../..{base[1:]}{tag}/{tag}.dot')
    df = df.apply(lambda row: bracket_row(row) , axis=1)        
    tag = reformat(df['tag'][0])
    with open(f'../..{base[1:]}{tag}/{tag}.ct','w') as file:
        bracket = df['bracket'][0].split(' ')[0]        
        deltaG = df['bracket'][0].split(' ')[1]
        ct = bracket_to_ct(df['tag'][0], df['data'][0], bracket, deltaG, False)
        file.write(ct)    
    #!rm ../..{base[1:]}{tag}/{tag}.dot
    #!rm ../..{base[1:]}{tag}/{tag}.dotdg
    !rm ../..{base[1:]}{tag}/{tag}.FASTA
    %cd ../../        

if __name__ == '__main__':        
    pool = mp.Pool(mp.cpu_count() - 1)  
    pool.map(run_contrafold, df['tag'].iloc[:10])

In [56]:
s = 'CUCCCCUUGUCUACCAUCCCCAACUAGCGAGAGAGACAUUACCUACCUGAAUAGAAGAUCUCUCUCGAGCUCUCGagcucucucuuuuucuauaUCUCUGUCUCUUUGUGUCUCUGGAGCUUGUACUAACAUUAAUAUCGUGCACCAGCAGCAGUUGAAGCUGCCAGCAUGAUCUAAACUUCCUUCUCUGUAAAGGAUAGAUCGGAUCAUGUGGUAGCUUCACCUGUUGAUGGGAUCACGAAAGCGCCCCUCUUACUACUCUACAUUAAUUCUUUCUCGUUAUACAACCUCCCAGUAAGCAUGCUUUCAAAACCAACUUGAGuaaguuaauuuguuuagcuuuuguuuuuggcucuuccuuuacuuuaaauuuucucaucuggguuuuuguuauauauauguacuguuuuauauauguauuccu'
d = '............................((((((((..(...(((......))).)..))))))))(((((....)))))...................................((((.((((...(((.......(((((..(((.((((((.((((((((((.(((((((((.(.(((((((.......))))).)).).))))))))))))))))))).)))))).)))...))))).....................................)))..)))).))))....((((()))))..((((((....((((.(((((((.....)))(((.........)))................)))).))))....))))))....(((((((((((......)))))))))))....'
print(s[300],s[301])
print(d[300],d[301])

A U
( )


In [55]:
'''path = 'secondary_structure/contrafold/AMWY020333941_469-893_-_/AMWY020333941_469-893_-_.dot'
!RNAeval  {path} -T 20 -v'''; 

[36mExternal loop[0m                           : [32m -364[0m
[36mInterior loop[0m ( 29, 66) [1mGC[0m; ( 30, 65) [1mAU[0m: [32m -294[0m
[36mInterior loop[0m ( 30, 65) [1mAU[0m; ( 31, 64) [1mGC[0m: [32m -256[0m
[36mInterior loop[0m ( 31, 64) [1mGC[0m; ( 32, 63) [1mAU[0m: [32m -294[0m
[36mInterior loop[0m ( 32, 63) [1mAU[0m; ( 33, 62) [1mGC[0m: [32m -256[0m
[36mInterior loop[0m ( 33, 62) [1mGC[0m; ( 34, 61) [1mAU[0m: [32m -294[0m
[36mInterior loop[0m ( 34, 61) [1mAU[0m; ( 35, 60) [1mGC[0m: [32m -256[0m
[36mInterior loop[0m ( 35, 60) [1mGC[0m; ( 36, 59) [1mAU[0m: [32m -294[0m
[36mInterior loop[0m ( 36, 59) [1mAU[0m; ( 39, 56) [1mUA[0m: [32m  103[0m
[36mInterior loop[0m ( 39, 56) [1mUA[0m; ( 43, 54) [1mCG[0m: [32m  305[0m
[36mInterior loop[0m ( 43, 54) [1mCG[0m; ( 44, 53) [1mUA[0m: [32m -256[0m
[36mInterior loop[0m ( 44, 53) [1mUA[0m; ( 45, 52) [1mAU[0m: [32m -165[0m
[36mHairpin  loop[

# Visualization

In [25]:
#https://github.com/ViennaRNA/forna
#http://varna.lri.fr/

# CT Analizer

In [14]:
# only select those not ran before
base = "./secondary_structure/mfold/"
df = fasta_to_df('./Temp/extended_modified.txt')

index_list =[]
for index, row in df.iterrows():    
    tag = reformat(row['tag'])    
    if(len(glob.glob(f'{base + tag}/*.ct')) != 0):
        index_list.append(index)
df = df.iloc[index_list,:]
print(df.shape)

(2000, 2)


In [15]:
def get_tag_info(tag):
    data = tag.split('|')
    hit_start = int(data[2].split('-')[0]) - 1 
    hit_end = int(data[2].split('-')[1])
    sign = data[3]
    return [hit_start, hit_end, sign]

In [16]:
def get_deltaG(ct):
    ct_head = ct.split('\n')[0]
    if("dG = " in ct_head):    
        dG_patter = "dG = " 
    elif("dG= " in ct_head):    
        dG_patter = "dG= "
    elif("dG=" in ct_head):    
        dG_patter = "dG="
    elif("dG =" in ct_head):    
        dG_patter = "dG ="
    else:
        print('there is no dG')
    return float(ct_head.split(dG_patter)[-1].split(' ')[0])

In [17]:
def get_complementarity_in_hit_region(inc_srange, hit_len):    
    if(sum(inc_srange == 0) == hit_len):
        return "no"
    elif(sum(inc_srange != 0) == hit_len):
        return "fully_connected"
    else:    
        return "yes"    

In [18]:
def get_hit_self_complementarity(hit_start, hit_end, inc_srange):    
    if(((inc_srange <= hit_start) | (inc_srange > hit_end)).all()):
        return "no"
    return "yes"

In [19]:
def get_istar_min_max(inc_srange, hit_self_complementarity):
    nonzero_data_srange  = inc_srange[inc_srange!=0]
    if(hit_self_complementarity == 'yes'):
        return [np.nan, np.nan]
    return [nonzero_data_srange.min(), nonzero_data_srange.max()]

In [20]:
def get_continuous_pairing(hit_start, hit_end, istar_min, istar_max, hit_self_complementarity):    
    if(hit_self_complementarity == 'yes'):
        return "undifined"
    if(hit_end < istar_max and (hit_start+1) > istar_min):
        return "no"    
    return  "yes"

In [21]:
def get_mir_type(hit_start, hit_end, istar_min, istar_max, continuous_pairing, complementarity_in_hit_region, hit_self_complementarity):        
    if(continuous_pairing == "yes" and complementarity_in_hit_region != "no" and hit_self_complementarity == "no"):
        if( hit_end < istar_min):
            return "5p" 
        if( (hit_start+1) > istar_max):
            return "3p"     
    else:
        if(continuous_pairing == "no" and hit_self_complementarity == "yes"):
            return "discontinuous star strand and hit self complementarity"
        elif(continuous_pairing == "no"):
            return "discontinuous star strand"
        elif(hit_self_complementarity == "yes"):
            return "hit self complementarity"

    if(complementarity_in_hit_region == "no"):
        return "no complementarity in hit region"  
    print(hit_start, hit_end, istar_min, istar_max, continuous_pairing, complementarity_in_hit_region, hit_self_complementarity)

In [22]:
def get_star_start(hit_start, hit_end, values):
    c = 0
    i = hit_end - 3 - c
    while(values[i] == 0 and i >= 0):
        c += 1
        i = hit_end - 3 - c        
    if(values[i] - c < 1):                    
        return [max(values[i] - c,1), "negative value"]
    if(i < hit_start):
        return [values[i] - c, 'less than hit start']
    return [values[i] - c, '']

In [33]:
def get_star_end(hit_start, hit_end, values):
    if(hit_start - 2 >= 0 ):
        a = 0    
    else:
        a = abs(hit_start - 2)
    
    i = hit_start - 2 + a
    while(values[i] == 0 and i <= hit_end):
        a += 1
        i = hit_start - 2 + a
    
    if(i <= hit_end):        
        if((values[i] + a) > len(values)):
            return [len(values), "out of sequance range"]        
        return [values[i] + a, ""]
    return [np.nan, "some error happened"]

In [34]:
def get_num_of_linking_residues(hit_start,hit_end, star_start, star_end, mir_type):
    if(mir_type == '5p'):
        return str(star_start - hit_end - 1)
    elif(mir_type == '3p'):
        return str(hit_start - star_end)
    elif(mir_type == "discontinuous star strand"):
        return "discontinuous star strand"
    elif(mir_type == "no complementarity in hit region"):
        print('error')    

In [35]:
def get_star_branching(star_start, star_end, star_range, values):    
    return not ((values[star_range-1] < star_start) | (values[star_range-1] > star_end)).all()

In [36]:
def getBOI_5p(hit_start, hit_end, values):
    # first calc latest non zero value
    for i in range(hit_end-1, 0, -1):
        if(values[i] != 0):
            last_v = values[i]
            last_i = i
            place = i
            break            
            
    for i in range(place-1, 0, -1):        
        v = values[i]
        if(v == 0):
            continue
        if(v < last_v):
            return [last_i + 1, last_v]                                                
        
        if((v - last_v) >= 3):
            s1 = set(range(last_v+1, v))
            s2 = set([values[i-1] for i in range(last_v+1, v)])
            if(len(s1.intersection(s2)) > 0):
                return [last_i + 1, last_v]    
        last_v = v            
        last_i = i            
    for i in range(0,hit_end):
        if(values[i] != 0 ):
            return [i + 1, values[i]]    
    print("Error")
                
def getBOI_3p(hit_start, hit_end, values):
    # first calc latest non zero value
    for i in range(hit_start, len(values)):    
        if(values[i] != 0):
            last_v = values[i]
            last_i = i
            place = i
            break            
            
    for i in range(place + 1, len(values)):
        v = values[i]
        if(v == 0):
            continue
        if(v > last_v):
            return [last_v, last_i + 1]                                                
        
        if((last_v - v) >= 3):
            s1 = set(range(v+1, last_v))
            s2 = set([values[i-1] for i in range(v+1, last_v)])
            if(len(s1.intersection(s2)) > 0):
                return [last_v, last_i + 1]    
        last_v = v            
        last_i = i            
    for i in range(hit_start, len(values)):
        if(values[i] != 0 ):
            return [values[i], i + 1]    
    print("Error")            
    
    
def get_boi(hit_start, hit_end, values, mir_type):
    if(mir_type not in ['3p','5p']):
        return [np.nan, np.nan]
    if(mir_type == '5p'):
        return getBOI_5p(hit_start, hit_end, values)
    if(mir_type == '3p'):
        return getBOI_3p(hit_start, hit_end, values)

In [37]:
def get_terminal_structure_range(hit_start, hit_end, star_start, star_end, mir_type):
    if(mir_type == '5p'):
        return [i for i in range(hit_end, star_start-1)]
    if(mir_type == '3p'):
        return [i for i in range(star_end, hit_start)]
    print("Error in get_terminal_structure_range function")        

In [47]:
def get_number_of_terminal_structure(values, terminal_structure_range):    
    data = values[terminal_structure_range]
    data = data[data != 0].to_numpy()
    if(len(data) == 0):
        return 0           
    counter = 1            
    last = data[0]         
    for i in range(1,len(data)): 
        if(data[i] > last):
            counter += 1
        last = data[i]        
    return counter 

In [133]:
def get_branch_star_end_point(values, terminal_structure_range, pdf):        
    data = values[terminal_structure_range]    
    index = np.array(terminal_structure_range)[data != 0]    
    data = data[data != 0].to_numpy()                        
    branch_start_index = []
    branch_end_index = []
    branch_start_index.append(index[0])
    last = data[0]            
    for i in range(1,len(data)): 
        if(data[i] > last):
            branch_end_index.append(index[i-1])
            branch_start_index.append(index[i])
        last = data[i]        
    branch_end_index.append(index[-1])
    #
    branch_start_point = []
    branch_end_point = []
    for i in range(0, len(branch_start_index)):         
        i_s = branch_start_index[i]
        i_e = branch_end_index[i]
        v_s = values[i_s]
        v_e = values[i_e]
        if(v_s > i_s and v_s <= (i_e + 1)):
            branch_start_point.append(i_s + 1)
            branch_end_point.append(v_s)
        elif(v_e > i_s and v_e <= (i_e + 1)):
            branch_start_point.append(v_e)
            branch_end_point.append(i_e + 1)
        else:
            print(pdf)
    return [branch_start_point, branch_end_point]

In [134]:
server_url = "http://jupyter.sysmanager.ir/tree/plant_microRNA_prediction"
def get_row(tag, path, extra = 15, acceptable_terminal_structures = 5):
    #print(path)
    result = {}    
    ct = reformatCT(path)
    result['seq name'] = tag
    result['ct name'] = ""
    result['ct'] = f'=HYPERLINK("{server_url + path[1:]},","ct")'
    result['pdf'] = f'=HYPERLINK("{server_url + path[1:-3] + ".pdf"}","pdf")'        
    [hit_start, hit_end, sign] = get_tag_info(tag)
    result['hit_start'] = hit_start + 1
    result['hit_end'] =  hit_end
    result['sign'] = sign
    dg = get_deltaG(ct)
    result['delta G'] = dg
    [nucleotide, index, values] = get_ct_data(ct)
    hit_seq = ''.join(nucleotide[hit_start:hit_end])
    result['hit_seq'] = hit_seq
    hit_range = index[hit_start:hit_end]
    hit_len = len(hit_range)
    result['hit_len'] = hit_len
    inc_srange = values[hit_start:hit_end] # Incomplete_Star_range

    complementarity_in_hit_region = get_complementarity_in_hit_region(inc_srange, hit_len)
    result['complementarity_in_hit_region'] = complementarity_in_hit_region    
    if(complementarity_in_hit_region == "no"):
        msg = "no complementarity in hit region"
        result['hit_self_complementarity'] = msg
        result['continuous_pairing'] = msg
        result['mir_type'] = msg
        result['star_start'] = msg
        result['star_end'] = msg
        result['star_seq'] = msg
        result['num_of_linking_residues'] = msg
        result['star_branching'] = msg
        result['boi_start']  = msg
        result['boi_end'] = msg
        return pd.Series(result) 
    
    hit_self_complementarity = get_hit_self_complementarity(hit_start, hit_end, inc_srange)    
    result['hit_self_complementarity'] = hit_self_complementarity       
    if(hit_self_complementarity == "yes"):
        msg = "hit self complementarity"
        result['continuous_pairing'] = msg
        result['mir_type'] = msg
        result['star_start'] = msg
        result['star_end'] = msg
        result['star_seq'] = msg
        result['num_of_linking_residues'] = msg
        result['star_branching'] = msg
        result['boi_start']  = msg
        result['boi_end'] = msg
        return pd.Series(result) 
    # 
    if(hit_start - extra < 0 or (len(values) - hit_end) < extra):
        msg = "Not enough flanking for hit region"        
        result['mir_type'] = msg
        result['star_start'] = msg
        result['star_end'] = msg
        result['star_seq'] = msg
        result['num_of_linking_residues'] = msg
        result['star_branching'] = msg
        result['boi_start']  = msg
        result['boi_end'] = msg
        return pd.Series(result) 
    
    [flanking_istar_min, flanking_istar_max] = get_istar_min_max(values[(hit_start-extra):(hit_end+extra)], hit_self_complementarity)  
    result['flanking_istar_min']  = flanking_istar_min
    result['flanking_istar_max']  = flanking_istar_max    
    
    continuous_pairing = get_continuous_pairing(hit_start, hit_end, flanking_istar_min, flanking_istar_max, hit_self_complementarity)
    result['continuous_pairing'] = continuous_pairing    
    if(continuous_pairing == "no"):
        msg = "discontinuous star strand"        
        result['mir_type'] = msg
        result['star_start'] = msg
        result['star_end'] = msg
        result['star_seq'] = msg
        result['num_of_linking_residues'] = msg
        result['star_branching'] = msg
        result['boi_start']  = msg
        result['boi_end'] = msg
        return pd.Series(result) 
    
    [istar_min, istar_max] = get_istar_min_max(inc_srange, hit_self_complementarity)  
    result['istar_min']  = istar_min
    result['istar_max']  = istar_max
    
    mir_type = get_mir_type(hit_start, hit_end, istar_min, istar_max, continuous_pairing, complementarity_in_hit_region, hit_self_complementarity)
    result['mir_type'] = mir_type    
    if(mir_type not in ['3p', '5p']):
        msg = mir_type        
        result['star_start'] = msg
        result['star_end'] = msg
        result['star_seq'] = msg
        result['num_of_linking_residues'] = msg
        result['star_branching'] = msg
        result['boi_start']  = msg
        result['boi_end'] = msg
        return pd.Series(result) 
    
    [star_start, star_start_msg] = get_star_start(hit_start, hit_end, values)
    [star_end, star_end_msg] = get_star_end(hit_start, hit_end, values)
    result['star_start'] = star_start 
    result['star_start_msg'] = star_start_msg     
    result['star_end'] = star_end    
    result['star_end_msg'] =  star_end_msg
    
    star_range = index[star_start - 1:star_end]
    star_seq = ''.join(nucleotide[star_start - 1:star_end])
    result['star_seq'] = star_seq
    num_of_linking_residues = get_num_of_linking_residues(hit_start,hit_end, star_start, star_end, mir_type)
    result['num_of_linking_residues'] = num_of_linking_residues
    star_branching = get_star_branching(star_start, star_end, star_range, values)
    result['star_branching'] = "yes" if star_branching else "no"
    [boi_start, boi_end] = get_boi(hit_start, hit_end, values, mir_type)
    boi_seq = ''.join(nucleotide[boi_start-1: boi_end].tolist())
    result['boi_start'] = boi_start
    result['boi_end'] =  boi_end
    result['boi_seq'] =  boi_seq
    result['terminal_structure_range'] = boi_seq
    terminal_structure_range = get_terminal_structure_range(hit_start, hit_end, star_start, star_end, mir_type)
    result['terminal_structure_range'] = [i+1 for i in terminal_structure_range]    
    if(num_of_linking_residues == 0):        
        result['number_of_terminal_structure'] = "no residues between miR and miR*" 
    else:
        number_of_terminal_structure = get_number_of_terminal_structure(values, terminal_structure_range)
        if(number_of_terminal_structure == 0):
            result['number_of_terminal_structure'] = "one loop without stem"
        elif(number_of_terminal_structure == 1):
            result['number_of_terminal_structure'] = "todo: ????????????????????????????????????????"
        else:            
            result['number_of_terminal_structure'] = number_of_terminal_structure
            [branch_start_point, branch_end_point]  = get_branch_star_end_point(values, terminal_structure_range,result['pdf'])
            for i in range(acceptable_terminal_structures):
                if(i < len(branch_start_point)):
                    result[f'branch#{i + 1}_start_point'] = branch_start_point[i]
                    result[f'branch#{i + 1}_end_point'] = branch_end_point[i]
                    result[f'branch#{i + 1}_length'] = abs(branch_end_point[i] - branch_start_point[i]) + 1
                else:
                    result[f'branch#{i + 1}_start_point'] = ""
                    result[f'branch#{i + 1}_end_point'] = ""            
                    result[f'branch#{i + 1}_length'] = ""
    return pd.Series(result)

In [135]:
def get_df_by_tag(tag):
    ct_files = glob.glob(f'{base}{reformat(tag)}/SEQ_*.ct')
    return pd.Series(ct_files).apply(lambda path: get_row(tag, path))
#get_df_by_tag(df['tag'].iloc[3])
get_df_by_tag("AMWY02000194.1|1048-1466|201-219|-")['terminal_structure_range'][0]

[220, 221, 222, 223, 224, 225]

In [136]:
def get_df_by_tag(tag):
    ct_files = glob.glob(f'{base}{reformat(tag)}/SEQ_*.ct')
    return pd.Series(ct_files).apply(lambda path: get_row(tag, path))
dfs = []
for d in process_map(get_df_by_tag , df['tag'], tqdm_class=tqdm, max_workers=mp.cpu_count()- 1, chunksize=5):
    dfs.append(d)
df_result = pd.concat(dfs,axis=0)
df_result.to_csv("ct_analizer_result.csv", index=False)
!zip -r ct_analizer_result.csv.zip ./ct_analizer_result.csv

  0%|          | 0/2000 [00:00<?, ?it/s]

updating: ct_analizer_result.csv (deflated 93%)


# BLASTX or DIMOND

# DIAMOND

https://github.com/bbuchfink/diamond

In [None]:
'''
import hashlib
import os
parallel = [os.path.join(dp, f) for dp, dn, filenames in os.walk("./PRNA_secondary_structure") for f in filenames ]
series = [os.path.join(dp, f) for dp, dn, filenames in os.walk("./SRNA_secondary_structure") for f in filenames ]
for i in range(len(parallel)):        
    md5_hash = hashlib.md5()
    with open(parallel[i],"rb") as file:        
        md5_hash.update(file.read())
        digest1 = md5_hash.hexdigest()                
    md5_hash = hashlib.md5()
    with open(series[i],"rb") as file:        
        md5_hash.update(file.read())
        digest2 = md5_hash.hexdigest()                
    if(digest1 != digest2):
        print(parallel[i])
        print(series[i])
        print("***********")     
'''