# Config

In [1]:
# Name of the experiment; used as the sub-directory of `experiment_dir`
# that holds the input genome, the Temp folder and the Result folder.
experiment = "A.indica"
# File name of the input genome FASTA inside the experiment directory.
input_genome_name = "GCA_022749755.1_ASM2274975v1_genomic.fna"

# Root directory that contains one sub-directory per experiment.
experiment_dir = "../experiment"

# Secondary-structure method whose `.ct` folder the CTAnalizer section reads.
ss_method = "mxfold2"

In [2]:
# Location of the input genome FASTA for the selected experiment.
input_genome_path = f'{experiment_dir}/{experiment}/{input_genome_name}'

# Folder for intermediate files and folder for final results.
temp_path = f"{experiment_dir}/{experiment}/Temp"
result_path = f"{experiment_dir}/{experiment}/Result"

# Shell-safe variants for `!command` lines: every space is backslash-escaped.
# "\\ " is the same two-character string the former "\ " produced, but it is
# a valid escape sequence and no longer triggers Python's invalid-escape warning.
temp_path_f = temp_path.replace(" ", "\\ ")
result_path_f = result_path.replace(" ", "\\ ")

In [3]:
# Not used in the cells visible here; presumably the score cut-off applied to
# the Keras classifier output later in the notebook — TODO confirm.
classifier_threshold = 0.99

# Common

In [4]:
#!pip install tqdm

In [5]:
import json
import time
import pickle
from subprocess import Popen, PIPE, STDOUT
import math
import numpy as np
import pandas as pd
import hashlib
import requests
import os, sys, subprocess
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import urllib.parse
import glob
import os
import sys
import networkx
from networkx.algorithms.clique import find_cliques as maximal_cliques
from ast import literal_eval
from keras.models import load_model
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
sys.path.append("../src/")
from ct_analizer import get_row
from filter import filter_run
from postprocess import postprocess
from convertor import convert
from preprocessing import get_target, preprocessing

In [6]:
# Create the working and result folders. makedirs also creates missing parent
# directories (os.mkdir failed when the experiment folder did not exist yet)
# and exist_ok makes the cell safe to re-run.
os.makedirs(temp_path, exist_ok=True)
os.makedirs(result_path, exist_ok=True)

In [7]:
# Remember the notebook's working directory; run_mfold changes directory per
# sequence and uses this value to come back.
current_path = os.getcwd()

In [8]:
def bracket_row(row):
    """Split a combined "sequence + dot-bracket" string into two fields.

    ``row['data']`` is expected to hold the nucleotide sequence immediately
    followed by the structure annotation. The split point is the first
    occurrence of ``.`` or ``(``.

    Args:
        row: mapping-like object (dict or pandas Series) with a ``'data'`` entry.

    Returns:
        The same ``row`` with ``'data'`` reduced to the sequence part and a new
        ``'bracket'`` entry holding everything from the split point onwards.
        When neither character is present, ``'data'`` is left unchanged and
        ``'bracket'`` is the empty string.
    """
    combined = row['data']
    # str.find returns -1 for a missing character; taking min() over the raw
    # values made -1 win and cut the string at its last character.
    found = [pos for pos in (combined.find('.'), combined.find('(')) if pos >= 0]
    split_at = min(found) if found else len(combined)
    row['data'] = combined[:split_at]
    row['bracket'] = combined[split_at:]
    return row

In [9]:
def adjust(text,n=7):
    """Return ``str(text)`` right-aligned with spaces to a width of ``n``.

    Values already longer than ``n`` characters are returned unpadded.
    """
    return str(text).rjust(n)

In [10]:
def bracket_to_ct(tag, data, bracket, deltaG, negative_deltaG=True, convert_u=True):
    """Render a dot-bracket structure as CT-formatted text.

    Args:
        tag: title written at the end of the header line.
        data: nucleotide sequence; must be at least as long as ``bracket``.
        bracket: structure made of ``.``, ``(`` and ``)``.
        deltaG: free energy, as a number or as text such as ``"(-12.3)"``;
            surrounding parentheses are stripped before conversion.
        negative_deltaG: when true, a positive energy is negated.
        convert_u: when true, ``u``/``U`` in ``data`` become ``t``/``T``.

    Returns:
        The CT text: one header line followed by one line per structure
        position (index, base, previous index, next index, pair partner, index).

    Raises:
        IndexError: a ``)`` has no matching ``(`` (same exception type as
            before, now with a message that names the position).
        ValueError: ``deltaG`` cannot be parsed as a float.
    """
    if convert_u:
        data = data.replace("u", "t").replace("U", "T")
    # str() allows a float to be passed as well as the textual "(…)" form.
    deltaG = float(str(deltaG).replace('(', '').replace(')', ''))
    if deltaG > 0 and negative_deltaG:
        deltaG = -1 * deltaG

    size = len(bracket)
    partner = [0] * size      # 1-based index of the paired base, 0 = unpaired
    open_positions = []       # 0-based positions of '(' still waiting for ')'
    for i, symbol in enumerate(bracket):
        if symbol == '.':
            continue
        if symbol == '(':
            open_positions.append(i)
        elif symbol == ')':
            if not open_positions:
                print('structure error!')
                raise IndexError(f"unmatched ')' at position {i + 1} in the structure of {tag}")
            opener = open_positions.pop()
            partner[opener] = i + 1
            partner[i] = opener + 1
        else:
            # Unknown symbols are reported but treated as unpaired.
            print('structure error!')
    if open_positions:
        # Unclosed '(' are reported and left unpaired.
        print('structure error!')

    rows = [f"{adjust(len(data),6)} dG ={adjust(deltaG,10)} {tag}\n"]
    for i in range(size):
        # The "next" column wraps to 0 for the last base of the sequence.
        rows.append(
            f"{adjust(i + 1,6)} {data[i]} {adjust(i,6)} {adjust((i+2)%(len(data)+1),6)} {adjust(partner[i],6)} {adjust(i + 1,7)}\n"
        )
    return ''.join(rows)

In [11]:
def fasta_to_df(path):
    """Read a FASTA-like file into a DataFrame with ``tag`` and ``data`` columns.

    Every line starting with ``>`` opens a new record; its text without the
    ``>`` becomes ``tag``. All following non-empty lines up to the next header
    are concatenated into ``data``. Lines before the first header are ignored.
    """
    with open(path, 'r') as handle:
        rows = [ln for ln in handle.read().split('\n') if ln]
    headers = []
    chunks = []
    for ln in rows:
        if ln[0] == '>':
            headers.append(ln[1:])
            chunks.append([])
        elif chunks:
            chunks[-1].append(ln)
    return pd.DataFrame(
            {
                'tag': headers,
                'data': [''.join(parts) for parts in chunks]
            })

In [12]:
def df_to_fasta(df, path):
    """Write the ``tag`` and ``data`` columns of ``df`` to ``path`` as FASTA.

    Each row becomes a ``>tag`` line followed by one sequence line.
    """
    records = [f">{tag}\n{seq}\n" for tag, seq in zip(df['tag'], df['data'])]
    with open(path,'w') as file:
        file.write(''.join(records))

In [13]:
def reformat(path):
    """Make a tag usable as a folder name.

    ``(``, ``)`` and ``:`` become ``_``; ``.`` is removed.
    """
    return path.translate(str.maketrans({'(': '_', ')': '_', ':': '_', '.': None}))

In [14]:
def reformatCT(path):
    """Return the text of a CT file with normalised whitespace.

    Tabs are turned into spaces, runs of spaces collapse to a single space,
    leading and trailing spaces are removed from every line and empty lines
    are dropped. Lines that contain only spaces or tabs are dropped as well;
    previously such a line raised IndexError.

    Args:
        path: path of the CT file to read.

    Returns:
        The cleaned lines joined by ``"\\n"``.
    """
    with open(path, 'r') as file:
        raw_lines = file.read().split('\n')
    cleaned = []
    for line in raw_lines:
        # Splitting on a single space and dropping empty fields collapses
        # repeated spaces and strips both ends in one step.
        fields = [field for field in line.replace("\t", " ").split(" ") if field]
        if fields:
            cleaned.append(" ".join(fields))
    return '\n'.join(cleaned)

In [15]:
def get_ct_data(ct):
    """Parse whitespace-normalised CT text into its key columns.

    The first line (header) is skipped; the rest is read as a
    single-space-separated table.

    Returns:
        ``[nucleotide, index, values]`` taken from table columns 1, 5 and 4.
    """
    body = ct.partition('\n')[2]
    table = pd.read_csv(StringIO(body), sep=" ", header=None)
    return [table.iloc[:, column] for column in (1, 5, 4)]

In [16]:
def ct2dot_bracket(path):
    """Convert a CT file into "sequence\\ndot-bracket" text.

    Args:
        path: path of the CT file.

    Returns:
        The nucleotide sequence, a newline, and one structure symbol per
        position: ``.`` for unpaired bases, ``(`` for the first base of a pair
        and ``)`` for the base whose partner was already opened.
    """
    nucleotide, index, values = get_ct_data(reformatCT(path))
    # A set gives constant-time membership tests; the former list made the
    # loop quadratic in the sequence length.
    opened = set()
    symbols = []
    for i, v in zip(index, values):
        if v == 0:
            symbols.append('.')
            continue
        if v not in opened:
            symbols.append('(')
            opened.add(i)
        # Deliberately a second `if`, as in the original control flow.
        if v in opened:
            symbols.append(')')
    return ''.join(nucleotide) + "\n" + ''.join(symbols)

In [17]:
def is_nested(index, values):
    """Return False as soon as a pairing crosses the currently open pair.

    ``index`` holds the 1-based positions and ``values`` the pairing partner
    of each position (0 = unpaired). The scan tracks the smallest partner
    seen so far as the closing limit; the limit is cleared once the scan
    reaches it. A partner beyond the active limit means the structure is not
    nested.
    """
    unbounded = max(index) + 10  # larger than any valid partner index
    limit = unbounded
    for position, partner in zip(index, values):
        if partner != 0 and partner < limit:
            limit = partner
        if position >= limit:
            limit = unbounded
        if partner > limit:
            return False
    return True

# Download data from miRBase

In [19]:
# Local folder that receives the miRBase files.
directory = './miRBase'
# Remote folder the files are downloaded from. Note: the name `base` is
# reassigned by the secondary-structure cells further down.
base = "https://www.mirbase.org/ftp/CURRENT"        

In [18]:
!rm -r {directory}
!mkdir -p {directory}

!wget {base}/aliases.txt.gz -P ./{directory}/       ; gzip -d ./{directory}/aliases.txt.gz 
!wget {base}/hairpin.fa.gz -P ./{directory}/           ; gzip -d ./{directory}/hairpin.fa.gz 
!wget {base}/hairpin_high_conf.fa.gz -P ./{directory}/ ; gzip -d ./{directory}/hairpin_high_conf.fa.gz 
!wget {base}/mature.fa.gz -P ./{directory}/            ; gzip -d ./{directory}/mature.fa.gz 
!wget {base}/mature_high_conf.fa.gz -P ./{directory}/  ; gzip -d ./{directory}/mature_high_conf.fa.gz
!wget {base}/miRNA.str.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.str.gz 
!wget {base}/miRNA.xls.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.xls.gz 
!wget {base}/organisms.txt.gz -P ./{directory}/        ; gzip -d ./{directory}/organisms.txt.gz

--2022-08-30 19:21:09--  https://www.mirbase.org/ftp/CURRENT/aliases.txt.gz
Resolving www.mirbase.org (www.mirbase.org)... 130.88.97.249
Connecting to www.mirbase.org (www.mirbase.org)|130.88.97.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480536 (469K) [application/x-gzip]
Saving to: ‘././miRBase/aliases.txt.gz’


2022-08-30 19:21:12 (429 KB/s) - ‘././miRBase/aliases.txt.gz’ saved [480536/480536]

--2022-08-30 19:21:12--  https://www.mirbase.org/ftp/CURRENT/hairpin.fa.gz
Resolving www.mirbase.org (www.mirbase.org)... 130.88.97.249
Connecting to www.mirbase.org (www.mirbase.org)|130.88.97.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1547350 (1.5M) [application/x-gzip]
Saving to: ‘././miRBase/hairpin.fa.gz’


2022-08-30 19:21:15 (615 KB/s) - ‘././miRBase/hairpin.fa.gz’ saved [1547350/1547350]

--2022-08-30 19:21:15--  https://www.mirbase.org/ftp/CURRENT/hairpin_high_conf.fa.gz
Resolving www.mirbase.org (www.mirbase.org)..

In [19]:
# Load all mature miRNA records and the high-confidence subset.
mature = fasta_to_df(f'{directory}/mature.fa')
mature_high_conf = fasta_to_df(f'{directory}/mature_high_conf.fa')
# 'trim tag' keeps the first two space-separated header fields so the header
# can be compared with the headers of the high-confidence file.
mature['trim tag'] = mature['tag'].apply(lambda line: ' '.join(line.split(' ')[:2]))
# True when the trimmed header also occurs in the high-confidence file.
mature['confidence'] = mature['trim tag'].isin(mature_high_conf['tag'])

In [20]:
# The organism code is the first three characters of the header (e.g. "cel").
mature['organism'] = mature['tag'].apply(lambda x: x[:3])
print(mature.shape)
mature.head(2)

(48885, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
0,cel-let-7-5p MIMAT0000001 Caenorhabditis elega...,UGAGGUAGUAGGUUGUAUAGUU,cel-let-7-5p MIMAT0000001,True,cel
1,cel-let-7-3p MIMAT0015091 Caenorhabditis elega...,CUAUGCAAUUUUCUACCUUACC,cel-let-7-3p MIMAT0015091,True,cel


In [23]:
# Tab-separated organism table downloaded from miRBase.
organism = pd.read_csv(f'./{directory}/organisms.txt',sep='\t')
organism.columns = [c.replace('#','') for c in organism.columns] # strip the '#' comment marker from the column names
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [24]:
# Distinct taxonomy strings, shortest first.
items = sorted(organism['tree'].unique(), key=len)
items

['Viruses;',
 'Mycetozoa;',
 'Alveolata;',
 'Metazoa;Porifera;',
 'Metazoa;Cnidaria;',
 'Viridiplantae;Chlorophyta;',
 'Viridiplantae;Embryophyta;',
 'Viridiplantae;Coniferophyta;',
 'Viridiplantae;Magnoliophyta;',
 'Metazoa;Bilateria;Deuterostoma;',
 'Chromalveolata;Heterokontophyta;',
 'Metazoa;Bilateria;Ecdysozoa;Nematoda;',
 'Metazoa;Bilateria;Lophotrochozoa;Annelida;',
 'Metazoa;Bilateria;Lophotrochozoa;Nemertea;',
 'Metazoa;Bilateria;Lophotrochozoa;Mollusca;',
 'Viridiplantae;Magnoliophyta;monocotyledons;',
 'Metazoa;Bilateria;Deuterostoma;Hemichordata;',
 'Metazoa;Bilateria;Deuterostoma;Echinodermata;',
 'Metazoa;Bilateria;Lophotrochozoa;Brachiopoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Hexapoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Crustacea;',
 'Metazoa;Bilateria;Lophotrochozoa;Platyhelminthes;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Chelicerata;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Mandibulata;',
 'Viridiplantae;Magnoliophyta;eudicotyledons;Poaceae;',
 'M

In [25]:
# Keep only organisms whose taxonomy string contains "Viridiplantae;".
selectedTree = organism[organism['tree'].apply(lambda x: "Viridiplantae;" in x)]
print(selectedTree.shape)
selectedTree.head(5)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352
68,ppt,PPT,Physcomitrella patens,Viridiplantae;Embryophyta;,3218
69,smo,SMO,Selaginella moellendorffii,Viridiplantae;Embryophyta;,88036
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702


In [26]:
#selectedTree = selectedTree[selectedTree['name'] == ""]

In [27]:
# Mature miRNAs whose organism code belongs to the selected taxonomy subset.
selected = mature[mature['organism'].isin(selectedTree['organism'])]
print(selected.shape)
selected.head(1)

(10414, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
316,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,ath-miR156a-5p MIMAT0000166,False,ath


In [28]:
# Write the selected mature miRNAs as the FASTA input of the clustering step.
df_to_fasta(selected,f'{temp_path}/mature_microRNA_queries.fasta')

# Remove redundant

## cdhit-est

In [29]:
# Cluster the query sequences with cd-hit-est. The run writes the
# representative sequences to NR_mature_microRNA_queries.fasta and the cluster
# membership to NR_mature_microRNA_queries.fasta.clstr (parsed in the next section).
# NOTE(review): the option values are not explained in this notebook — check
# them against the CD-HIT user guide before changing any.
!./software/cdhit/cd-hit-est -i ./{temp_path_f}/mature_microRNA_queries.fasta  -o ./{temp_path_f}/NR_mature_microRNA_queries.fasta \
    -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS 0 \
    -AS 99999999 -s 0 -S 0

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 23 2021, 21:45:39
Command: ./software/cdhit/cd-hit-est -i
         ././Experiment/A.indica/Temp/mature_microRNA_queries.fasta
         -o
         ././Experiment/A.indica/Temp/NR_mature_microRNA_queries.fasta
         -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS
         0 -AS 99999999 -s 0 -S 0

Started: Sat Dec 10 16:25:54 2022
                            Output                              
----------------------------------------------------------------
total seq: 10414
longest and shortest : 28 and 17
Total letters: 222978
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 1 X 12M = 12M
Table           : 1 X 16M = 16M
Miscellaneous   : 0M
Total           : 30M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 96149440

comparing sequences from          0  to      10414
..........    10000  finished       5817 

## reformat

In [46]:
# Parse the cd-hit .clstr file into a (seqid, cluster) table.
with open(f'./{temp_path}/NR_mature_microRNA_queries.fasta.clstr','r') as file:
    text = file.read()
lines = [line for line in text.split('\n') if len(line) > 0]
cluster = []
seqid = []
last_cluster = ""
for l in lines:
    if(l[0]=='>'):        
        # Cluster header: ">Cluster 12" becomes the label "C12".
        last_cluster = l.replace('>Cluster ',"C")
    else:        
        # Member line: the sequence id sits between ", >" and "...".
        cluster.append(last_cluster)
        seqid.append(l.split(', >')[1].split('...')[0])                
seq2cluster = pd.DataFrame({'seqid': seqid,'cluster': cluster})
print(seq2cluster.shape)
seq2cluster.head(2)    

(10414, 2)


Unnamed: 0,seqid,cluster
0,cst-miR11332,C0
1,stu-miR7994b-5p,C1


In [47]:
# Attach the full header, sequence and confidence flag to every cluster member.
df = fasta_to_df(f"./{temp_path}/mature_microRNA_queries.fasta")
# The accession is the first space-separated field of the header; it is the id cd-hit reports.
df['accession'] = df['tag'].apply(lambda x : x.split(' ')[0])
seq2cluster = pd.merge(df,seq2cluster,how="inner",left_on='accession',right_on="seqid")
# Drop this 'data' column so the merge below takes 'data' from `mature` without a name clash.
del seq2cluster['data']
seq2cluster = pd.merge(seq2cluster, mature,how="inner",left_on='tag',right_on="tag")[['cluster','seqid','tag','data','confidence']]
seq2cluster = seq2cluster.rename(columns={'data': 'seq'})
print(seq2cluster.shape)
display(seq2cluster.head(2))
# Persisted for the "Apply on current data" section, which reloads it.
seq2cluster.to_csv(f'./{temp_path}/seq2cluster.csv',index=False)

(10414, 5)


Unnamed: 0,cluster,seqid,tag,seq,confidence
0,C5495,ath-miR156a-5p,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,False
1,C1199,ath-miR156a-3p,ath-miR156a-3p MIMAT0031865 Arabidopsis thalia...,GCUCACUGCUCUUUCUGUCAGA,False


In [48]:
# Preview the mapping ordered by cluster, then by sequence id within a cluster
# (display only; `seq2cluster` itself is not modified).
seq2cluster.sort_values(["cluster", "seqid"]).head(2)

Unnamed: 0,cluster,seqid,tag,seq,confidence
9422,C0,cst-miR11332,cst-miR11332 MIMAT0044622 Cucumis sativus miR1...,CUUGUGGGUAAUAGGCUUUCCUUCCUUG,False
7002,C1,stu-miR7994b-5p,stu-miR7994b-5p MIMAT0031188 Solanum tuberosum...,AGUUUAUGCCCAAGUAUAUAAUAUAU,False


In [49]:
# Build the BLASTn query file: one record per cd-hit representative, named by its cluster label.
df = fasta_to_df(f"./{temp_path}/NR_mature_microRNA_queries.fasta")
df['tag'] = df['tag'].apply(lambda x : x.split(' ')[0])
df = pd.merge(df,seq2cluster,how="inner",left_on='tag',right_on="seqid")[['cluster','data']]

lines = []
# apply is used only for its side effect of appending one FASTA record per row.
df.apply(lambda row: lines.append(f">{row['cluster']}\n{row['data']}\n"),axis=1)
print(df.shape)
with open(f'./{temp_path}/BLASTn_queries.fasta','w') as file:
    file.write(''.join(lines))

(6028, 2)


# BlastN

!sudo apt-get install ncbi-blast+


In [32]:
# Build a nucleotide BLAST database from the input genome inside the Temp folder.
# NOTE(review): `path` is interpolated unescaped, unlike `temp_path_f` — a
# space in the experiment path would break this command.
path = f'./{experiment_dir}/{experiment}/{input_genome_name}'
!makeblastdb -in {path} -dbtype nucl -out ./{temp_path_f}/blastn_database



Building a new DB, current time: 08/28/2022 19:34:57
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/A.indica/Temp/blastn_database
New DB title:  ././Experiment/A.indica/GCA_022749755.1_ASM2274975v1_genomic.fna
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 70 sequences in 6.41305 seconds.


In [122]:
# Space-separated names of the BLASTn tabular output columns; used to label the parsed result table.
header = 'qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'

In [50]:
!blastn -query ./{temp_path}/BLASTn_queries.fasta \
        -task blastn-short \
        -out ./{temp_path}/BLASTn_result \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database \
        -word_size 7 \
        -penalty -3 \
        -reward 1 \
        -gapopen 5 \
        -gapextend 2 \
        -evalue 10 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [123]:
# Load the tab-separated BLASTn hits and name the columns after `header`.
df_blastn = pd.read_csv(f'./{temp_path}/BLASTn_result', sep='\t',header=None)
df_blastn.columns = header.split()
print(df_blastn.shape)
df_blastn.head(2)

(250364, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C5495,CM040446.1,1,20,5663467,5663486,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,40.1,...,0,100.0,1/1,1,1,plus,100,100,20,18270782
1,C5495,CM040446.1,1,20,16309395,16309414,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,40.1,...,0,100.0,1/1,1,1,plus,100,100,20,18270782


In [124]:
# alignment length adjustment
def blastn_adjust(row):
    """Extend a BLASTn hit so it spans the whole query.

    The unaligned query bases before ``qstart`` and after ``qend`` are added
    to the subject interval, clamped to ``[1, slen]``.

    On the plus strand ``sstart`` aligns with ``qstart`` and coordinates grow
    along the query. On the minus strand ``sstart`` is the larger coordinate
    and still aligns with ``qstart``, so the leading overhang is added to
    ``sstart`` and the trailing overhang is subtracted from ``send``. The
    previous code applied the two overhangs the other way round on the minus
    strand, which shifted the interval whenever they differed.

    Args:
        row: mapping-like hit (dict or pandas Series) with ``sstrand``,
            ``qstart``, ``qend``, ``qlen``, ``sstart``, ``send`` and ``slen``.

    Returns:
        The same ``row`` with ``sstart`` and ``send`` updated. Rows whose
        ``sstrand`` is neither "plus" nor "minus" are returned unchanged.
    """
    leading = row['qstart'] - 1            # query bases before the alignment
    trailing = row['qlen'] - row['qend']   # query bases after the alignment
    if row['sstrand'] == "plus":
        row['sstart'] = max(1, row['sstart'] - leading)
        row['send'] = min(row['slen'], row['send'] + trailing)
    elif row['sstrand'] == "minus":
        row['sstart'] = min(row['slen'], row['sstart'] + leading)
        row['send'] = max(1, row['send'] - trailing)
    return row
    
# Extend every hit row by row.
df_blastn = df_blastn.apply(blastn_adjust, axis=1)

In [125]:
# Maximum Nonconformity a hit may have to be kept.
threshold = 3
# Nonconformity = query bases outside the alignment + gaps + mismatches.
df_blastn['Nonconformity'] = df_blastn['qlen'] - (abs(df_blastn['qend'] - df_blastn['qstart']) + 1) + df_blastn['gaps'] + df_blastn['mismatch']
df_blastn = df_blastn[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(13984, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C5495,CM040446.1,1,20,5663467,5663486,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,40.1,...,100.0,1/1,1,1,plus,100,100,20,18270782,0
1,C5495,CM040446.1,1,20,16309395,16309414,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,40.1,...,100.0,1/1,1,1,plus,100,100,20,18270782,0


In [126]:
# Remove redundancy: among hits sharing query, subject, interval and strand,
# keep the one with the lowest Nonconformity (ties broken by lowest e-value).
df_blastn = df_blastn.sort_values(["Nonconformity", "evalue"], ascending = (True, True))
df_blastn = df_blastn.drop_duplicates(subset=['sseqid','sstart', 'qseqid', 'send','sstrand'], keep='first')
df_blastn.to_csv(f'./{temp_path}/filtered_out_blastn.csv')
print(df_blastn.shape)

(13983, 28)


# Result of the blastn to bed file

In [37]:
# Number of bases added on each side of a hit when the region is extended.
flanking_value = 200
# .copy() makes `df` an independent frame. Adding a column to the bare slice
# of `df_blastn` triggered pandas' SettingWithCopyWarning.
df = df_blastn[['qseqid', 'sseqid', 'sstart', 'send', 'sstrand','slen']].copy()
# Constant column written as the fifth field of the BED file.
df['ones'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ones'] = 1


In [38]:
def switch(row):
    """Order the interval so that ``sstart`` is not greater than ``send``.

    The two values are swapped in place when they are reversed; the row is
    returned either way.
    """
    if row['sstart'] > row['send']:
        row['sstart'], row['send'] = row['send'], row['sstart']
    return row
# Normalise every interval to start <= end.
df = df.apply(switch, axis=1)

In [39]:
def convert(inp):
    """Translate a BLAST strand word into "forward" / "reverse".

    Raises:
        Exception: ``inp`` is neither "plus" nor "minus".
    """
    # NOTE(review): this definition shadows the `convert` imported from
    # `convertor` at the top of the notebook — confirm nothing later needs
    # the imported one.
    for strand, label in (("plus", "forward"), ("minus", "reverse")):
        if inp == strand:
            return label
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
# Strand as a word ("forward"/"reverse"); written as the BED name column.
df['strand'] = df['sstrand'].apply(convert)

In [40]:
def convert2sign(inp):
    """Translate a BLAST strand word into "+" / "-".

    Raises:
        Exception: ``inp`` is neither "plus" nor "minus".
    """
    for strand, symbol in (("plus", "+"), ("minus", "-")):
        if inp == strand:
            return symbol
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
# Strand as a sign ("+"/"-"); written as the BED strand column.
df['sign'] = df['sstrand'].apply(convert2sign)

In [41]:
# Length of the hit in bases, computed while start/end are still 1-based and inclusive.
df['hit_length'] = df.apply(lambda row: abs(row['send'] - row['sstart']) + 1 ,axis=1)

## convert sstart and send from location to index (range)

In [42]:
# Shift the start by one so (sstart, send) becomes a zero-based, end-exclusive range.
df['sstart'] = df['sstart'].apply(lambda x: x - 1)

In [43]:
# Bases available before the hit: the full flank, or fewer near the start of the sequence.
df['downstream_flanking'] = df['sstart'].apply(lambda x:  flanking_value if x > flanking_value else x)

In [44]:
# Bases available after the hit: the full flank, or fewer near the end of the sequence.
df['upstream_flanking'] = df.apply(lambda row:  flanking_value if (row['send']+flanking_value) <= row['slen'] else row['slen'] - row['send'],axis=1)

In [45]:
# Offset of the hit inside the extended sequence: the flank before the hit on
# "+", the flank after it on "-".
# NOTE(review): this presumably relies on `bedtools getfasta -s` returning the
# reverse complement for "-" records — confirm.
df['hit_start'] = df.apply(lambda row: row['downstream_flanking'] if row['sign'] == "+" else row['upstream_flanking'],axis=1)

In [46]:
# End offset of the hit inside the extended sequence (start offset + hit length).
df['hit_end'] = df.apply(lambda row: row['downstream_flanking'] + row['hit_length'] if row['sign'] == "+" else row['upstream_flanking'] + row['hit_length'],axis=1)

In [47]:
# Extend the interval by the flank on both sides, clamped to [0, slen].
df['sstart'] = df['sstart'].apply(lambda x: max(x - flanking_value, 0))
df['send'] = df.apply(lambda row: min(row['send'] + flanking_value , row['slen']),axis=1)

In [48]:
# Tag in the form ">seqid:start-end(sign)"; the upper/lower-case section matches
# it (without the ">") against the headers of the bedtools output.
df['tag'] = df.apply(lambda row: f">{row['sseqid']}:{row['sstart']}-{row['send']}({row['sign']})",axis=1)
df['reformated_tag'] = df['tag'].apply(lambda t: reformat(t))
# The index is written too, so the file gains an unnamed first column when read back.
df[['tag', 'reformated_tag', 'hit_start', 'hit_end']].to_csv(f'./{temp_path}/hit_index_info.csv')#, index=False)

In [49]:
# Pipe-separated tag ">seqid|sign|start-end|hit_start-hit_end" with 1-based
# starts, stored together with the query cluster that produced the hit.
df['location_tag'] = df.apply(lambda row: f">{row['sseqid']}|{row['sign']}|{row['sstart'] + 1}-{row['send']}|{row['hit_start']+1}-{row['hit_end']}",axis=1)
df[['location_tag','qseqid']].to_csv(f'./{temp_path}/pipe_seprated_location_list.csv',index=False,sep='\t')

In [50]:
# Six-column, tab-separated BED file without header, consumed by bedtools below.
df[['sseqid','sstart','send','strand','ones', 'sign']].to_csv(f'./{temp_path}/extension_index.bed', 
        index=False, header=False, sep="\t")

# Extension


In [51]:
# !sudo apt-get install bedtools

In [52]:
!bedtools getfasta -fi {input_genome_path} -fo ./{temp_path}/extended_original.txt -s -bed ./{temp_path}/extension_index.bed
!rm input_genome.fna.fai

rm: cannot remove 'input_genome.fna.fai': No such file or directory


In [53]:
# todo: remove duplicated
'''
df = fasta_to_df("./Temp/extended.txt")
df = df.drop_duplicates(subset=['tag'], keep='first')
df_to_fasta(df,"./Temp/extended.txt")
len(df['tag'].unique())
''';

# Convert hit region to upper case and other region to lower case

In [54]:
# Extended sequences from bedtools and the hit offsets saved earlier.
ext = fasta_to_df(f'./{temp_path}/extended_original.txt')
info = pd.read_csv(f'./{temp_path}/hit_index_info.csv')
# Drop the leading ">" so the tag has the same form as the FASTA headers parsed by fasta_to_df.
info['tag'] = info['tag'].apply(lambda x: x[1:])
print(info.shape)
info.head(2)

(13983, 5)


Unnamed: 0.1,Unnamed: 0,tag,reformated_tag,hit_start,hit_end
0,114306,CM040440.1:7209019-7209443(-),>CM0404401_7209019-7209443_-_,200,224
1,232281,CM040440.1:20398572-20398996(-),>CM0404401_20398572-20398996_-_,200,224


In [55]:
# Both tables are sorted by tag and given a key made of the tag plus the row
# position after sorting, so repeated tags still get distinct join keys.
ext = ext.sort_values(by=['tag']).reset_index()
ext['help_tag'] = ext.apply(lambda r: r['tag'] + str(r.name),axis=1)
del ext['tag']

info = info.sort_values(by=['tag']).reset_index()
info['help_tag'] = info.apply(lambda row: row['tag']+ str(row.name),axis=1)
def redefined_tag(row):
    """Rewrite a "seqid:start-end(sign)" tag in pipe-separated form.

    The result is "seqid|sign|start+1-end|hit_start+1-hit_end", i.e. both
    start positions are shifted by one.
    """
    tag = row['tag']
    seq_id = tag.partition(':')[0]
    span = tag.rpartition(':')[2].partition('(')[0]
    [sstart, send] = span.split('-')
    sign = tag.rpartition('(')[2].partition(')')[0]
    first_base = int(sstart) + 1
    hit_range = f"{row['hit_start']+1}-{row['hit_end']}"
    return f"{seq_id}|{sign}|{first_base}-{send}|{hit_range}"
# Replace the tag by its pipe-separated form, then pair each sequence with its hit offsets.
info['tag'] = info.apply(lambda row: redefined_tag(row),axis=1)
ext = pd.merge(ext,info,how='inner', on='help_tag')

def emphasis_hit(row):
    """Lower-case the sequence and upper-case the hit region.

    The hit region is the slice ``hit_start:hit_end`` of ``row['data']``.
    """
    chars = list(row['data'].lower())
    window = slice(row['hit_start'], row['hit_end'])
    # Assigning a string to a list slice inserts it character by character.
    chars[window] = ''.join(chars[window]).upper()
    return ''.join(chars)
    
# Mark the hit inside every extended sequence and save the result as FASTA.
ext['data'] = ext.apply(lambda row: emphasis_hit(row),axis=1)
#ext = ext.drop_duplicates(subset=['data'], keep='first')
df_to_fasta(ext[['tag','data']],f"./{temp_path}/extended_modified.txt")

# Protein coding elimination [Download nr]

In [None]:
# Download the NCBI nr protein FASTA into the current directory.
!wget https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz

# Protein coding elimination [Diamond]

In [57]:
#!wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
#!tar xzf diamond-linux64.tar.gz

In [None]:
# Build the diamond database from ./NR/nr; the blastx cell below reads it as ./Temp/diamond_output.dmnd.
# NOTE(review): these paths are hard-coded relative to the notebook and do not use `temp_path`.
!./diamond makedb --in ./NR/nr -d ./Temp/diamond_output

In [93]:
!./diamond blastx -d ./Temp/diamond_output.dmnd \
                  -q ./Temp/extended_modified.txt \
                  -o ./Temp/diamond_matches.tsv \
                  -p 22

/bin/bash: ./diamond: No such file or directory


In [None]:
'''
Total time = 3093.4s
Reported 100956 pairwise alignments, 100956 HSPs.
4758 queries aligned.
'''

In [56]:
# Load the diamond matches and collect the ids of all queries that matched a protein.
dmn = pd.read_csv(f"./{temp_path}/diamond_matches.tsv", sep='\t', header=None)
dmn.columns = 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'.split(' ')
coding_seq = dmn['qseqid'].unique()

In [57]:
def clear(inp):
    """Strip a leading "reverse::" or "forward::" from ``inp`` if present."""
    if inp[:9] in ("reverse::", "forward::"):
        return inp[9:]
    return inp
# Normalise the matched ids, then split the extended sequences into those
# without a protein match (non-coding) and those with one (coding).
coding_seq = pd.Series(coding_seq).apply(lambda x : clear(x))

ext = fasta_to_df(f'./{temp_path}/extended_modified.txt')
print(f'total:      {ext.shape[0]}')
non_coding = ext[~ext['tag'].isin(coding_seq)]
print(f'non_coding: {non_coding.shape[0]}')
df_to_fasta(non_coding,f'./{temp_path}/extended_modified_non_coding.txt')
coding = ext[ext['tag'].isin(coding_seq)]
print(f'coding:     {coding.shape[0]}')
df_to_fasta(coding,f'./{temp_path}/extended_modified_coding.txt')

total:      13983
non_coding: 9225
coding:     4758


# RNA 2d prediction

## Mfold

In [None]:
'''
# installation
!wget http://www.unafold.org/download/mfold-3.6.tar.gz
!tar -xvf ./mfold-3.6.tar.gz; rm ./mfold-3.6.tar.gz
%cd ./mfold-3.6
!./configure
!make
!make install
%cd ..
!sudo apt install texlive-font-utils
''';

In [59]:
counter = 0
base = f"{result_path}/secondary_structure/mfold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df(f'./{temp_path}/extended_modified_non_coding.txt')

for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/SEQ.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1    
    #if(counter >= 100):
      #  break

rm: cannot remove './Experiment/A.indica/Result/secondary_structure/mfold/': No such file or directory


In [60]:
%%capture
remove_lock = False
def run_mfold(tag):
    tag = reformat(tag)
    %cd {base + tag}
    !mfold  SEQ="SEQ.FASTA" T=22   
    if(not remove_lock):
        !find . -name "SEQ*" -not -name "*.ct" -not -name "*.pdf" -not -name "*SEQ.FASTA" -not -type d -delete
    %cd {current_path}

if __name__ == '__main__':        
    pool = mp.Pool(mp.cpu_count() - 3)      
    pool.map(run_mfold, df['tag'])  

## Mxfold2

In [20]:
#!wget https://github.com/keio-bioinformatics/mxfold2/releases/download/v0.1.1/mxfold2-0.1.1.tar.gz
#!pip3 install mxfold2-0.1.1.tar.gz
#!rm mxfold2-0.1.1.tar.gz

In [18]:
!mxfold2 predict {temp_path}/extended_modified_non_coding.txt > {result_path}/secondary_structure/mxfold2_result.txt

/bin/bash: mxfold2: command not found
/bin/bash: deactivate: command not found


In [18]:
# Read the mxfold2 output and split every record into sequence ('data') and
# structure plus energy ('bracket').
df = fasta_to_df(f'{result_path}/secondary_structure/mxfold2_result.txt')
df = df.apply(lambda row: bracket_row(row) , axis=1)
df.head(2)

Unnamed: 0,tag,data,bracket
0,CM040440.1|-|1005657-1006076|201-220,gtatttataattttatgttttacttaatttataagtaattaagagg...,.................................................
1,CM040440.1|-|10072337-10072758|201-222,tcgtttttatcgtaaaatttattattttttaagaccctttttgtga...,.................................................


In [19]:
base = f"{result_path}/secondary_structure/mxfold2/"
!rm -r {base}
!mkdir -p {base}
for index, row in df.iterrows():    
    if(not os.path.exists(base + reformat(row['tag']))):
        os.makedirs(base + reformat(row['tag']))        
    tag = reformat(row['tag'])
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        bracket = row['bracket'].split(' ')[0]
        deltaG = row['bracket'].split(' ')[1]
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG)
        file.write(ct)    

rm: cannot remove '../experiment/A.indica/Result/secondary_structure/mxfold2/': No such file or directory


## Vienna package

In [None]:
#!wget https://www.tbi.univie.ac.at/RNA/download/ubuntu/ubuntu_20_04/viennarna_2.4.18-1_amd64.deb -O viennarna.deb
#!sudo dpkg -i ./viennarna.deb
#!sudo apt-get -f install
#!rm viennarna.deb

In [22]:
base = f"{result_path}/secondary_structure/viennarna/"
!rm -r {base}
!rm {result_path}/secondary_structure/viennarna_result.txt
!mkdir -p {base}

rm: cannot remove '../experiment/A.indica/Result/secondary_structure/viennarna_result.txt': No such file or directory


In [23]:
# Fold all non-coding sequences with RNAfold at 22 degrees (-T 22), without
# PostScript output (--noPS); the text output is redirected into the viennarna folder.
#%cd {base}
!RNAfold --jobs=0 --infile {temp_path}/extended_modified_non_coding.txt  --noPS -T 22 > {base}/viennarna_result.txt
#%cd {current_path}

In [62]:
# Read the RNAfold output, convert u/U to t/T, and split every record into
# sequence ('data') and structure plus energy ('bracket').
df = fasta_to_df(f'{result_path}/secondary_structure/viennarna/viennarna_result.txt')
df['data'] = df['data'].apply(lambda x: x.replace("u", "t").replace("U", "T"))
df = df.apply(lambda row: bracket_row(row) , axis=1)
print(df.shape)
df.head(2)

(9225, 3)


Unnamed: 0,tag,data,bracket
0,CM040440.1|-|1005657-1006076|201-220,gtatttataattttatgttttacttaatttataagtaattaagagg...,(((((((...(((((((((((.(((((((.......)))))))..)...
1,CM040440.1|-|10072337-10072758|201-222,tcgtttttatcgtaaaatttattattttttaagaccctttttgtga...,..(..((((((((((((....(((.....)))......))))))))...


In [63]:
# Write one CT file per RNAfold prediction into its own folder.
for index, row in df.iterrows():
    tag = reformat(row['tag'])
    os.makedirs(base + tag, exist_ok=True)
    fields = row['bracket'].split(' ')
    bracket = fields[0]
    # RNAfold may pad the energy with spaces, e.g. "( -1.20)"; glue the
    # remaining fields together and drop the parentheses.
    deltaG = "".join(fields[1:]).replace("(", "").replace(")", "")
    try:
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG, False)
    except Exception:
        # Report and skip the record. The file is opened only after a
        # successful conversion, so no empty .ct file is left for the
        # CTAnalizer section to pick up.
        print("This bracket structure is not allowed: ", row['bracket'], "\n")
        continue
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        file.write(ct)

## ContraFold

In [18]:
#!wget http://contra.stanford.edu/contrafold/contrafold_v2_02.tar.gz
#!tar -xvzf contrafold_v2_02.tar.gz && rm contrafold_v2_02.tar.gz
#%cd contrafold/src
#!make clean
#!make 
# Two files had to be changed to make it compilable: utility.hpp and optimization.c++

In [19]:
counter = 0
base = f"{result_path}/secondary_structure/contrafold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df(f'{temp_path}/extended_modified_non_coding.txt')

for index, row in tqdm(df.iterrows()):    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/{tag}.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1        

0it [00:00, ?it/s]

In [20]:
def run_contrafold(tag):        
    """Predict the structure of one sequence with ContraFold and store it as CT.

    Steps, all inside the folder ``{base}{tag}/``:
    1. run contrafold on ``<tag>.FASTA`` and write ``<tag>.dot``;
    2. drop the ">structure" lines and the header line from the .dot file;
    3. run RNAeval (-T 22) on it and write ``<tag>.dotdg``;
    4. rebuild ``<tag>.dot`` as header + RNAeval output;
    5. parse that file and write ``<tag>.ct`` via bracket_to_ct.

    Reads the module-level ``base``; the local names ``header`` and ``df``
    shadow the notebook-level variables of the same name.
    """
    tag = reformat(tag)            
    #%cd ../software/contrafold/src
    !../software/contrafold/src/contrafold predict "{base}{tag}/{tag}.FASTA" > "{base}{tag}/{tag}.dot"
    with open(f"{base}{tag}/{tag}.dot", 'r') as file:
        text = file.read()
    # Remove the ">structure" marker lines written by contrafold.
    text = [l for l in text.split("\n") if l[:len(">structure")] != ">structure"]    
    # First remaining line is kept aside as the record header.
    header = text[0]
    with open(f"{base}{tag}/{tag}.dot", 'w') as file:
        file.write('\n'.join(text[1:]))    
    !RNAeval  "{base}{tag}/{tag}.dot" -T 22 > "{base}{tag}/{tag}.dotdg"
    with open(f"{base}{tag}/{tag}.dotdg", 'r') as file:
        text = file.read()
    with open(f"{base}{tag}/{tag}.dot", 'w') as file:
        file.write(header + "\n" + text)    
    
    df = fasta_to_df(f'{base}{tag}/{tag}.dot')
    df = df.apply(lambda row: bracket_row(row) , axis=1)        
    # The tag is re-derived from the header read back from the .dot file.
    tag = reformat(df['tag'][0])
    with open(f'{base}{tag}/{tag}.ct','w') as file:
        bracket = df['bracket'][0].split(' ')[0]                
        deltaG = df['bracket'][0].split(' ')[1:]   
        # Join the energy fields and strip the surrounding parentheses.
        deltaG = "".join(deltaG).replace(" ", "").replace("(", "").replace(")", "")
        ct = bracket_to_ct(df['tag'][0], df['data'][0], bracket, deltaG, False)
        file.write(ct)    
    #!rm ../..{base[1:]}{tag}/{tag}.dot
    #!rm ../..{base[1:]}{tag}/{tag}.dotdg
    #!rm ../..{base[1:]}{tag}/{tag}.FASTA
    #%cd {current_path}        

if __name__ == '__main__':
    # cpu_count() - 1 is zero on a single-core machine, which makes mp.Pool
    # raise ValueError, so keep at least one worker. The context manager
    # releases the worker processes once the map finished.
    with mp.Pool(max(1, mp.cpu_count() - 1)) as pool:
        pool.map(run_contrafold, df['tag'])



# CTAnalizer

In [20]:
# Keep only the sequences for which at least one .ct file exists in the folder
# of the selected secondary-structure method.
base = f"{result_path}/secondary_structure/{ss_method}/"
df = fasta_to_df(f'./{temp_path}/extended_modified_non_coding.txt')
index_list =[]
for index, row in df.iterrows():    
    tag = reformat(row['tag'])    
    if(len(glob.glob(f'{base + tag}/*.ct')) != 0):
        index_list.append(index)
# The collected labels are used as positions; fasta_to_df builds the frame without an explicit index.
df = df.iloc[index_list,:]
print(df.shape)

(9225, 2)


In [21]:
def run(tag, path, extra):
    """Analyse one CT file with ``get_row`` and return its result.

    Errors raised by ``get_row`` propagate to the caller. The try/except that
    used to follow the first ``return`` could never execute and was removed;
    the runtime behaviour is unchanged.
    """
    return get_row(tag, path, extra)
        
def get_df_by_tag(tag , extra=0):
    """Run ``run`` on every .ct file found for ``tag``.

    Looks inside ``{base}{reformat(tag)}/``, skips files whose name ends in
    ``SEQ.ct`` and applies ``run(tag, path, extra)`` to each remaining path.
    """
    folder = f'{base}{reformat(tag)}'
    wanted = [found for found in glob.glob(f'{folder}/*.ct') if not found.endswith('SEQ.ct')]
    return pd.Series(wanted).apply(lambda path: run(tag, path,extra))

# Apply on current data

In [22]:
# One row per reference cluster: the tags and sequence ids of all members of a
# cluster are joined with commas before duplicate rows are dropped.
seq2cluster = pd.read_csv(f"{temp_path}/seq2cluster.csv")
for column in ('tag', 'seqid'):
    seq2cluster[column] = seq2cluster.groupby(['cluster'])[column].transform(lambda values: ','.join(values))
seq2cluster = seq2cluster.drop_duplicates()

# Location list: the first character of every location tag is stripped.
tag2cluster = pd.read_csv(f'./{temp_path}/pipe_seprated_location_list.csv',sep='\t')
tag2cluster['location_tag'] = tag2cluster['location_tag'].apply(lambda location: location[1:])

# Attach the reference information to every location and expose it under the
# report column names.
data = pd.merge(seq2cluster, tag2cluster, how='inner', left_on='cluster', right_on='qseqid')
for report_name, source_name in (('Reference miRNA cluster', 'cluster'),
                                 ('Reference miRNA IDs', 'seqid'),
                                 ('Reference miRNA IDs and species', 'tag'),
                                 ('Reference miRNA seq', 'seq')):
    data[report_name] = data[source_name]
data = data[['location_tag','Reference miRNA cluster','Reference miRNA seq', 'Reference miRNA IDs', 'Reference miRNA IDs and species', 'confidence']]

In [23]:
# Column groups used to detect duplicated rows.
rcols_ref = [
    'Reference miRNA cluster',
    'Reference miRNA IDs',
    'Reference miRNA IDs and species',
    'Reference miRNA seq',
]
rcols_boi = ['boi seq', 'boi name', 'boi dotbracket']

# Reference columns followed by the boi columns, optionally plus the energy.
rcols = rcols_ref + rcols_boi
rcols_dg = rcols + ['delta G']

def selection(row):
    """Return True the first time a row's values are seen, False afterwards.

    The tuple of the row's values is the key in the global ``repeted`` dict;
    the stored value is the name (index label) of the first row seen.
    """
    global repeted
    signature = tuple(row)
    if signature in repeted:
        return False
    repeted[signature] = row.name
    return True

def boi_selection(row):
    """Accumulate per-boi statistics in the global ``repeted_boi`` dict.

    Rows are keyed by the tuple of their ``rcols_boi`` values. The first row
    for a key creates the entry; every later row increments the counter,
    keeps the smallest ``delta G`` and appends its reference fields to the
    comma-separated strings already stored.
    """
    global repeted_boi
    boi_key = tuple(row[rcols_boi])
    energy = row['delta G']
    entry = repeted_boi.get(boi_key)
    if entry is None:
        repeted_boi[boi_key] = {
            "counter": 1,
            "dg": energy,
            "ref clusters": row['Reference miRNA cluster'],
            "ref seq": row['Reference miRNA seq'],
            "ref ids": row['Reference miRNA IDs'],
            "ref species": row['Reference miRNA IDs and species'],
            "lock": False
        }
        return
    entry['counter'] += 1
    entry['dg'] = min(entry['dg'] , energy)
    for field, column in (('ref clusters', 'Reference miRNA cluster'),
                          ('ref seq', 'Reference miRNA seq'),
                          ('ref ids', 'Reference miRNA IDs'),
                          ('ref species', 'Reference miRNA IDs and species')):
        entry[field] += "," + row[column]

In [24]:
# Start from a clean output file: the loop below appends to it chunk by chunk.
# Done in Python (instead of `!rm`) so that a missing file is not an error and
# paths containing spaces are handled.
ct_analizer_csv = f"./{result_path}/{ss_method}_ct_analizer.csv"
if os.path.exists(ct_analizer_csv):
    os.remove(ct_analizer_csv)

chunksize = 1 * (10 ** 4)
# Keep a few cores free, but never request fewer than one worker: a value
# <= 0 (machines with four or fewer cores) makes the process pool raise.
max_workers = max(1, mp.cpu_count() - 4)
num_terminal = 5 # acceptable_terminal_structures

repeted = {}       # filled by selection(): first occurrence of every row
repeted_boi = {}   # filled by boi_selection(): per-boi aggregated references
header = True      # write the CSV header only with the first chunk
orders = None      # column order of the first chunk, enforced on later ones
arr = np.array_split(df['tag'], max(df['tag'].shape[0]//chunksize , 1))
for chunk in tqdm(arr):
    # Analyse every tag of this chunk in parallel; each call returns the rows
    # for all .ct files of one tag.
    dfs = list(process_map(get_df_by_tag, chunk, tqdm_class=tqdm, max_workers=max_workers, chunksize=5))
    chunk = pd.concat(dfs, axis=0)
    chunk = pd.merge(data, chunk, how='right', left_on='location_tag', right_on='seq name')
    del chunk['location_tag']
    if header:
        orders = chunk.columns
    # Align this chunk with the column layout of the first chunk.
    for col in orders:
        if col not in chunk.columns:
            chunk[col] = np.nan
    for col in chunk.columns:
        if col not in orders:
            print(f"Error in {col}")
    chunk = chunk.reindex(columns=orders)
    chunk = chunk.replace(np.nan, '-').replace('', '-')
    # delete repeated
    selected = chunk[rcols].apply(lambda row: selection(row), axis=1)
    chunk = chunk[selected]

    # cluster refs
    chunk[rcols_dg].apply(lambda row: boi_selection(row), axis=1)
    chunk.to_csv(ct_analizer_csv, header=header, mode='a', index=False)
    header = False

rm: cannot remove './../experiment/A.indica/Result/mxfold2_ct_analizer.csv': No such file or directory


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9225 [00:00<?, ?it/s]

In [25]:
# Remove a previous clustered output, if any (the loop at the end of this cell
# appends). Unlike `!rm`, a missing file is not reported as an error.
ct_analizer_clustered_csv = f"./{result_path}/{ss_method}_ct_analizer_clustered.csv"
if os.path.exists(ct_analizer_clustered_csv):
    os.remove(ct_analizer_clustered_csv)
def isKeepCluster(row):
    """Decide whether a row survives the per-boi clustering.

    Rows whose ``boi name`` is "-" are dropped. A boi seen only once is kept.
    For a boi seen several times, only the first row whose ``delta G`` equals
    the stored minimum is kept; the entry's ``lock`` flag is set so that later
    rows with the same energy are rejected.
    """
    global repeted_boi
    energy = row['delta G']
    if row["boi name"] == "-":
        return False
    entry = repeted_boi[tuple(row[rcols_boi])]
    if entry['counter'] == 1:
        return True
    if entry['dg'] != energy or entry['lock']:
        return False
    entry['lock'] = True
    return True


def makeCluster(row):
    """Replace a row's reference columns with the merged references of its boi.

    The entry for the row's ``rcols_boi`` tuple is looked up in the global
    ``repeted_boi``. When the boi was seen more than once, each accumulated
    comma-separated reference string is normalised, de-duplicated and joined
    with ", ", then written into the row (and stored back in the entry).
    Rows of a boi seen only once are returned unchanged.

    De-duplication keeps first-seen order (``dict.fromkeys``) rather than
    going through ``set``, so the output does not depend on the per-process
    string hash seed.
    """
    tuple_row_boi = tuple(row[rcols_boi])
    value = repeted_boi[tuple_row_boi]
    if value['counter'] != 1:
        for ref_c in ['ref clusters', 'ref seq', 'ref ids', 'ref species']:
            # Drop the spaces around separators, then split into items.
            items = value[ref_c].replace(' ,', ',').replace(', ', ',').split(',')
            value[ref_c] = ", ".join(dict.fromkeys(items))
        row['Reference miRNA cluster'] = value["ref clusters"]
        row['Reference miRNA seq'] = value["ref seq"]
        row['Reference miRNA IDs'] = value["ref ids"]
        row['Reference miRNA IDs and species'] = value["ref species"]
    return row


# Filter and merge the analysed rows chunk by chunk, appending to the
# clustered CSV. The header is written with the first chunk only; without
# passing `header=header`, every appended chunk would repeat the header row
# in the middle of the file.
header = True
for chunk in tqdm(pd.read_csv(f"./{result_path}/{ss_method}_ct_analizer.csv", chunksize=10 ** 5)):
    chunk = chunk[chunk[rcols_dg].apply(lambda row:  isKeepCluster(row), axis=1)]
    chunk = chunk.apply(lambda row : makeCluster(row), axis=1)
    chunk.to_csv(f"./{result_path}/{ss_method}_ct_analizer_clustered.csv", header=header, mode='a', index=False)
    header = False

rm: cannot remove './../experiment/A.indica/Result/mxfold2_ct_analizer_clustered.csv': No such file or directory


0it [00:00, ?it/s]

# Filters

In [26]:
# Remove a previous level-1 filter output, if any, then run the filter on the
# clustered CT-analysis table. Unlike `!rm`, a missing file is not an error.
level1_filter_csv = f"./{result_path}/{ss_method}_result_level1_filter.csv"
if os.path.exists(level1_filter_csv):
    os.remove(level1_filter_csv)
filter_run(input_file=f"./{result_path}/{ss_method}_ct_analizer_clustered.csv", output_file=level1_filter_csv)

rm: cannot remove './../experiment/A.indica/Result/mxfold2_result_level1_filter.csv': No such file or directory


1it [00:00,  5.24it/s]


In [27]:
# Load the stored `mu` and `std` objects; both are passed to preprocessing()
# in a later cell (presumably per-feature mean / standard deviation used for
# normalisation — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load these files
# from a trusted source.
with open('../data/classifier/mu.pickle', 'rb') as handle:
    mu = pickle.load(handle)    
with open('../data/classifier/std.pickle', 'rb') as handle:
    std = pickle.load(handle)

In [28]:
# Number of rows that passed the level-1 filter.
filter1 = pd.read_csv(f"./{result_path}/{ss_method}_result_level1_filter.csv")
len(filter1)

40

In [29]:
# Reload the level-1 result, convert it and build the classifier features
# (only the first of the three values returned by preprocessing() is used).
filter1 = convert(pd.read_csv(f"{result_path}/{ss_method}_result_level1_filter.csv"))
feature, _, _ = preprocessing(filter1, mu, std)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[c].replace([np.inf], m, inplace=True)


In [30]:
# Score every row with the stored classifier; column 1 of the prediction
# matrix is kept as the 'pred' value.
model = load_model('../data/classifier/model.h5')
pred = pd.DataFrame(model.predict(feature))[1]
filter1['pred'] = pred



In [31]:
# Keep the rows scored above the classifier threshold and store them.
candidates = filter1[filter1['pred'] > classifier_threshold].reset_index(drop=True)
print(candidates.shape)
len(candidates['hit seq'].unique())
candidates.to_csv(f"./{result_path}/{ss_method}_candidates.csv", index=False)

(0, 185)


In [32]:
# Thresholds handed to postprocess(); keys are the option names it expects.
config = dict(
    delta_g_min=-999,
    delta_g_max=0,
    hit_len_min=20,
    hit_len_max=24,
    hit_complementarity_percentage_min=0.7,
    hit_complementarity_percentage_max=1.0,
    number_of_terminal_structure_min=1,
    number_of_terminal_structure_max=3,
    boi_gc_content_min=25,
    boi_gc_content_max=75,
    num_of_linking_residues_min=10,
    num_of_linking_residues_max=200,
    hit_gc_content_percentage_min=25,
    hit_gc_content_percentage_max=75,
    precursor_mfei_min=0.0,
    precursor_mfei_max=3.0,
    border_line_mismatch_max=100,
    border_line_bulge_max=100,
    border_line_internal_max=100,
    total_num_of_nonmatching_positions=5,
    total_num_of_mismached_positions=100,
    total_num_of_positions_in_bulges_and_loops=3,
    max_allowed_mismatch_size_in_hit_region=3,
    max_allowed_bulge_size_in_hit_region=2,
    max_allowed_internal_loop_size_in_hit_region=3,
    max_allowed_hsbl_ssbl_size=7,
    minimum_required_clear_region=15,
    acceptable_num_for_hit_locations_in_bulges_or_loops=3,
    acceptable_num_for_unmatched_locations_in_hit_region=3,
    delete_if_mature_duplex_involvement_in_apical_loop='YES',
    border_line_structure_allowance='1 END ONLY',
)

# Post-process the candidates file, then reload the result for inspection.
postprocessed_csv = f"{result_path}/{ss_method}_candidates_postprocessed.csv"
postprocess(input_file=f"{result_path}/{ss_method}_candidates.csv",
            output_file=postprocessed_csv,
            config=config)
candidates = pd.read_csv(postprocessed_csv)
print(candidates.shape)
len(candidates['hit seq'].unique())

0it [00:00, ?it/s]


KeyError: "None of [Index(['mismatch type', 'mismatch size'], dtype='object')] are in the [columns]"

In [None]:
# Store the candidates that did not survive post-processing: a candidate is
# identified by its sequence name concatenated with its ct name.
candidates = pd.read_csv(f"./{result_path}/{ss_method}_candidates.csv")
candidates_post = pd.read_csv(f"./{result_path}/{ss_method}_candidates_postprocessed.csv")
candidate_keys = candidates['seq name'] + candidates['ct name']
surviving_keys = candidates_post['seq name'] + candidates_post['ct name']
rejected = candidates[~candidate_keys.isin(surviving_keys)]
rejected.to_csv(f"./{result_path}/{ss_method}_candidates_rejected_in_postprocessed.csv", index=False)

# Cluster JSC

In [35]:
# Work on a copy: the following cells add columns to `result`, which would
# otherwise silently modify `candidates_post` through the shared reference.
result = candidates_post.copy()
print(result.shape)
result.head(2)

(141, 185)


Unnamed: 0,Reference miRNA cluster,Reference miRNA seq,Reference miRNA IDs,Reference miRNA IDs and species,confidence,seq name,ct name,ct,pdf,hit start,...,acceptable num for hit locations in bulges or loops mayers,acceptable num for unmatched locations in hit region mayers * 2,acceptable num for unmatched locations in hit region mayers,total num of mismached positions,total num of nonmatching positions,total num of positions in bulges and loops,mature duplex involvement in apical loop,sum border proximal,sum border distal,pred
0,"C5727, C5548, C5785, C5791","AUUGGACUGAAGGGAGCUCC, CUUGGAUUGAAGGGAGCUCC, CU...","mdm-miR159a, sly-miR319a, cpa-miR159b, mdm-miR...",ppt-miR319a MIMAT0003133 Physcomitrella patens...,False,CM040440.1|+|16143533-16143952|201-220,Fold 00,"=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...","=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...",201,...,0.0,6.0,3.0,2.0,2.0,0.0,True,1.0,0.0,0.994842
1,"C2473, C2724, C2385, C5096, C2373, C2506, C295...","AUUGGAGUGAAGGGAGCUCUG, AUUGGAGUGAAGGGAGCUCCA, ...","cas-miR159a, vvi-miR159a, pta-miR159c, atr-miR...",ath-miR159c MIMAT0001015 Arabidopsis thaliana ...,False,CM040440.1|+|16143533-16143953|201-221,Fold 00,"=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...","=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...",201,...,0.0,6.0,3.0,2.0,2.0,0.0,True,1.0,0.0,0.995142


In [36]:
def jaccard(A, B):
    """Jaccard similarity of two inclusive integer ranges.

    ``A`` and ``B`` are strings of the form ``"start-end"``; the two bounds
    may be given in either order. The result is the number of integer
    positions shared by both ranges divided by the number of positions
    covered by at least one of them.

    The sizes are computed arithmetically instead of materialising every
    position in a set, so the cost no longer grows with the range length.
    """
    a_parts = A.split('-')
    b_parts = B.split('-')
    a1, a2 = int(a_parts[0]), int(a_parts[1])
    b1, b2 = int(b_parts[0]), int(b_parts[1])
    a_low, a_high = min(a1, a2), max(a1, a2)
    b_low, b_high = min(b1, b2), max(b1, b2)
    # Overlap of two inclusive intervals; zero when they are disjoint.
    intersection = max(0, min(a_high, b_high) - max(a_low, b_low) + 1)
    union = (a_high - a_low + 1) + (b_high - b_low + 1) - intersection
    return float(intersection) / union

In [37]:
def same2cluster(same_dict, threshold=0.8):
    """Group the ranges stored under every key into overlapping clusters.

    For each key, the distinct range strings become graph nodes; two nodes
    are linked when their ``jaccard`` similarity is at least ``threshold``.
    Every maximal clique of that graph receives the next zero-padded
    four-digit cluster id (numbering continues across keys), and each range
    is mapped to the list of ids of the cliques it belongs to.

    ``same_dict`` is modified in place (each value is replaced by the
    ``{range: [cluster ids]}`` mapping) and also returned.
    """
    counter = 1
    for key in same_dict:
        item2cluster = {}
        SET = list(set(same_dict[key]))
        G = networkx.Graph()
        # add nodes
        G.add_nodes_from(SET)
        # add edges (similarity computed once per pair)
        for i in range(0, len(SET)):
            for j in range(i + 1, len(SET)):
                similarity = jaccard(SET[i], SET[j])
                if similarity >= threshold:
                    G.add_edge(SET[i], SET[j], weight=similarity)

        # one cluster id per maximal clique
        for clique in maximal_cliques(G):
            unique = str(counter).zfill(4)
            counter += 1
            for item in clique:
                item2cluster.setdefault(item, []).append(unique)
        same_dict[key] = item2cluster
    return same_dict

## hit jaccard similarity

In [38]:
hit_threshold = 1
same_strand_hit = {}
rcols_hit = ['chromosome', 'sign', 'hit position on chromosome']

def same_strand(row):
    """Collect the hit position of ``row`` under its chromosome+strand key."""
    global same_strand_hit
    strand_key = f"{row['chromosome']}{row['sign']}"
    position = row['hit position on chromosome']
    same_strand_hit.setdefault(strand_key, []).append(position)

result[rcols_hit].apply(same_strand, axis=1)
same_strand_hit = same2cluster(same_strand_hit, threshold=hit_threshold)

def _f(row):
    """Return the hit cluster ids assigned to the row's hit position."""
    strand_key = f"{row['chromosome']}{row['sign']}"
    return same_strand_hit[strand_key][row['hit position on chromosome']]

result['hit cluster number'] = result[rcols_hit].apply(_f, axis=1)

In [39]:
rcols_boi = ['boi seq', 'boi name', 'boi dotbracket']
boi_threshold = 0.8
same_strand_boi = {}

def _boi_key_and_hit(row):
    """Return ``(grouping key, hit range)`` for a row, or None without a boi.

    The boi name is split on '|': the first three fields form the location
    part of the key, the fourth field is the hit range.
    """
    seq = row['boi seq'].lower()
    name = row['boi name']
    dotbracket = row['boi dotbracket'].lower()
    if pd.isna(name) or name == "-":
        return None
    fields = name.split('|')
    location = fields[0] + fields[1] + fields[2]
    hit = fields[3]
    return f'{seq}{dotbracket}{location}', hit

def same_strand(row):
    """Collect the hit range of ``row`` under its boi grouping key."""
    global same_strand_boi
    located = _boi_key_and_hit(row)
    if located is None:
        return
    key, hit = located
    same_strand_boi.setdefault(key, []).append(hit)


result[rcols_boi].apply(same_strand, axis=1)

same_strand_boi = same2cluster(same_strand_boi, threshold=boi_threshold)

def _f(row):
    """Return the boi cluster ids of the row (None for rows without a boi)."""
    located = _boi_key_and_hit(row)
    if located is None:
        return
    key, hit = located
    return same_strand_boi[key][hit]

result['boi cluster number'] = result[rcols_boi].apply(_f, axis=1)

In [40]:
precursor_threshold = 0.8
same_strand_precursor = {}
rcols_pre = ['precursor seq', 'precursor name', 'precursor dotbracket']

def _precursor_key_and_hit(row):
    """Return ``(grouping key, hit range)`` for a row, or None without a precursor.

    The precursor name is split on '|': the first three fields form the
    location part of the key, the fourth field is the hit range.
    """
    seq = row['precursor seq'].lower()
    name = row['precursor name']
    dotbracket = row['precursor dotbracket'].lower()
    if pd.isna(name) or name == "-":
        return None
    fields = name.split('|')
    location = fields[0] + fields[1] + fields[2]
    hit = fields[3]
    return f'{seq}{dotbracket}{location}', hit

def same_strand(row):
    """Collect the hit range of ``row`` under its precursor grouping key."""
    global same_strand_precursor
    located = _precursor_key_and_hit(row)
    if located is None:
        return
    key, hit = located
    same_strand_precursor.setdefault(key, []).append(hit)


result[rcols_pre].apply(same_strand, axis=1)
same_strand_precursor = same2cluster(same_strand_precursor, threshold=precursor_threshold)

def _f(row):
    """Return the precursor cluster ids of the row (None without a precursor)."""
    located = _precursor_key_and_hit(row)
    if located is None:
        return
    key, hit = located
    return same_strand_precursor[key][hit]

result['precursor cluster number'] = result[rcols_pre].apply(_f, axis=1)

In [41]:
# Number the distinct hit sequences in order of first appearance (0001, 0002, ...)
# and tag every row with the number of its hit sequence.
hit_unique = result['hit seq'].unique()
hit2cluster = {hit: str(rank).zfill(4) for rank, hit in enumerate(hit_unique, start=1)}
result['identical hit cluster'] = result['hit seq'].apply(lambda hit: hit2cluster[hit])

In [42]:
# Seed region: positions seed_start..seed_end of the hit (1-based, inclusive).
seed_start = 2
seed_end = 13
seed_slice = slice(seed_start - 1, seed_end)
result['seed region'] = result['hit seq'].apply(lambda hit: hit[seed_slice])

In [43]:
result.to_csv(f"./{result_path}/{ss_method}_candidates_postprocessed_clustered.csv",index=False)
!zip -r ./{result_path}/{ss_method}_candidates_postprocessed_clustered.zip ./{result_path}/{ss_method}_candidates_postprocessed_clustered.csv

  adding: ../experiment/A.indica/Result/contrafold_candidates_postprocessed_clustered.csv (deflated 89%)


# seq2cluster

In [44]:
# Reload the clustered candidates and the full sequence-to-cluster table.
clustered_input = f"{result_path}/{ss_method}_candidates_postprocessed_clustered.csv"
seq2cluster_input = f"{temp_path}/seq2cluster.csv"
result = pd.read_csv(clustered_input)
seq2cluster = pd.read_csv(seq2cluster_input)

In [45]:
# Every reference cluster id mentioned by at least one result row.
cluster_list = {
    cluster_id
    for clusters in result['Reference miRNA cluster']
    for cluster_id in clusters.split(", ")
}

In [46]:
def get(cluster):
    """Return the ``seq`` value of the first ``seq2cluster`` row of ``cluster``."""
    matching = seq2cluster["cluster"] == cluster
    return seq2cluster[matching].iloc[0]['seq']

# One FASTA record per involved cluster: header is the cluster id, body is the
# cluster's sequence.
fasta = "".join(f">{cluster}\n{get(cluster)}\n" for cluster in cluster_list)

with open(f"{result_path}/{ss_method}_involved clusters seq.fasta", "w") as file:
    file.write(fasta)

In [47]:
# Keep only the catalog rows of clusters that appear in the results.
is_involved = seq2cluster['cluster'].isin(cluster_list)
seq2cluster = seq2cluster[is_involved]
seq2cluster.to_csv(f"{result_path}/{ss_method}_involved clusters catalog.csv")

In [48]:
def get(cluster):
    """Return the ``seq`` value of the first ``seq2cluster`` row of ``cluster``."""
    matching = seq2cluster["cluster"] == cluster
    return seq2cluster[matching].iloc[0]['seq']

def getCluster(row):
    """Return the first reference cluster whose sequence equals the hit.

    The row's ", "-separated cluster ids are tried in order; each cluster's
    sequence (with "U" replaced by "T") is compared with ``row['hit seq']``.
    Returns the string "none" when no cluster matches.
    """
    for cluster_id in row['Reference miRNA cluster'].split(", "):
        reference = get(cluster_id).replace("U", "T")
        if reference == row['hit seq']:
            return cluster_id
    return "none"
        
# Reference cluster whose sequence is identical to the predicted hit ("none" if no cluster matches).
result["seq loyalty"] = result.apply(getCluster, axis=1)

In [49]:
result.to_csv(f"{result_path}/{ss_method}_candidates_postprocessed_clustered.csv", index=None)
!zip -r ./{result_path}/{ss_method}_candidates_postprocessed_clustered.zip ./{result_path}/{ss_method}_candidates_postprocessed_clustered.csv

updating: ../experiment/A.indica/Result/contrafold_candidates_postprocessed_clustered.csv (deflated 89%)


# Final report 

In [50]:
# Load the clustered candidates, best classifier score first.
result = (
    pd.read_csv(f"{result_path}/{ss_method}_candidates_postprocessed_clustered.csv")
    .sort_values('pred', ascending=False)
)

In [51]:
# Keep one row per (precursor name, structure) pair; the shape is printed
# before and after to show how many rows were removed.
precursor_identity = ["precursor name", "precursor dotbracket"]
print(result.shape)
result.drop_duplicates(subset=precursor_identity, keep="first", inplace=True)
print(result.shape)

(141, 191)
(141, 191)


In [52]:
def _confidence_label(value):
    """Map a raw confidence value to "confident" / "not confident".

    Plain truthiness is not enough after a CSV round trip: the column may
    hold the strings "True"/"False" or "-" (all truthy) or NaN (also truthy).
    Values that are already labels are returned unchanged, so re-running the
    cell does not relabel every row.
    """
    if isinstance(value, str):
        text = value.strip()
        if text in ("confident", "not confident"):
            return text
        is_confident = text.lower() in ("true", "1", "yes")
    elif pd.isna(value):
        is_confident = False
    else:
        is_confident = bool(value)
    return "confident" if is_confident else "not confident"

result["confidence"] = result["confidence"].apply(_confidence_label)

In [53]:
# Report layout: source column -> published column name, in output order.
final_columns = {
    "identical hit cluster": "Predicted microRNA number",
    "pred": "Probability",
    "hit seq": "microRNA sequence",
    "chromosome": "chromosome",
    "sign": "Strand",
    "hit position on chromosome": "microRNA position on the chromosome",
    "precursor name": "precursor unique ID",
    "precursor seq": "precursor sequence",
    "precursor seq visualization": "precursor visualization",
    "Reference miRNA cluster": "Reference miRNA cluster",
    "Reference miRNA IDs and species": "Reference miRNA IDs and species",
    "confidence": "Reference miRNA confidence",
    "seq loyalty": "predicted microRNA is identical with",
}
result = result[list(final_columns.keys())]
result.columns = list(final_columns.values())
# Group rows by predicted microRNA, best probability first within a group.
result = result.sort_values(by=["Predicted microRNA number", "Probability"], ascending=[True, False])

In [54]:
# Write the final report.
final_result_csv = f"{result_path}/{ss_method}_final_result.csv"
result.to_csv(final_result_csv, index=False)

# BlastX

In [None]:
# Build a protein BLAST database (-dbtype prot) from ./NR/nr.
# NOTE(review): the paths in this cell are relative to the notebook's working
# directory and do not use experiment_dir / temp_path — confirm this is intended.
!makeblastdb -in ./NR/nr -dbtype prot -out ./NR/nr_database

#!head -n 100 ./Temp/extended_modified.txt > ./input_blastx.txt

# Run blastx for ./input_blastx.txt against that database, writing tabular
# output (outfmt 6) with the listed fields to ./Temp/BlastX/blastx.
!blastx -query ./input_blastx.txt \
        -db ./NR/nr_database \
        -out ./Temp/BlastX/blastx \
        -num_threads 20 \
        -evalue 1e-3 \
        -outfmt "6 qseqid sseqid qstart qend evalue bitscore score length frames qframe qcovs qcovhsp staxids"

# Load the header-less, tab-separated result and name its columns after the
# -outfmt fields above.
blx = pd.read_csv('./Temp/BlastX/blastx', sep='\t', header=None)
blx.columns = 'qseqid sseqid qstart qend evalue bitscore score length frames qframe qcovs qcovhsp staxids'.split(' ')
# Distinct query ids that appear in the BLAST result (presumably the sequences
# treated as protein-coding — TODO confirm against later cells).
coding_seq = blx['qseqid'].unique()