# Config

In [1]:
experiment = "O.sativa_positive"
input_genome_name = "GCF_001433935.1.fna"

experiment_dir = "./Experiment"

input_genome_path = f'{experiment_dir}/{experiment}/{input_genome_name}'

temp_path = f"{experiment_dir}/{experiment}/Temp"
result_path = f"{experiment_dir}/{experiment}/Result"

# Common

In [2]:
import json
import time
from subprocess import Popen, PIPE, STDOUT
import math
import numpy as np
import pandas as pd
import hashlib
import requests
import os, sys, subprocess
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import urllib.parse
import glob
import os
import sys
import networkx
from networkx.algorithms.clique import find_cliques as maximal_cliques
from ast import literal_eval
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
sys.path.append("./src/")
from ct_analizer import get_row
from filter1 import filter1_run
from filter2 import filter2
from utils import adjust, bracket_to_ct, fasta_to_df, df_to_fasta,reformat, reformatCT,get_ct_data, ct2dot_bracket, is_nested

In [3]:
if(not os.path.exists(temp_path)):
    os.mkdir(temp_path)
    
if(not os.path.exists(result_path)):
    os.mkdir(result_path)
    
current_path = os.getcwd()

In [4]:
def bracket_row(row):    
    s = row['data']
    index = min(s.find('.'), s.find('('))
    data = row['data']
    row['data'] = data[0:index]
    row['bracket'] = data[index:]
    return row

In [5]:
def isChar(c):
    return not c in [' ', '-', '|']

def adjust(text,n=7):
    text = str(text)    
    return " " * (n - len(text)) + text

def isMature(c1, c2):
    if(c1.isupper() and isChar(c1)):
        return True
    if(c2.isupper() and isChar(c2)):
        return True
    return False

def get_number(strText):
    number = 0 
    for i in range(len(strText[2])):
        if(isChar(strText[2][i]) or isChar(strText[3][i])):
            number += 1    
        if(isChar(strText[4][i])):
            number += 1    
        if(isChar(strText[5][i]) or isChar(strText[6][i])):
            number += 1     
    return number

def get5pInfo(strText):
    has_5p = False
    s_5p = 0
    f_5p = 0
    counter = 0
    for i in range(len(strText[2])):
        if(isChar(strText[2][i]) or isChar(strText[3][i])):
            counter += 1            
            if(isMature(strText[2][i] ,strText[3][i])):
                if(not has_5p):
                    has_5p = True
                    s_5p = counter
                else:
                    continue
            elif(has_5p):
                f_5p = counter - 1 
                break
    return [has_5p, s_5p, f_5p]

def get3pInfo(strText, number):
    has_3p = False
    s_3p = 0
    f_3p = 0
    counter = 0
    for i in range(len(strText[6])):
        if(isChar(strText[6][i]) or isChar(strText[5][i])):
            counter += 1            
            if(isMature(strText[6][i] ,strText[5][i])):
                if(not has_3p):
                    has_3p = True
                    s_3p = counter
                else:
                    continue
            elif(has_3p):
                f_3p = counter - 1 
                break        
    return [has_3p, number - f_3p + 1, number - s_3p + 1]    

def convertor(strText):
    index  = []
    values = []
    nucludid = []
    def add(i, v, n):        
        index.append(i)
        values.append(v)
        nucludid.append(n)
            
    f_counter = 1
    r_counter = 0
                    
    number = get_number(strText)    
    [FO, FI, Mid, RI, RO] = strText[2:]
    # main loop    
    for i in range(len(Mid)):           
        
        if(isChar(FO[i]) and FI[i] == " " and Mid[i] == " " and RI[i] == " "  and RO[i] == "-" ):
            add(f_counter, 0, FO[i])                
            f_counter += 1                      
                
        elif(FO[i] == "-" and FI[i] == " " and Mid[i] == " " and RI[i] == " "  and isChar(RO[i])):                                            
            add(number - r_counter, 0, RO[i])                
            r_counter += 1                      
        
        elif(isChar(FO[i]) and not isChar(FI[i]) and not isChar(Mid[i]) and not isChar(RI[i]) and isChar(RO[i])):
            add(f_counter, 0, FO[i])                    
            add(number - r_counter, 0, RO[i])                                
            f_counter += 1        
            r_counter += 1                              
            
        elif(not isChar(FO[i]) and isChar(FI[i]) and Mid[i] == "|" and isChar(RI[i]) and not isChar(RO[i])):        
            add(f_counter, number - r_counter, FI[i])                        
            add(number - r_counter, f_counter, RI[i])                
            f_counter += 1        
            r_counter += 1                  
            
        elif(not isChar(FO[i]) and isChar(FI[i]) and isChar(Mid[i]) and isChar(RI[i]) and not isChar(RO[i])):
            add(f_counter, 0, FI[i])
            add(f_counter + 1, 0, Mid[i])
            add(number - r_counter, 0, RI[i])
            f_counter += 2
            r_counter += 1
            
        elif(not isChar(FO[i]) and isChar(FI[i]) and Mid[i] == " " and isChar(RI[i]) and not isChar(RO[i])):                
            add(f_counter, 0, FI[i])                    
            add(number - r_counter, 0, RI[i])                    
            f_counter += 1        
            r_counter += 1                  
            
        elif(not isChar(FO[i]) and not isChar(FI[i]) and isChar(Mid[i]) and not isChar(RI[i]) and not isChar(RO[i])):                
            add(f_counter, 0, Mid[i])                                
            f_counter += 1                    

    # sort indexs and other base on indexes
    inds = np.array(index).argsort()    
    nucludid = np.array(nucludid)[inds]       
    return ''.join(nucludid)

# Download data from Mirbase

In [6]:
directory = './miRBase'
base = "https://www.mirbase.org/ftp/CURRENT"        

In [7]:
mature = fasta_to_df(f'{directory}/mature.fa')
mature_high_conf = fasta_to_df(f'{directory}/mature_high_conf.fa')
mature['trim tag'] = mature['tag'].apply(lambda line: ' '.join(line.split(' ')[:2]))
mature['confidence'] = mature['trim tag'].isin(mature_high_conf['tag'])

In [8]:
mature['organism'] = mature['tag'].apply(lambda x: x[:3])
print(mature.shape)
mature.head(2)

(48885, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
0,cel-let-7-5p MIMAT0000001 Caenorhabditis elega...,UGAGGUAGUAGGUUGUAUAGUU,cel-let-7-5p MIMAT0000001,True,cel
1,cel-let-7-3p MIMAT0015091 Caenorhabditis elega...,CUAUGCAAUUUUCUACCUUACC,cel-let-7-3p MIMAT0015091,True,cel


In [9]:
organism = pd.read_csv(f'./{directory}/organisms.txt',sep='\t')
organism.columns = [c.replace('#','') for c in organism.columns] # remove sharp from columns
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [10]:
items = list(organism['tree'].unique())
items.sort(key=len)
items

['Viruses;',
 'Mycetozoa;',
 'Alveolata;',
 'Metazoa;Porifera;',
 'Metazoa;Cnidaria;',
 'Viridiplantae;Chlorophyta;',
 'Viridiplantae;Embryophyta;',
 'Viridiplantae;Coniferophyta;',
 'Viridiplantae;Magnoliophyta;',
 'Metazoa;Bilateria;Deuterostoma;',
 'Chromalveolata;Heterokontophyta;',
 'Metazoa;Bilateria;Ecdysozoa;Nematoda;',
 'Metazoa;Bilateria;Lophotrochozoa;Annelida;',
 'Metazoa;Bilateria;Lophotrochozoa;Nemertea;',
 'Metazoa;Bilateria;Lophotrochozoa;Mollusca;',
 'Viridiplantae;Magnoliophyta;monocotyledons;',
 'Metazoa;Bilateria;Deuterostoma;Hemichordata;',
 'Metazoa;Bilateria;Deuterostoma;Echinodermata;',
 'Metazoa;Bilateria;Lophotrochozoa;Brachiopoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Hexapoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Crustacea;',
 'Metazoa;Bilateria;Lophotrochozoa;Platyhelminthes;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Chelicerata;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Mandibulata;',
 'Viridiplantae;Magnoliophyta;eudicotyledons;Poaceae;',
 'M

In [11]:
selectedTree = organism[organism['tree'].apply(lambda x: "Viridiplantae;" in x)]
print(selectedTree.shape)
selectedTree.head(5)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352
68,ppt,PPT,Physcomitrella patens,Viridiplantae;Embryophyta;,3218
69,smo,SMO,Selaginella moellendorffii,Viridiplantae;Embryophyta;,88036
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702


In [12]:
selectedTree = selectedTree[selectedTree['name'] == "Oryza sativa"]

In [13]:
selected = mature[mature['organism'].isin(selectedTree['organism'])]
print(selected.shape)
selected.head(1)

(738, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
1068,osa-miR156a MIMAT0000618 Oryza sativa miR156a,UGACAGAAGAGAGUGAGCAC,osa-miR156a MIMAT0000618,False,osa


In [128]:
with open(f'./{directory}/miRNA.str') as file:
    strRNA = file.read()
    lines = strRNA.split('\n')
nuclutide = []
hit_start = []
hit_end = []
for i in tqdm(range(0,len(lines), 8)):
    tag = lines[i]
    if(tag[1:4] == "osa"):
        strText = lines[i:(i+7)]        
        if(len(strText) == 7):
            nuc = convertor(strText).lower()            
            number = get_number(strText)    
            [has_5p, s_5p, f_5p] = get5pInfo(strText)    
            [has_3p, s_3p, f_3p] = get3pInfo(strText, number)
            if(has_5p):
                if(s_5p > f_5p):                    
                    continue
            if(has_3p):
                if(s_3p > f_3p):                    
                    continue
            if(has_5p):
                nuclutide.append(nuc)
                hit_start.append(s_5p)                
                hit_end.append(f_5p)                
            if(has_3p):
                nuclutide.append(nuc)                
                hit_start.append(s_3p)
                hit_end.append(f_3p)                                                      
precursor = pd.DataFrame({"data": nuclutide, "hit_start": hit_start, "hit_end": hit_end})
precursor['hit'] = precursor.apply(lambda row: row['data'][(row['hit_start']-1):(row['hit_end'])].upper(), axis=1)
precursor['confidence'] = precursor['hit'].apply(lambda hit: selected['confidence'][selected['data'] == hit].sum() > 0)
precursor['tag'] = precursor.index
print(precursor.shape)
precursor.head()

  0%|          | 0/38590 [00:00<?, ?it/s]

(713, 6)


Unnamed: 0,data,hit_start,hit_end,hit,confidence,tag
0,ggagggugacagaagagagugagcacacgugguuguuuccuugcau...,7,26,UGACAGAAGAGAGUGAGCAC,True,0
1,uugucuugagaggggaagagaucucuauggguuuuggaggucugac...,43,62,UGACAGAAGAGAGUGAGCAC,True,1
2,uugucuugagaggggaagagaucucuauggguuuuggaggucugac...,110,130,GCUCACUCUCUAUCUGUCAGC,True,2
3,ggaggaagagaggggugagaggugaggcugacagaagagagugagc...,29,48,UGACAGAAGAGAGUGAGCAC,True,3
4,ggaggaagagaggggugagaggugaggcugacagaagagagugagc...,100,121,GCUCACUUCUCUCUCUGUCAGC,True,4


In [129]:
precursor.to_csv(f'{temp_path}/precursor_info.csv')
df_to_fasta(precursor, f'{temp_path}/mature_microRNA_queries.fasta')

# Remove redundant

## cdhit-est

In [16]:
!./software/cdhit/cd-hit-est -i ./{temp_path}/mature_microRNA_queries.fasta  -o ./{temp_path}/NR_mature_microRNA_queries.fasta \
    -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS 0 \
    -AS 99999999 -s 0 -S 0

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 23 2021, 21:45:39
Command: ./software/cdhit/cd-hit-est -i
         ././Experiment/O.sativa_plus/Temp/mature_microRNA_queries.fasta
         -o
         ././Experiment/O.sativa_plus/Temp/NR_mature_microRNA_queries.fasta
         -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS
         0 -AS 99999999 -s 0 -S 0

Started: Tue Jul 12 15:30:24 2022
                            Output                              
----------------------------------------------------------------
total seq: 713
longest and shortest : 541 and 59
Total letters: 109981
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 1 X 12M = 12M
Table           : 1 X 16M = 16M
Miscellaneous   : 0M
Total           : 29M

Table limit with the given memory limit:
Max number of representatives: 2881052
Max number of word counting entries: 96349954

comparing sequences from          0  to        713

      713  finished        587 

## reformat

In [41]:
with open(f'./{temp_path}/NR_mature_microRNA_queries.fasta.clstr','r') as file:
    text = file.read()
lines = [line for line in text.split('\n') if len(line) > 0]
cluster = []
seqid = []
last_cluster = ""
for l in lines:
    if(l[0]=='>'):        
        last_cluster = l.replace('>Cluster ',"C")
    else:        
        cluster.append(last_cluster)
        seqid.append(l.split(', >')[1].split('...')[0])                
seq2cluster = pd.DataFrame({'seqid': seqid,'cluster': cluster})
print(seq2cluster.shape)
seq2cluster.head(2)    

(713, 2)


Unnamed: 0,seqid,cluster
0,499,C0
1,697,C1


In [18]:
df = fasta_to_df(f"./{temp_path}/mature_microRNA_queries.fasta")
seq2cluster = pd.merge(df,seq2cluster,how="inner",left_on='tag',right_on="seqid")
precursor['tag'] = precursor['tag'].apply(lambda x : str(x))
seq2cluster = pd.merge(seq2cluster, precursor,how="inner",left_on='seqid',right_on="tag")[['cluster','seqid','tag_x', 'confidence']]
seq2cluster.columns = ['cluster','seqid','tag', 'confidence']
print(seq2cluster.shape)
display(seq2cluster.head(2))
seq2cluster.to_csv(f'./{temp_path}/seq2cluster.csv',index=False)

(713, 4)


Unnamed: 0,cluster,seqid,tag,confidence
0,C446,0,0,True
1,C167,1,1,True


In [19]:
# todo: sorted first by cluster then by seqid
seq2cluster.sort_values("cluster").head(2)

Unnamed: 0,cluster,seqid,tag,confidence
499,C0,499,499,False
697,C1,697,697,False


In [20]:
df = fasta_to_df(f"./{temp_path}/NR_mature_microRNA_queries.fasta")
df['tag'] = df['tag'].apply(lambda x : str(x))
df = pd.merge(df,seq2cluster,how="inner",left_on='tag',right_on="seqid")[['cluster','data']]

lines = []
df.apply(lambda row: lines.append(f">{row['cluster']}\n{row['data']}\n"),axis=1)
print(df.shape)
with open(f'./{temp_path}/BLASTn_queries.fasta','w') as file:
    file.write(''.join(lines))

(587, 2)


# BlastN

In [21]:
path = f'./{experiment_dir}/{experiment}/{input_genome_name}'
!makeblastdb -in {path} -dbtype nucl -out ./{temp_path}/blastn_database



Building a new DB, current time: 07/12/2022 15:30:34
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_plus/Temp/blastn_database
New DB title:  ././Experiment/O.sativa_plus/GCF_001433935.1.fna
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 58 sequences in 5.91597 seconds.


In [22]:
header = 'qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'

In [23]:
!blastn -query ./{temp_path}/BLASTn_queries.fasta \
        -out ./{temp_path}/BLASTn_result \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [60]:
df_blastn = pd.read_csv(f'./{temp_path}/BLASTn_result', sep='\t',header=None)
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(1035831, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C446,NC_029256.1,1,100,22524246,22524147,GGAGGGTGACAGAAGAGAGTGAGCACACGTGGTTGTTTCCTTGCAT...,GGAGGGTGACAGAAGAGAGTGAGCACACGTGGTTGTTTCCTTGCAT...,5.79e-45,181.0,...,0,100.0,1/-1,1,-1,minus,100,100,100,43270923
1,C446,NC_029256.1,2,95,4665998,4666093,GAGGGTGACAGAAGAGAGTGAGCACACGTGGTTGTTTCCTTGCATA...,GAGGCTGACAGAAGAGAGTGAGCACACATGGTGACTTTCTTGCATG...,2.31e-18,93.3,...,6,82.65,1/1,1,1,plus,100,94,100,43270923


In [61]:
threshold = 0
df_blastn['Nonconformity'] = df_blastn['qlen'] - (abs(df_blastn['qend'] - df_blastn['qstart']) + 1) + df_blastn['gaps'] + df_blastn['mismatch']
df_blastn = df_blastn[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(590, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C446,NC_029256.1,1,100,22524246,22524147,GGAGGGTGACAGAAGAGAGTGAGCACACGTGGTTGTTTCCTTGCAT...,GGAGGGTGACAGAAGAGAGTGAGCACACGTGGTTGTTTCCTTGCAT...,5.79e-45,181.0,...,100.0,1/-1,1,-1,minus,100,100,100,43270923,0
60,C167,NC_029256.1,1,176,4666341,4666516,TTGTCTTGAGAGGGGAAGAGATCTCTATGGGTTTTGGAGGTCTGAC...,TTGTCTTGAGAGGGGAAGAGATCTCTATGGGTTTTGGAGGTCTGAC...,6.479999999999999e-86,318.0,...,100.0,1/1,1,1,plus,100,100,176,43270923,0


In [62]:
# alignment length adjustment
def blastn_adjust(row):
    if(row['sstrand'] == "plus"):        
        row['sstart'] = max(1, row['sstart'] - (row['qstart'] - 1))
        row['send'] = min(row['slen'], row['send'] + (row['qlen'] - row['qend']))
    if(row['sstrand'] == "minus"):        
        row['send'] = max(1, row['send'] - (row['qstart'] - 1 ))
        row['sstart'] = min(row['slen'], row['sstart'] + (row['qlen'] - row['qend'] ))
    return row
    
df_blastn = df_blastn.apply(lambda row: blastn_adjust(row), axis=1)

In [63]:
# remore redundancy and hold best one base of Nonconformity value
df_blastn = df_blastn.sort_values(["Nonconformity", "evalue"], ascending = (True, True))
df_blastn = df_blastn.drop_duplicates(subset=['sseqid','sstart', 'qseqid', 'send','sstrand'], keep='first')
df_blastn.to_csv(f'./{temp_path}/filtered_out_blastn.csv')
print(df_blastn.shape)

(590, 28)


In [64]:
df_blastn = pd.merge(df_blastn, seq2cluster, how='inner', left_on='qseqid', right_on='cluster')
df_blastn = pd.merge(df_blastn, precursor, how='inner', left_on='seqid', right_on='tag')
print(df_blastn.shape)
df_blastn.head()

(727, 30)
(727, 36)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,slen,Nonconformity,seqid,cluster,data,hit_start,hit_end,hit,confidence,tag
0,C4,NC_029260.1,1,471,26049315,26049785,CAGAGTGTCTTCGCCAAAATGCCATCCCGAACAGAAATGCCACTTC...,CAGAGTGTCTTCGCCAAAATGCCATCCCGAACAGAAATGCCACTTC...,0.0,850.0,...,29958434,0,278,C4,cagagugucuucgccaaaaugccaucccgaacagaaaugccacuuc...,439,459,CGUCUGGGAUGGCAUUUUGGC,False,278
1,C14,NC_029257.1,1,374,8521746,8522119,CCATATCTCTAGTCTAATATGGTATCCGAGCTTATTGGTTAGTTGC...,CCATATCTCTAGTCTAATATGGTATCCGAGCTTATTGGTTAGTTGC...,0.0,675.0,...,35937250,0,319,C14,ccauaucucuagucuaauaugguauccgagcuuauugguuaguugc...,341,364,AGCUCUGAUACCAUGUUAGAUUAG,False,319
2,C19,NC_029266.1,1,360,2524994,2525353,CTTAGATTATTTCATAGTTCGGTTTGTAGAATACCATCTTCGAGTT...,CTTAGATTATTTCATAGTTCGGTTTGTAGAATACCATCTTCGAGTT...,0.0,650.0,...,29021106,0,435,C19,cuuagauuauuucauaguucgguuuguagaauaccaucuucgaguu...,331,351,UGGGGUUCUACAAACCGAACU,False,435
3,C18,NC_029259.1,1,361,28585568,28585208,AATCTTGGTTTAATGTATCATCCGATGTCCAAAGTGCAGATGACGC...,AATCTTGGTTTAATGTATCATCCGATGTCCAAAGTGCAGATGACGC...,0.0,652.0,...,35502694,0,441,C18,aaucuugguuuaauguaucauccgauguccaaagugcagaugacgc...,292,315,UUGCAUCCUCUGCACUUUGGGCCU,False,441
4,C20,NC_029267.1,1,360,4867257,4867616,CAATAGCAATGCATCAGTTACGTTTCCTACATAGTTACATGGTATC...,CAATAGCAATGCATCAGTTACGTTTCCTACATAGTTACATGGTATC...,0.0,650.0,...,27531856,0,450,C20,caauagcaaugcaucaguuacguuuccuacauaguuacaugguauc...,329,351,AGAGACUUGGCUGAUGCAUUACU,False,450


# Result of the blastn to bed file

In [131]:
flanking_value = 20
df = df_blastn[['qseqid', 'sseqid', 'sstart', 'send', 'sstrand','slen', 'hit_start', 'hit_end']]
df['ones'] = 1
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ones'] = 1


Unnamed: 0,qseqid,sseqid,sstart,send,sstrand,slen,hit_start,hit_end,ones
0,C4,NC_029260.1,26049315,26049785,plus,29958434,439,459,1
1,C14,NC_029257.1,8521746,8522119,plus,35937250,341,364,1


In [132]:
def switch(row):
    if(row['sstart'] > row['send']):        
        temp = row['sstart']
        row['sstart'] = row['send']
        row['send'] = temp
    return row
df = df.apply(lambda row: switch(row), axis=1)

In [133]:
def convert(inp):
    if(inp == "plus"):
        return "forward"
    if(inp == "minus"):
        return "reverse"
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['strand'] = df['sstrand'].apply(lambda x: convert(x))

In [134]:
def convert2sign(inp):
    if(inp == "plus"):
        return "+"
    if(inp == "minus"):
        return "-"
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['sign'] = df['sstrand'].apply(lambda x: convert2sign(x))

In [135]:
df['hit_length'] = df.apply(lambda row: abs(row['send'] - row['sstart']) + 1 ,axis=1)

## convert sstart and send from location to index (range)

In [136]:
df['sstart'] = df['sstart'].apply(lambda x: x - 1)

In [137]:
df['downstream_flanking'] = df['sstart'].apply(lambda x: flanking_value if x > flanking_value else x)

In [138]:
df['upstream_flanking'] = df.apply(lambda row: flanking_value if (row['send']+flanking_value) <= row['slen'] else row['slen'] - row['send'],axis=1)

In [139]:
df['hit_start'] = df.apply(lambda row: row['downstream_flanking'] + row['hit_start'] -1 if row['sign'] == "+" else row['upstream_flanking'] + row['hit_start'] -1, axis=1)
df['hit_end'] = df.apply(lambda row: row['downstream_flanking'] + row['hit_end'] if row['sign'] == "+" else row['upstream_flanking'] + row['hit_end'],axis=1)

In [140]:
df['sstart'] = df['sstart'].apply(lambda x: max(x - flanking_value, 0))
df['send'] = df.apply(lambda row: min(row['send'] + flanking_value , row['slen']),axis=1)

In [141]:
df['tag'] = df.apply(lambda row: f">{row['sseqid']}:{row['sstart']}-{row['send']}({row['sign']})",axis=1)
df['reformated_tag'] = df['tag'].apply(lambda t: reformat(t))
df[['tag', 'reformated_tag', 'hit_start', 'hit_end']].to_csv(f'./{temp_path}/hit_index_info.csv')#, index=False)

In [142]:
df['location_tag'] = df.apply(lambda row: f">{row['sseqid']}|{row['sign']}|{row['sstart'] + 1}-{row['send']}|{row['hit_start']+1}-{row['hit_end']}",axis=1)
df[['location_tag','qseqid']].to_csv(f'./{temp_path}/pipe_seprated_location_list.csv',index=False,sep='\t')

In [143]:
df[['sseqid','sstart','send','strand','ones', 'sign']].to_csv(f'./{temp_path}/extension_index.bed', 
        index=False, header=False, sep="\t")

# Extention

In [144]:
!bedtools getfasta -fi {input_genome_path} -fo ./{temp_path}/extended_original.txt -s -bed ./{temp_path}/extension_index.bed
!rm input_genome.fna.fai

rm: cannot remove 'input_genome.fna.fai': No such file or directory


# Convert hit region to upper case and other region to lower case

In [145]:
ext = fasta_to_df(f'./{temp_path}/extended_original.txt')
info = pd.read_csv(f'./{temp_path}/hit_index_info.csv')
info['tag'] = info['tag'].apply(lambda x: x[1:])
print(info.shape)
info.head(2)

(727, 5)


Unnamed: 0.1,Unnamed: 0,tag,reformated_tag,hit_start,hit_end
0,0,NC_029260.1:26049294-26049805(+),>NC_0292601_26049294-26049805_+_,458,479
1,1,NC_029257.1:8521725-8522139(+),>NC_0292571_8521725-8522139_+_,360,384


In [146]:
ext = ext.sort_values(by=['tag']).reset_index()
ext['help_tag'] = ext.apply(lambda r: r['tag'] + str(r.name),axis=1)
del ext['tag']

info = info.sort_values(by=['tag']).reset_index()
info['help_tag'] = info.apply(lambda row: row['tag']+ str(row.name),axis=1)
def redefined_tag(row):
    tag = row['tag']
    [sstart, send] = tag.split(':')[-1].split('(')[0].split('-')
    sstart = int(sstart) + 1
    sign = tag.split('(')[-1].split(')')[0]    
    return f"{tag.split(':')[0]}|{sign}|{sstart}-{send}|{row['hit_start']+1}-{row['hit_end']}"
info['tag'] = info.apply(lambda row: redefined_tag(row),axis=1)
ext = pd.merge(ext,info,how='inner', on='help_tag')

def emphasis_hit(row):
    seq = list(row['data'].lower())            
    s = row['hit_start']
    e = row['hit_end']
    seq[s:e] = list(''.join(seq[s:e]).upper())    
    return ''.join(seq)
    
ext['data'] = ext.apply(lambda row: emphasis_hit(row),axis=1)
df_to_fasta(ext[['tag','data']],f"./{temp_path}/extended_modified.txt")

# Protein coding elimination [Download nr]

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz

# Protein coding elimination [Diamond]

In [57]:
#!wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
#!tar xzf diamond-linux64.tar.gz

In [None]:
!./diamond makedb --in ./NR/nr -d ./Temp/diamond_output

In [93]:
!./diamond blastx -d ./Temp/diamond_output.dmnd \
                  -q ./Temp/extended_modified.txt \
                  -o ./Temp/diamond_matches.tsv \
                  -p 22

/bin/bash: ./diamond: No such file or directory


In [158]:
dmn = pd.read_csv(f"./{temp_path}/diamond_matches.tsv", sep='\t', header=None)
dmn.columns = 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'.split(' ')
coding_seq = dmn['qseqid'].unique()

In [159]:
def clear(inp):
    if(inp[:9] == "reverse::"):
        return inp[9:]
    if(inp[:9] == "forward::"):
        return inp[9:]
    return inp
coding_seq = pd.Series(coding_seq).apply(lambda x : clear(x))

ext = fasta_to_df(f'./{temp_path}/extended_modified.txt')
print(f'total:      {ext.shape[0]}')
non_coding = ext[~ext['tag'].isin(coding_seq)]
print(f'non_coding: {non_coding.shape[0]}')
df_to_fasta(non_coding,f'./{temp_path}/extended_modified_non_coding.txt')
coding = ext[ext['tag'].isin(coding_seq)]
print(f'coding:     {coding.shape[0]}')
df_to_fasta(coding,f'./{temp_path}/extended_modified_coding.txt')

total:      727
non_coding: 486
coding:     241


# RNA 2d prediction

## Mfold

In [160]:
counter = 0
base = f"{result_path}/secondary_structure/mfold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df(f'./{temp_path}/extended_modified_non_coding.txt')

for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/SEQ.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1    
    #if(counter >= 100):
      #  break

rm: cannot remove './Experiment/O.sativa_plus/Result/secondary_structure/mfold/': No such file or directory


In [161]:
%%capture
remove_lock = False
def run_mfold(tag):
    tag = reformat(tag)
    %cd {base + tag}
    !mfold  SEQ="SEQ.FASTA" T=22
    if(not remove_lock):
        !find . -name "SEQ*" -not -name "*.ct" -not -name "*.pdf" -not -name "*SEQ.FASTA" -not -type d -delete
    %cd {current_path}

if __name__ == '__main__':        
    pool = mp.Pool(mp.cpu_count() - 3)      
    pool.map(run_mfold, df['tag'])  

## Mxfold2

In [None]:
#!wget https://github.com/keio-bioinformatics/mxfold2/releases/download/v0.1.1/mxfold2-0.1.1.tar.gz
#!pip3 install mxfold2-0.1.1.tar.gz
#!rm mxfold2-0.1.1.tar.gz

In [None]:
!mxfold2 predict ./extended.txt > Result/secondary_structure/mxfold2_result.txt

In [None]:
df = fasta_to_df('./Result/secondary_structure/mxfold2_result.txt')
df = df.apply(lambda row: bracket_row(row) , axis=1)
df.head(2)

In [None]:
base = "./Result/secondary_structure/mxfold2/"
!rm -r {base}
!mkdir -p {base}
for index, row in df.iterrows():    
    if(not os.path.exists(base + reformat(row['tag']))):
        os.makedirs(base + reformat(row['tag']))        
    tag = reformat(row['tag'])
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        bracket = row['bracket'].split(' ')[0]
        deltaG = row['bracket'].split(' ')[1]
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG)
        file.write(ct)    

## Vienna package

In [None]:
#!wget https://www.tbi.univie.ac.at/RNA/download/ubuntu/ubuntu_20_04/viennarna_2.4.18-1_amd64.deb -O viennarna.deb
#!sudo dpkg -i ./viennarna.deb
#!sudo apt-get -f install
#!rm viennarna.deb

In [96]:
base = f"{result_path}/secondary_structure/viennarna/"
!rm -r {base}
!rm {result_path}/secondary_structure/viennarna_result.txt
!mkdir -p {base}

rm: cannot remove 'Experiment/A.thaliana/Result/secondary_structure/viennarna/': No such file or directory
rm: cannot remove 'Experiment/A.thaliana/Result/secondary_structure/viennarna_result.txt': No such file or directory


In [102]:
#%cd {base}
!RNAfold --jobs=0 --infile {current_path}/{temp_path}/extended_modified.txt  --noPS -T 22 > {current_path}/{base}/viennarna_result.txt
#%cd {current_path}

In [107]:
df = fasta_to_df(f'{result_path}/secondary_structure/viennarna/viennarna_result.txt')
df = df.apply(lambda row: bracket_row(row) , axis=1)
print(df.shape)
df.head(2)

(17793, 3)


Unnamed: 0,tag,data,bracket
0,NC_000932.1|+|101567-101986|201-220,aaucccagggcucaacccuggacaggcgguggaaacuaccaagcuu...,.(((((((((.....)))))).((((((((((...)))))..))))...
1,NC_000932.1|+|105092-105512|201-221,aaucccgugugaaucagcaaggaccaccuugcaaggcuaaauacuc...,.....(((((......((((((....))))))....(((..(((((...


In [108]:
for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)      
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        bracket = row['bracket'].split(' ')[0]
        deltaG = row['bracket'].split(' ')[1]
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG, False)
        file.write(ct)    

In [109]:
import glob
for file in glob.glob(f"{base}*.ps"):    
    f = file[len(base):-6] # _ss.ps 
    f = reformat(f)        
    shutil.move(file, f"{base}{f}/{f}.ps")    

## ContraFold

In [None]:
#!wget http://contra.stanford.edu/contrafold/contrafold_v2_02.tar.gz
#!tar -xvzf contrafold_v2_02.tar.gz && rm contrafold_v2_02.tar.gz
#%cd contrafold/src
#!make clean
#!make 
# to file must changed to be complieable # utility.hpp and optimization.c++ files

In [None]:
counter = 0
base = f"./{result_path}/secondary_structure/contrafold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df(f'./{temp_path}/extended.txt')

for index, row in tqdm(df.iterrows()):    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/{tag}.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1        

In [None]:
def run_contrafold(tag):
    tag = reformat(tag)    
    %cd Software/contrafold/src
    !./contrafold predict ../..{base[1:]}{tag}/{tag}.FASTA > ../..{base[1:]}{tag}/{tag}.dot
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'r') as file:
        text = file.read()
    text = [l for l in text.split("\n") if l[:len(">structure")] != ">structure"]    
    header = text[0]
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'w') as file:
        file.write('\n'.join(text[1:]))    
    !RNAeval  ../..{base[1:]}{tag}/{tag}.dot -T 20 > ../..{base[1:]}{tag}/{tag}.dotdg    
    with open(f"../..{base[1:]}{tag}/{tag}.dotdg", 'r') as file:
        text = file.read()
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'w') as file:
        file.write(header + "\n" + text)    
    
    df = fasta_to_df(f'../..{base[1:]}{tag}/{tag}.dot')
    df = df.apply(lambda row: bracket_row(row) , axis=1)        
    tag = reformat(df['tag'][0])
    with open(f'../..{base[1:]}{tag}/{tag}.ct','w') as file:
        bracket = df['bracket'][0].split(' ')[0]        
        deltaG = df['bracket'][0].split(' ')[1]
        ct = bracket_to_ct(df['tag'][0], df['data'][0], bracket, deltaG, False)
        file.write(ct)    
    #!rm ../..{base[1:]}{tag}/{tag}.dot
    #!rm ../..{base[1:]}{tag}/{tag}.dotdg
    !rm ../..{base[1:]}{tag}/{tag}.FASTA
    %cd {current_path}        

if __name__ == '__main__':        
    pool = mp.Pool(mp.cpu_count() - 1)  
    pool.map(run_contrafold, df['tag'].iloc[:10])

In [None]:
'''path = 'secondary_structure/contrafold/AMWY020333941_469-893_-_/AMWY020333941_469-893_-_.dot'
!RNAeval  {path} -T 20 -v'''; 

# CTAnalizer

In [6]:
# only select those not ran before
ss_method = "mfold"
base = f"{result_path}/secondary_structure/{ss_method}/"
df = fasta_to_df(f'./{temp_path}/extended_modified_non_coding.txt')
index_list =[]
for index, row in df.iterrows():    
    tag = reformat(row['tag'])    
    if(len(glob.glob(f'{base + tag}/*.ct')) != 0):
        index_list.append(index)
df = df.iloc[index_list,:]
print(df.shape)

(486, 2)


In [7]:
def run(tag, path, extra):             
    return get_row(tag, path,extra)
    try:
        return get_row(tag, path,extra)
    except Exception as e:
        print(str(e), tag)                
        return pd.Series()
        
def get_df_by_tag(tag , extra=0):           
    ct_files = glob.glob(f'{base}{reformat(tag)}/*.ct')    
    return pd.Series(ct_files).apply(lambda path: run(tag, path,extra))    

In [8]:
get_df_by_tag("NC_029257.1|-|3280761-3280918|31-51", extra=0)
#d.to_csv('Result/d.csv')#.iloc[0,:]['full seq visualization']x

<class 'pandas.core.series.Series'> <class 'int'> <class 'int'>
<class 'pandas.core.series.Series'> <class 'int'> <class 'int'>


TypeError: '<' not supported between instances of 'int' and 'str'

# Apply on current data

In [185]:
seq2cluster = pd.read_csv(f"{temp_path}/seq2cluster.csv")
seq2cluster['tag'] = seq2cluster['tag'].apply(lambda x: str(x))
seq2cluster['tag'] = seq2cluster.groupby(['cluster'])['tag'].transform(lambda x: ','.join(x))
seq2cluster['seqid'] = seq2cluster['seqid'].apply(lambda x: str(x))
seq2cluster['seqid'] = seq2cluster.groupby(['cluster'])['seqid'].transform(lambda x: ','.join(x))
seq2cluster = seq2cluster.drop_duplicates()
tag2cluster = pd.read_csv(f'./{temp_path}/pipe_seprated_location_list.csv',sep='\t')
tag2cluster['location_tag'] = tag2cluster['location_tag'].apply(lambda x : x[1:])
data = pd.merge(seq2cluster,tag2cluster,how='inner', left_on='cluster', right_on='qseqid')
data['Reference miRNA cluster'] = data['cluster']
data['Reference miRNA IDs'] = data['seqid']
data['Reference miRNA IDs and species'] = data['tag']
data = data[['location_tag','Reference miRNA cluster', 'Reference miRNA IDs', 'Reference miRNA IDs and species','confidence']]

In [186]:
rcols_ref = ['Reference miRNA cluster',
             'Reference miRNA IDs',
             'Reference miRNA IDs and species']
rcols_boi = ['boi seq', 'boi name', 'boi dotbracket']

rcols = [*rcols_ref,
         *rcols_boi]

rcols_dg = [*rcols, 'delta G']

def selection(row):        
    global repeted
    tuple_row = tuple(row)            
    if(tuple_row not in repeted):
        repeted[tuple_row] = row.name
        return True
    return False

def boi_selection(row):    
    global repeted_boi        
    tuple_row_boi = tuple(row[rcols_boi])
    dg = row['delta G']
    if(tuple_row_boi not in repeted_boi):
        repeted_boi[tuple_row_boi] = { 
            "counter": 1,
            "dg": dg,
            "ref clusters": row['Reference miRNA cluster'],
            "ref ids": row['Reference miRNA IDs'],
            "ref species": row['Reference miRNA IDs and species'],
            "lock": False
        }
    else:            
        value =  repeted_boi[tuple_row_boi]
        value['counter'] += 1
        value['dg'] = min(value['dg'] , dg)
        value['ref clusters'] += "," + row['Reference miRNA cluster']
        value['ref ids'] += "," + row['Reference miRNA IDs']
        value['ref species'] += "," +  row['Reference miRNA IDs and species']                
        repeted_boi[tuple_row_boi] = value        

In [187]:
!rm ./{result_path}/ct_analizer.csv
chunksize = 1 * (10 ** 4)
max_workers = mp.cpu_count() - 4
num_terminal = 5 # acceptable_terminal_structures

repeted = {}
repeted_boi = {}
header = True
orders = None
arr = np.array_split(df['tag'], max(df['tag'].shape[0]//chunksize , 1))
for chunk in tqdm(arr):
    dfs = []
    for row in process_map(get_df_by_tag , chunk, tqdm_class=tqdm, max_workers=max_workers, chunksize=5):        
        dfs.append(row)
    chunk = pd.concat(dfs,axis=0)
    chunk = pd.merge(data, chunk, how='right', left_on = 'location_tag', right_on ='seq name')
    del chunk['location_tag']
    if(header):
        orders = chunk.columns
    for col in orders:
        if(col not in chunk.columns):
            chunk[col] = np.nan            
        
    for col in chunk.columns:
        if(col not in orders ):
            print(f"Error in {col}")        
    chunk = chunk.reindex(columns=orders)
    chunk = chunk.replace(np.nan, '-').replace('', '-')                    
    # delete repeated
    selected = chunk[rcols].apply(lambda row: selection(row), axis=1)
    chunk = chunk[selected]
    
    # cluster refs
    chunk[rcols_dg].apply(lambda row: boi_selection(row), axis=1)        
    chunk.to_csv(f"./{result_path}/ct_analizer.csv", header=header, mode='a', index=False)    
    header = False

rm: cannot remove '././Experiment/O.sativa_plus/Result/ct_analizer.csv': No such file or directory


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/486 [00:00<?, ?it/s]

'<=' not supported between instances of 'str' and 'int' '<=' not supported between instances of 'str' and 'int'NC_029257.1|-|3280761-3280918|31-51'<=' not supported between instances of 'str' and 'int''<=' not supported between instances of 'str' and 'int''<=' not supported between instances of 'str' and 'int' 
'<=' not supported between instances of 'str' and 'int'  NC_029256.1|+|9055583-9055799|166-187 NC_029256.1|-|2317258-2317415|31-51
NC_029257.1|+|20716154-20716300|43-64 NC_029257.1|+|34890431-34890707|171-194
NC_029256.1|-|38764269-38764411|27-48


'<=' not supported between instances of 'str' and 'int' NC_029256.1|+|26250141-26250578|334-357
'<=' not supported between instances of 'str' and 'int' NC_029256.1|-|30477592-30477749|108-128
'<=' not supported between instances of 'str' and 'int''<=' not supported between instances of 'str' and 'int'  NC_029256.1|+|16373408-16373567|116-139NC_029256.1|+|20332858-20333007|37-57
'<=' not supported between instances of 'str' and 'int'
 

NameError: name 'asa' is not defined

In [None]:
!rm ./{result_path}/ct_analizer_clustered.csv
def isKeepCluster(row):
    global repeted_boi    
    dg = row['delta G']    
    if(row["boi name"] == "-"):
        return False
    tuple_row_boi = tuple(row[rcols_boi])    
    value = repeted_boi[tuple_row_boi]
    if(value['counter'] == 1):
        return True
    if(value['dg'] != dg):        
        return False
    if(value['lock']):
        return False
    value['lock'] = True
    repeted_boi[tuple_row_boi] = value
    return True


def makeCluster(row):            
    tuple_row_boi = tuple(row[rcols_boi])    
    value = repeted_boi[tuple_row_boi]
    if(value['counter'] != 1):
        for ref_c in ['ref clusters', 'ref ids', 'ref species']:
            value[ref_c] = value[ref_c].replace(' ,', ',').replace(', ', ',')                        
            value[ref_c] = value[ref_c].split(',')
            value[ref_c] = set(value[ref_c])
            value[ref_c] = ", ".join(value[ref_c])                    
        row['Reference miRNA cluster'] = value["ref clusters"]
        row['Reference miRNA IDs'] = value["ref ids"]
        row['Reference miRNA IDs and species'] = value["ref species"]
    return row


header = True
for chunk in tqdm(pd.read_csv(f"./{result_path}/ct_analizer.csv", chunksize=10 ** 5)):
    chunk = chunk[chunk[rcols_dg].apply(lambda row:  isKeepCluster(row), axis=1)]
    chunk = chunk.apply(lambda row : makeCluster(row), axis=1)
    chunk.to_csv(f"./{result_path}/ct_analizer_clustered.csv", mode='a', index=False)
    header = False

# Filters

In [133]:
!rm ./{result_path}/result_level1_filter.csv
filter1_run(input_file=  f"./{result_path}/ct_analizer_clustered.csv",
            output_file= f"./{result_path}/result_level1_filter.csv")

rm: cannot remove '././Experiment/A.thaliana/Result/result_level1_filter.csv': No such file or directory


1it [00:01,  1.20s/it]


In [137]:
config = {'delta_g_min': -999,
          'delta_g_max': 1,
          'hit_len_min': 21,
          'hit_len_max': 21,
          'hit_complementarity_percentage_min': 0.5,
          'hit_complementarity_percentage_max': 1.0,
          'number_of_terminal_structure_min': 0,
          'number_of_terminal_structure_max': 5,
          'boi_gc_content_min': 45,
          'boi_gc_content_max': 94, 
          'num_of_linking_residues_min': 5, 
          'num_of_linking_residues_max': 159,
          'hit_gc_content_percentage_min': 37, 
          'hit_gc_content_percentage_max': 86,
          'precursor_mfei_min': 0.87,
          'precursor_mfei_max': 1.3695556794836854, 
          'border_line_mismatch_max': 0,
          'border_line_bulge_max': 0,
          'border_line_internal_max': 0,
          'total_num_of_nonmatching_positions': 5,
          'total_num_of_mismached_positions': 5,
          'total_num_of_positions_in_bulges_and_loops': 2,
          'max_allowed_mismatch_size_in_hit_region': 2,
          'max_allowed_bulge_size_in_hit_region': 1,
          'max_allowed_internal_loop_size_in_hit_region': 3,
          'max_allowed_hsbl_ssbl_size': 2,
          'minimum_required_clear_region': 0,
          'acceptable_num_for_hit_locations_in_bulges_or_loops': 2,
          'acceptable_num_for_unmatched_locations_in_hit_region': 5,
          'delete_if_mature_duplex_involvement_in_apical_loop': 'YES',
          'border_line_structure_allowance': 'NOT ACCEPTED'}

filter2(input_file  = f"./{result_path}/result_level1_filter.csv",
            output_file = f"./{result_path}/result_level2_filter.csv",
            config=config)

1it [00:04,  4.22s/it]


# Cluster JSC

In [20]:
result = pd.read_csv(f"./{result_path}/result_level2_filter.csv")
print(result.shape)
result.head(2)

(9946, 134)


Unnamed: 0,Reference miRNA cluster,Reference miRNA IDs,Reference miRNA IDs and species,confidence,seq name,ct name,ct,pdf,hit start,hit end,...,proximal closest to 17,proximal closest to 21,proximal closest to 36,distal closest to 15,distal closest to 17,distal closest to 21,distal closest to 36,Loop distal junction distance,Loop proximal junction distance,message
0,C154,mtr-miR5205b,mtr-miR5205b MIMAT0021134 Medicago truncatula ...,False,NC_029256.1|-|1012887-1013310|201-224,Fold 08,"=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...","=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...",201,224,...,"['loop=dist:15, size:3 + 2']","['loop=dist:15, size:3 + 2']","['loop=dist:15, size:3 + 2']",-,-,-,-,1.0,15.0,-
1,C145,hvu-miR1120,hvu-miR1120 MIMAT0018499 Hordeum vulgare miR1120,False,NC_029256.1|-|1012889-1013312|201-224,Fold 08,"=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...","=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...",201,224,...,"['loop=dist:13, size:3 + 2', 'loop=dist:17, si...","['loop=dist:13, size:3 + 2', 'loop=dist:17, si...","['loop=dist:13, size:3 + 2', 'loop=dist:17, si...",-,-,-,-,3.0,13.0,-


In [21]:
def jaccard(A, B):
    a1 = int(A.split('-')[0])
    a2 = int(A.split('-')[1])
    b1 = int(B.split('-')[0])
    b2 = int(B.split('-')[1])
    s1 = set([i for i in range(min(a1,a2), max(a1,a2) + 1)])
    s2 = set([i for i in range(min(b1,b2), max(b1,b2) + 1)])    
    intersection = len(s1.intersection(s2))
    union = (len(s1) + len(s2)) - intersection
    return float(intersection) / union

In [22]:
def same2cluster(same_dict, threshold=0.8):        
    counter = 1
    for key in same_dict:        
        item2cluster = {}
        SET = list(set(same_dict[key]))
        G = networkx.Graph()
        # add nodes
        for s in SET:
            G.add_node(s)
        # add edges
        for i in range(0, len(SET)):
            for j in range(i+1, len(SET)):
                if(jaccard(SET[i],SET[j]) >= threshold):        
                    G.add_edge(SET[i], SET[j], weight=jaccard(SET[i],SET[j]))
                            
        # get maximal
    
        for clique in maximal_cliques(G):    
            unique = str(counter).zfill(4)
            counter += 1
            for item in clique:
                if(item in item2cluster):
                    item2cluster[item].append(unique)
                else:
                    item2cluster[item] = [unique]    
        same_dict[key] = item2cluster
    return same_dict 

## hit jaccard similarity

In [23]:
hit_threshold = 0.8
same_strand_hit = {}

def same_strand(row):    
    global same_strand_hit
    chrom = row['chromosome']
    sign = row['sign']
    hit = row['hit position on chromosome']      
    key = f'{chrom}{sign}'
    if(key in same_strand_hit):        
        same_strand_hit[key].append(hit)
    else:
        same_strand_hit[key] = [hit]
        
rcols_hit = ['chromosome', 'sign', 'hit position on chromosome']
result[rcols_hit].apply(lambda row: same_strand(row), axis=1)
same_strand_hit = same2cluster(same_strand_hit, threshold=hit_threshold)

def _f(row):        
    key = f"{row['chromosome']}{row['sign']}"
    return same_strand_hit[f'{key}'][row['hit position on chromosome']]
result['hit cluster number'] = result[rcols_hit].apply(lambda row: _f(row),axis=1)

In [24]:
rcols_boi = ['boi seq', 'boi name', 'boi dotbracket']
boi_threshold = 0.8
same_strand_boi = {}

def same_strand(row):    
    global same_strand_boi    
    seq = row['boi seq'].lower()
    name = row['boi name']
    dotbracket = row['boi dotbracket'].lower()
    if(pd.isna(name) or name == "-"):
        return 
    name = name.split('|')    
    loction = name[0] + name[1] + name[2]
    hit = name[3]
    key = f'{seq}{dotbracket}{loction}'
    if(key in same_strand_boi):        
        same_strand_boi[key].append(hit)
    else:
        same_strand_boi[key] = [hit]
        
        
result[rcols_boi].apply(lambda row: same_strand(row), axis=1)

same_strand_boi = same2cluster(same_strand_boi, threshold=boi_threshold)

def _f(row):        
    seq = row['boi seq'].lower()
    name = row['boi name']
    dotbracket = row['boi dotbracket'].lower()
    if(pd.isna(name) or name == "-"):
        return 
    name = name.split('|')    
    loction = name[0] + name[1] + name[2]
    hit = name[3]
    key = f'{seq}{dotbracket}{loction}'        
    return same_strand_boi[f'{key}'][hit]
result['boi cluster number'] = result[rcols_boi].apply(lambda row: _f(row),axis=1)

In [25]:
precursor_threshold = 0.8
same_strand_precursor = {}
rcols_pre = ['precursor seq', 'precursor name', 'precursor dotbracket']

def same_strand(row):    
    global same_strand_precursor    
    seq = row['precursor seq'].lower()
    name = row['precursor name']
    dotbracket = row['precursor dotbracket'].lower()
    if(pd.isna(name) or name == "-"):
        return 
    name = name.split('|')    
    loction = name[0] + name[1] + name[2]
    hit = name[3]
    key = f'{seq}{dotbracket}{loction}'
    if(key in same_strand_precursor):        
        same_strand_precursor[key].append(hit)
    else:
        same_strand_precursor[key] = [hit]
        
        
result[rcols_pre].apply(lambda row: same_strand(row), axis=1)
same_strand_precursor = same2cluster(same_strand_precursor, threshold=precursor_threshold)

def _f(row):        
    seq = row['precursor seq'].lower()
    name = row['precursor name']
    dotbracket = row['precursor dotbracket'].lower()
    if(pd.isna(name) or name == "-"):
        return 
    name = name.split('|')    
    loction = name[0] + name[1] + name[2]
    hit = name[3]
    key = f'{seq}{dotbracket}{loction}'
    return same_strand_precursor[f'{key}'][hit]
result['precursor cluster number'] = result[rcols_pre].apply(lambda row: _f(row),axis=1)

In [26]:
hit2cluster = {}
hit_unique = result['hit seq'].unique()
for i in range(0, hit_unique.shape[0]):
    hit2cluster[hit_unique[i]] = str(i + 1).zfill(4)
result['identical hit cluster'] = result['hit seq'].apply(lambda hit: hit2cluster[hit])

In [27]:
seed_start = 2
seed_end = 13
result['seed region'] = result['hit seq'].apply(lambda hit: hit[seed_start-1:seed_end])

In [28]:
result.to_csv(f"./{result_path}/result_level2_filter_clustered.csv",index=False)
!zip -r ./{result_path}/result_level2_filter_clustered.zip ./{result_path}/result_level2_filter_clustered.csv

  adding: Experiment/O.sativa/Result/result_level2_filter_clustered.csv (deflated 95%)
