# Common

In [44]:
#!pip install tqdm

In [63]:
import math
import numpy as np
import pandas as pd
import requests
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import urllib.parse
import glob
import os
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [3]:
!mkdir -p Temp

In [4]:
def bracket_row(row):    
    s = row['data']
    index = min(s.find('.'), s.find('('))
    data = row['data']
    row['data'] = data[0:index]
    row['bracket'] = data[index:]
    return row

In [5]:
def adjust(text,n=7):
    text = str(text)    
    return " " * (n - len(text)) + text

In [6]:
def bracket_to_ct(tag, data, bracket, deltaG, negative_deltaG=True):    
    deltaG = deltaG.replace('(','').replace(')','')
    deltaG = float(deltaG)
    if(deltaG > 0 and negative_deltaG ): # negetive?!
        deltaG = -1 * deltaG
    stack = []
    index = np.zeros((len(bracket)), dtype = int)
    values = np.zeros((len(bracket)), dtype = int)
    for i in range(len(bracket)):
        index[i] = i + 1
        if(bracket[i] == '.'):
            values[i] = 0
        elif(bracket[i] == '('):
            stack.append(i)
        elif(bracket[i] == ')'):
            if(len(stack) == 0 ):
                print('structure error!')
            values[stack[-1]] = i + 1
            values[i]  = stack[-1] + 1
            stack.pop()
        else:
            print('structure error!')
    if(len(stack) != 0 ):
        print('structure error!')
    # body    
    ct = f"{adjust(len(data),6)} dG ={adjust(deltaG,10)} {tag}\n"   
    for i in range(len(bracket)):
        ct += f"{adjust(index[i],6)} {data[i]} {adjust(i,6)} {adjust((i+2)%(len(data)+1),6)} {adjust(values[i],6)} {adjust(index[i],7)}\n"
    return ct

In [7]:
def fasta_to_df(path):
    with open(path, 'r') as file:
        text = file.read()
    lines = [line for line in text.split('\n') if len(line) > 0]
    s = ''
    tags = []
    data = []
    for l in lines:
        if(l[0]=='>'):
            tags.append(l)        
            data.append(s)
            s = ''
        else:
            s += l    
    data.append(s)
    df = pd.DataFrame(
            {
                'tag': tags,
                'data': data[1:]
            })
    df['tag'] = df['tag'].apply(lambda x: x[1:])    
    return df

In [8]:
def df_to_fasta(df, path):
    lines = []
    df.apply(lambda row: lines.append(f">{row['tag']}\n{row['data']}\n"),axis=1)
    with open(path,'w') as file:
        file.write(''.join(lines))

In [9]:
def reformat(path):
    return path.replace('(','_').replace(')','_').replace('.','').replace(':','_')

In [10]:
def reformatCT(path):
    with open(path, 'r') as file:
        text = file.read()
    text = [l for l in text.split('\n') if len(l) > 0 ] # remove blank lines
    text = '\n'.join(text)
    text = text.replace("\t"," ")
    while("  " in text):
        text = text.replace("  ", " ")
    lines = [l for l in text.split('\n')]
    for i in range(len(lines)):
        if(lines[i][0] == " "):
            lines[i] = lines[i][1:]
        if(lines[i][-1] == " "):
            lines[i] = lines[i][:-1]
    text = '\n'.join(lines)
    return text

In [11]:
def get_ct_data(ct):
    ct = "\n".join(ct.split('\n')[1:])
    df = pd.read_csv(StringIO(ct), sep=" ", header=None)               
    nucleotide = df.iloc[:,1]
    index = df.iloc[:,5]
    values = df.iloc[:,4]
    return [nucleotide, index, values]

In [12]:
def ct2dot_bracket(path):
    [nucleotide, index, values] = get_ct_data(reformatCT(path))
    text = ''.join(nucleotide) + "\n"
    watch = []
    for i, v in zip(index,values):
        if(v == 0):
            text += '.'
        else:
            if( v not in watch):
                text += '('
                watch.append(i)
            if( v in watch):
                text += ')'
    return text

In [13]:
def is_nested(index, values):
    max_value = max(index) + 10 # inf
    for i, v in zip(index, values):
        if(v < max_value and v != 0):
            max_value  = v
        if(i >= max_value):
            max_value = max(index) + 10 # inf
        if(v > max_value):
            return False               
    return True

In [14]:
'''ct = reformatCT('./secondary_structure/spot_rna/AMWY020598281_2832-3256_+_/AMWY020598281_2832-3256_+_.ct')
[nucleotide, index, values] = get_ct_data(ct)
print(is_nested( index,  values))
''';

### rename tag of input genome to new tag id

# Download dataset

In [None]:
'''
from Bio import Entrez
Entrez.email = "abolhasani.eliya@gmail.com"     
with Entrez.esearch(db='nucleotide', term="Arabidopsis thaliana") as handle:
    result = Entrez.read(handle)

print(result)
genome_ids = result['IdList']

for genome_id in genome_ids:
    print(genome_id)
    record = Entrez.efetch(db="nucleotide", id=genome_id, rettype="fasta", retmode="text")        
    with open(f'{genome_id}.fasta', 'w') as f:
        f.write(record.read())
    break
''';
'''
from Bio import Entrez
Entrez.email = "abolhasani.eliya@gmail.com"     
record = Entrez.efetch(db="nucleotide", id="NC_054143.4", rettype="fasta", retmode="text")        
with open(f'data.fasta', 'w') as f:
    f.write(record.read())
''';

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/439/995/GCA_000439995.3_AzaInd2.1/GCA_000439995.3_AzaInd2.1_genomic.fna.gz

In [None]:
!gzip -d ./GCA_000439995.3_AzaInd2.1_genomic.fna.gz

# Download data from Mirbase

In [15]:
directory = 'miRBase_driven_data'

In [None]:
base = "https://www.mirbase.org/ftp/CURRENT"        
!rm -r {directory}
!mkdir -p {directory}
!wget {base}/aliases.txt.gz -P ./{directory}/       ; gzip -d ./{directory}/aliases.txt.gz 
!wget {base}/hairpin.fa.gz -P ./{directory}/           ; gzip -d ./{directory}/hairpin.fa.gz 
!wget {base}/hairpin_high_conf.fa.gz -P ./{directory}/ ; gzip -d ./{directory}/hairpin_high_conf.fa.gz 
!wget {base}/mature.fa.gz -P ./{directory}/            ; gzip -d ./{directory}/mature.fa.gz 
!wget {base}/mature_high_conf.fa.gz -P ./{directory}/  ; gzip -d ./{directory}/mature_high_conf.fa.gz
!wget {base}/miRNA.str.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.str.gz 
!wget {base}/miRNA.xls.gz -P ./{directory}/            ; gzip -d ./{directory}/miRNA.xls.gz 
!wget {base}/organisms.txt.gz -P ./{directory}/        ; gzip -d ./{directory}/organisms.txt.gz

In [16]:
df = fasta_to_df(f'./{directory}/mature.fa')
#df = fasta_to_df(f'./{directory}/hairpin_high_conf.fa')
#df = fasta_to_df('./Data/mature_high_conf.fa')
df['organism'] = df['tag'].apply(lambda x: x[:3])
print(df.shape)
df.head(2)

(48885, 3)


Unnamed: 0,tag,data,organism
0,cel-let-7-5p MIMAT0000001 Caenorhabditis elega...,UGAGGUAGUAGGUUGUAUAGUU,cel
1,cel-let-7-3p MIMAT0015091 Caenorhabditis elega...,CUAUGCAAUUUUCUACCUUACC,cel


In [17]:
organism = pd.read_csv(f'./{directory}/organisms.txt',sep='\t')
organism.columns = [c.replace('#','') for c in organism.columns] # remove sharp from columns
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [18]:
items = list(organism['tree'].unique())
items.sort(key=len)
items

['Viruses;',
 'Mycetozoa;',
 'Alveolata;',
 'Metazoa;Porifera;',
 'Metazoa;Cnidaria;',
 'Viridiplantae;Chlorophyta;',
 'Viridiplantae;Embryophyta;',
 'Viridiplantae;Coniferophyta;',
 'Viridiplantae;Magnoliophyta;',
 'Metazoa;Bilateria;Deuterostoma;',
 'Chromalveolata;Heterokontophyta;',
 'Metazoa;Bilateria;Ecdysozoa;Nematoda;',
 'Metazoa;Bilateria;Lophotrochozoa;Annelida;',
 'Metazoa;Bilateria;Lophotrochozoa;Nemertea;',
 'Metazoa;Bilateria;Lophotrochozoa;Mollusca;',
 'Viridiplantae;Magnoliophyta;monocotyledons;',
 'Metazoa;Bilateria;Deuterostoma;Hemichordata;',
 'Metazoa;Bilateria;Deuterostoma;Echinodermata;',
 'Metazoa;Bilateria;Lophotrochozoa;Brachiopoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Hexapoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Crustacea;',
 'Metazoa;Bilateria;Lophotrochozoa;Platyhelminthes;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Chelicerata;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Mandibulata;',
 'Viridiplantae;Magnoliophyta;eudicotyledons;Poaceae;',
 'M

In [19]:
selectedTree = organism[organism['tree'].apply(lambda x: "Viridiplantae;" in x)]
print(selectedTree.shape)
selectedTree.head(5)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352
68,ppt,PPT,Physcomitrella patens,Viridiplantae;Embryophyta;,3218
69,smo,SMO,Selaginella moellendorffii,Viridiplantae;Embryophyta;,88036
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702


In [20]:
selected = df[df['organism'].isin(selectedTree['organism'])]
print(selected.shape)
selected.head(1)

(10414, 3)


Unnamed: 0,tag,data,organism
316,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,ath


In [21]:
df_to_fasta(selected,'./Temp/mature_microRNA_queries.fasta')

In [22]:
# use this cell for extracting str files for hairpin.fa
'''
tags = list(selected['tag'].apply(lambda x : x.split(' ')[0]))
with open(f'./{directory}/miRNA.str', 'r') as file:
    text = file.read()
text = text.split('\n')

result = ''
for i in range(0,len(text),8):
    if(text[i].split(' ')[0][1:] in tags):
        result += '\n'.join(text[i:i+8]) + "\n"        
with open(f'./high_conf_hairpin.str', 'w') as file:
    file.write(result)
''';

# Remove redundant

## cdhit-est

In [23]:
!cdhit/cd-hit-est -i ./Temp/mature_microRNA_queries.fasta  -o ./Temp/NR_mature_microRNA_queries.fasta \
    -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS 0 \
    -AS 99999999 -s 0 -S 0 

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 23 2021, 21:45:39
Command: cdhit/cd-hit-est -i
         ./Temp/mature_microRNA_queries.fasta -o
         ./Temp/NR_mature_microRNA_queries.fasta -c 1 -r 0 -G 1
         -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS 0 -AS 99999999
         -s 0 -S 0

Started: Sun Feb  6 21:48:47 2022
                            Output                              
----------------------------------------------------------------
total seq: 10414
longest and shortest : 28 and 17
Total letters: 222978
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 1 X 12M = 12M
Table           : 1 X 16M = 16M
Miscellaneous   : 0M
Total           : 30M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 96149440

comparing sequences from          0  to      10414
..........    10000  finished       5817  clusters

    10414  finished       6028  clusters

Approximate

## reformat

In [24]:
with open('./Temp/NR_mature_microRNA_queries.fasta.clstr','r') as file:
    text = file.read()
lines = [line for line in text.split('\n') if len(line) > 0]
cluster = []
seqid = []
last_cluster = ""
for l in lines:
    if(l[0]=='>'):        
        last_cluster = l.replace('>Cluster ',"C")
    else:        
        cluster.append(last_cluster)
        seqid.append(l.split(', >')[1].split('...')[0])                
seq2cluster = pd.DataFrame({'seqid': seqid,'cluster': cluster})
print(seq2cluster.shape)
seq2cluster.head(2)    

(10414, 2)


Unnamed: 0,seqid,cluster
0,cst-miR11332,C0
1,stu-miR7994b-5p,C1


In [25]:
df = fasta_to_df("./Temp/mature_microRNA_queries.fasta")
df['accession'] = df['tag'].apply(lambda x : x.split(' ')[0])
seq2cluster = pd.merge(df,seq2cluster,how="inner",left_on='accession',right_on="seqid")[['cluster','seqid','tag']]
print(seq2cluster.shape)
display(seq2cluster.head(2))
seq2cluster.to_csv('./Temp/seq2cluster.csv',index=False)

(10414, 3)


Unnamed: 0,cluster,seqid,tag
0,C5495,ath-miR156a-5p,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...
1,C1199,ath-miR156a-3p,ath-miR156a-3p MIMAT0031865 Arabidopsis thalia...


In [26]:
# todo: sorted first by cluster then by seqid
seq2cluster.sort_values("cluster").head(2)

Unnamed: 0,cluster,seqid,tag
9422,C0,cst-miR11332,cst-miR11332 MIMAT0044622 Cucumis sativus miR1...
7002,C1,stu-miR7994b-5p,stu-miR7994b-5p MIMAT0031188 Solanum tuberosum...


In [27]:
df = fasta_to_df("./Temp/NR_mature_microRNA_queries.fasta")
df['tag'] = df['tag'].apply(lambda x : x.split(' ')[0])
df = pd.merge(df,seq2cluster,how="inner",left_on='tag',right_on="seqid")[['cluster','data']]

lines = []
df.apply(lambda row: lines.append(f">{row['cluster']}\n{row['data']}\n"),axis=1)
print(df.shape)
with open('./Temp/BLASTn_queries.fasta','w') as file:
    file.write(''.join(lines))

(6028, 2)


# BlastN

!sudo apt-get install ncbi-blast+


In [28]:
!makeblastdb -in input_genome.fna \
             -dbtype nucl \
             -out ./Temp/blastn_database



Building a new DB, current time: 02/06/2022 21:48:54
New DB name:   /home/jupyter/plant_microRNA_prediction/Temp/blastn_database
New DB title:  input_genome.fna
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 126142 sequences in 9.57086 seconds.


In [29]:
header = 'qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'

In [30]:
!blastn -query ./Temp/BLASTn_queries.fasta \
        -out ./Temp/BLASTn_result \
        -num_threads {mp.cpu_count()} \
        -db ./Temp/blastn_database \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [31]:
df_blastn = pd.read_csv('./Temp/BLASTn_result', sep='\t',header=None)
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(326849, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C5495,AMWY02099822.1,1,20,1769,1750,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,0,100.0,1/-1,1,-1,minus,100,100,20,3308
1,C5495,AMWY02082313.1,1,20,5954,5973,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,8471


In [32]:
threshold = 4
df_blastn['Nonconformity'] = df_blastn['qlen'] - (abs(df_blastn['qend'] - df_blastn['qstart']) + 1) + df_blastn['gaps'] + df_blastn['mismatch']
df_blastn = df_blastn[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(80217, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C5495,AMWY02099822.1,1,20,1769,1750,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,100.0,1/-1,1,-1,minus,100,100,20,3308,0
1,C5495,AMWY02082313.1,1,20,5954,5973,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.004,37.4,...,100.0,1/1,1,1,plus,100,100,20,8471,0


In [33]:
# remore redundancy and hold best one base of Nonconformity value
df_blastn = df_blastn.sort_values(["Nonconformity", "evalue"], ascending = (True, True))
df_blastn = df_blastn.drop_duplicates(subset=['sseqid','sstart', 'send','sstrand'], keep='first')
df_blastn.to_csv('./Temp/filtered_out_blastn.csv')
print(df_blastn.shape)

(66445, 28)


# Result of the blastn to bed file

In [34]:
flanking_value = 200
df = df_blastn[['qseqid', 'sseqid', 'sstart', 'send', 'sstrand','slen']]
df['ones'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ones'] = 1


In [35]:
def switch(row):
    if(row['sstart'] > row['send']):        
        temp = row['sstart']
        row['sstart'] = row['send']
        row['send'] = temp
    return row
df = df.apply(lambda row: switch(row), axis=1)

In [36]:
def convert(inp):
    if(inp == "plus"):
        return "forward"
    if(inp == "minus"):
        return "reverse"
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['strand'] = df['sstrand'].apply(lambda x: convert(x))

In [37]:
def convert2sign(inp):
    if(inp == "plus"):
        return "+"
    if(inp == "minus"):
        return "-"
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['sign'] = df['sstrand'].apply(lambda x: convert2sign(x))

In [38]:
df['hit_length'] = df.apply(lambda row: abs(row['send'] - row['sstart']) + 1 ,axis=1)

## convert sstart and send from location to index (range)

In [39]:
df['sstart'] = df['sstart'].apply(lambda x: x - 1)

In [40]:
df['downstream_flanking'] = df['sstart'].apply(lambda x:  flanking_value if x > flanking_value else x)

In [41]:
df['upstream_flanking'] = df.apply(lambda row:  flanking_value if (row['send']+flanking_value) <= row['slen'] else row['slen'] - row['send'],axis=1)

In [42]:
df['hit_start'] = df.apply(lambda row: row['downstream_flanking'] if row['sign'] == "+" else row['upstream_flanking'],axis=1)

In [43]:
df['hit_end'] = df.apply(lambda row: row['downstream_flanking'] + row['hit_length'] if row['sign'] == "+" else row['upstream_flanking'] + row['hit_length'],axis=1)

In [44]:
df['sstart'] = df['sstart'].apply(lambda x: max(x - flanking_value, 0))
df['send'] = df.apply(lambda row: min(row['send'] + flanking_value , row['slen']),axis=1)

In [45]:
df['tag'] = df.apply(lambda row: f">{row['sseqid']}:{row['sstart']}-{row['send']}({row['sign']})",axis=1)
df['reformated_tag'] = df['tag'].apply(lambda t: reformat(t))
df[['tag', 'reformated_tag', 'hit_start', 'hit_end']].to_csv('./Temp/hit_index_info.csv')#, index=False)

In [46]:
df['location_tag'] = df.apply(lambda row: f">{row['sseqid']}|{row['sign']}|{row['sstart'] + 1}-{row['send']}|{row['hit_start']+1}-{row['hit_end']}",axis=1)
df[['location_tag','qseqid']].to_csv('./Temp/pipe_seprated_location_list.csv',index=False,sep='\t')

In [47]:
df[['sseqid','sstart','send','strand','ones', 'sign']].to_csv('./Temp/extension_index.bed', 
        index=False, header=False, sep="\t")

# Extention


In [48]:
# !sudo apt-get install bedtools

In [49]:
!bedtools getfasta -fi ./input_genome.fna -fo ./Temp/extended_original.txt -s -bed ./Temp/extension_index.bed
!rm input_genome.fna.fai

index file ./input_genome.fna.fai not found, generating...


In [50]:
# todo: remove duplicated
'''
df = fasta_to_df("./Temp/extended.txt")
df = df.drop_duplicates(subset=['tag'], keep='first')
df_to_fasta(df,"./Temp/extended.txt")
len(df['tag'].unique())
''';

# Convert hit region to upper case and other region to lower case

In [51]:
ext = fasta_to_df('./Temp/extended_original.txt')
info = pd.read_csv('./Temp/hit_index_info.csv')
info['tag'] = info['tag'].apply(lambda x: x[1:])
print(info.shape)
info.head(2)

(66445, 5)


Unnamed: 0.1,Unnamed: 0,tag,reformated_tag,hit_start,hit_end
0,132836,AMWY02059828.1:2832-3256(+),>AMWY020598281_2832-3256_+_,200,224
1,300170,AMWY02004761.1:1853-2277(+),>AMWY020047611_1853-2277_+_,200,224


In [52]:
ext = ext.sort_values(by=['tag']).reset_index()
ext['help_tag'] = ext.apply(lambda r: r['tag']+ str(r.name),axis=1)
del ext['tag']

info = info.sort_values(by=['tag']).reset_index()
info['help_tag'] = info.apply(lambda row: row['tag']+ str(row.name),axis=1)
def redefined_tag(row):
    tag = row['tag']
    [sstart, send] = tag.split(':')[-1].split('(')[0].split('-')
    sstart = int(sstart) + 1
    sign = tag.split('(')[-1].split(')')[0]    
    return f"{tag.split(':')[0]}|{sign}|{sstart}-{send}|{row['hit_start']+1}-{row['hit_end']}"
info['tag'] = info.apply(lambda row: redefined_tag(row),axis=1)
ext = pd.merge(ext,info,how='inner', on='help_tag')

def emphasis_hit(row):
    seq = list(row['data'].lower())            
    s = row['hit_start']
    e = row['hit_end']
    seq[s:e] = list(''.join(seq[s:e]).upper())    
    return ''.join(seq)
    
ext['data'] = ext.apply(lambda row: emphasis_hit(row),axis=1)
df_to_fasta(ext[['tag','data']],"./Temp/extended_modified.txt")

# RNA 2d prediction

## Mfold

In [53]:
'''
# installation
!wget http://www.unafold.org/download/mfold-3.6.tar.gz
!tar -xvf ./mfold-3.6.tar.gz; rm ./mfold-3.6.tar.gz
%cd ./mfold-3.6
!./configure
!make
!make install
%cd ..
!sudo apt install texlive-font-utils
''';

In [54]:
#todo : add all hyperparameter(options) to GUI

In [55]:
counter = 0
base = "./secondary_structure/mfold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df('./Temp/extended_modified.txt')

for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/SEQ.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1    
    if(counter >= 2000):
        break

rm: cannot remove './secondary_structure/mfold/': No such file or directory


In [56]:
%%capture
remove_lock = False
def run_mfold(tag):
    tag = reformat(tag)
    %cd {base + tag}
    !mfold  SEQ="SEQ.FASTA" T=20 MAX=2    
    if(not remove_lock):
        !find . -not -name "*.ct" -not -name "*.pdf" -not -name "*SEQ.FASTA" -not -type d -delete
    %cd ../../..

if __name__ == '__main__':        
    pool = mp.Pool(mp.cpu_count())  
    pool.map(run_mfold, df['tag'].iloc[:2000])

In [None]:
'''
base = "secondary_structure/mfold/"
for directory in glob.glob(f"{base}*"):    
    tag = directory[len(base):]
    ct_files = glob.glob(f'{directory}/*.ct')        
    try:
        ct_files.remove(f'{base}{tag}/SEQ.ct')
    except:
        print(directory)
        print(ct_files)
        print("*****************")
    for file in ct_files:        
        shutil.copy(file, './1.ct')
        #dot = ct2dot_bracket('./1.ct')
        #dot = dot.split('\n')
        #with open('./2.ct', 'w') as stream:
            #stream.write(bracket_to_ct(tag, dot[0] , dot[1] , "(0)"))        
        #ct1 = '\n'.join(reformatCT('./1.ct').split('\n')[1:])
        #ct2 = '\n'.join(reformatCT('./2.ct').split('\n')[1:])
        #if(ct1 != ct2):
            #print(file)
        ct = reformatCT('./1.ct')
        [nucleotide, index, values] = get_ct_data(ct)        
        #print(is_nested( index,  values))
        if(not is_nested( index,  values)):
            print("************")             
'''

## Mxfold2

In [None]:
#!wget https://github.com/keio-bioinformatics/mxfold2/releases/download/v0.1.1/mxfold2-0.1.1.tar.gz
#!pip3 install mxfold2-0.1.1.tar.gz
#!rm mxfold2-0.1.1.tar.gz

In [None]:
!mxfold2 predict ./extended.txt > secondary_structure/mxfold2_result.txt

In [None]:
df = fasta_to_df('secondary_structure/mxfold2_result.txt')
df = df.apply(lambda row: bracket_row(row) , axis=1)
df.head(2)

In [None]:
base = "./secondary_structure/mxfold2/"
!rm -r {base}
!mkdir -p {base}
for index, row in df.iterrows():    
    if(not os.path.exists(base + reformat(row['tag']))):
        os.makedirs(base + reformat(row['tag']))        
    tag = reformat(row['tag'])
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        bracket = row['bracket'].split(' ')[0]
        deltaG = row['bracket'].split(' ')[1]
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG)
        file.write(ct)    

## SPOT-RNA

In [None]:
#!git clone https://github.com/jaswindersingh2/SPOT-RNA.git
#%cd SPOT-RNA
#!wget 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1'
#!tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz
#!sudo apt-get install python3.6
#!python3.6 -m pip install tensorflow==1.14.0 # or for gpu: tensorflow-gpu==1.14.0
#! python3.6 -m pip install -r requirements.txt

In [None]:
base = "./secondary_structure/spot_rna/"
!rm -r {base}
!mkdir -p {base}

In [None]:
!python3.6 ./SPOT-RNA/SPOT-RNA.py  --inputs ./extended.txt  --outputs '{base}'  --cpu 32 --plots True

In [None]:
!rm {base}/*.bpseq
!rm {base}/*.prob
for file in glob.glob(f"{base}*.ct"):    
    f = file[len(base):-3] # .ct        
    f = reformat(f)        
    if(not os.path.exists(base + f)):
        os.makedirs(base + f)  
    header = reformatCT(file).split("\n")[0]    
    with open(f"{base}{f}.dot", 'w') as stream:        
        stream.write(ct2dot_bracket(file))
    !RNAeval "{base}{f}.dot" -T 20 -v 
    #shutil.move(file, f"{base}{f}/{f}.ct")    

In [None]:
df = fasta_to_df('./secondary_structure/spot_rna/AMWY02059828.1:2832-3256(+).dot')
df = df.apply(lambda row: bracket_row(row) , axis=1)
bracket = df['bracket'][0].split(' ')[0]
ct = bracket_to_ct(df['tag'][0], df['data'][0], bracket, "(0)")
print(ct)

## Vienna package

In [None]:
#!wget https://www.tbi.univie.ac.at/RNA/download/ubuntu/ubuntu_20_04/viennarna_2.4.18-1_amd64.deb -O viennarna.deb
#!sudo dpkg -i ./viennarna.deb
#!sudo apt-get -f install
#!rm viennarna.deb

In [None]:
base = "./secondary_structure/viennarna/"
!rm -r {base}
!rm ./secondary_structure/viennarna_result.txt
!mkdir -p {base}

In [None]:
%cd {base}
!RNAfold --jobs=0 --infile ../../Temp/extended_modified.txt  --noPS -T 20 > ../viennarna_result.txt
%cd ../../

In [None]:
df = fasta_to_df('secondary_structure/viennarna_result.txt')
df = df.apply(lambda row: bracket_row(row) , axis=1)
print(df.shape)
df.head(2)

In [None]:
for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)      
    with open(base + f"{tag}/{tag}.ct",'w') as file:
        bracket = row['bracket'].split(' ')[0]
        deltaG = row['bracket'].split(' ')[1]
        ct = bracket_to_ct(row['tag'], row['data'], bracket, deltaG, False)
        file.write(ct)    

In [None]:
import glob
for file in glob.glob(f"{base}*.ps"):    
    f = file[len(base):-6] # _ss.ps 
    f = reformat(f)        
    shutil.move(file, f"{base}{f}/{f}.ps")    

## ContraFold

In [None]:
#!wget http://contra.stanford.edu/contrafold/contrafold_v2_02.tar.gz
#!tar -xvzf contrafold_v2_02.tar.gz && rm contrafold_v2_02.tar.gz
#%cd contrafold/src
#!make clean
#!make 
# to file must changed to be complieable # utility.hpp and optimization.c++ files

In [None]:
counter = 0
base = "./secondary_structure/contrafold/"
!rm -r {base}
!mkdir -p {base}
df = fasta_to_df('./Temp/extended.txt')

for index, row in df.iterrows():    
    tag = reformat(row['tag'])
    if(not os.path.exists(base + tag)):
        os.makedirs(base + tag)            
    with open(base + f"{tag}/{tag}.FASTA",'w') as file:
        file.write(f">{row['tag']}\n{row['data']}")
    counter += 1    
    if(counter >= 10):
        break

In [None]:
def run_contrafold(tag):
    tag = reformat(tag)    
    %cd contrafold/src
    !./contrafold predict ../..{base[1:]}{tag}/{tag}.FASTA > ../..{base[1:]}{tag}/{tag}.dot
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'r') as file:
        text = file.read()
    text = [l for l in text.split("\n") if l[:len(">structure")] != ">structure"]    
    header = text[0]
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'w') as file:
        file.write('\n'.join(text[1:]))    
    !RNAeval  ../..{base[1:]}{tag}/{tag}.dot -T 20 > ../..{base[1:]}{tag}/{tag}.dotdg    
    with open(f"../..{base[1:]}{tag}/{tag}.dotdg", 'r') as file:
        text = file.read()
    with open(f"../..{base[1:]}{tag}/{tag}.dot", 'w') as file:
        file.write(header + "\n" + text)    
    
    df = fasta_to_df(f'../..{base[1:]}{tag}/{tag}.dot')
    df = df.apply(lambda row: bracket_row(row) , axis=1)        
    tag = reformat(df['tag'][0])
    with open(f'../..{base[1:]}{tag}/{tag}.ct','w') as file:
        bracket = df['bracket'][0].split(' ')[0]        
        deltaG = df['bracket'][0].split(' ')[1]
        ct = bracket_to_ct(df['tag'][0], df['data'][0], bracket, deltaG, False)
        file.write(ct)    
    #!rm ../..{base[1:]}{tag}/{tag}.dot
    #!rm ../..{base[1:]}{tag}/{tag}.dotdg
    !rm ../..{base[1:]}{tag}/{tag}.FASTA
    %cd ../../        

if __name__ == '__main__':        
    pool = mp.Pool(mp.cpu_count() - 1)  
    pool.map(run_contrafold, df['tag'].iloc[:10])

In [None]:
s = 'CUCCCCUUGUCUACCAUCCCCAACUAGCGAGAGAGACAUUACCUACCUGAAUAGAAGAUCUCUCUCGAGCUCUCGagcucucucuuuuucuauaUCUCUGUCUCUUUGUGUCUCUGGAGCUUGUACUAACAUUAAUAUCGUGCACCAGCAGCAGUUGAAGCUGCCAGCAUGAUCUAAACUUCCUUCUCUGUAAAGGAUAGAUCGGAUCAUGUGGUAGCUUCACCUGUUGAUGGGAUCACGAAAGCGCCCCUCUUACUACUCUACAUUAAUUCUUUCUCGUUAUACAACCUCCCAGUAAGCAUGCUUUCAAAACCAACUUGAGuaaguuaauuuguuuagcuuuuguuuuuggcucuuccuuuacuuuaaauuuucucaucuggguuuuuguuauauauauguacuguuuuauauauguauuccu'
d = '............................((((((((..(...(((......))).)..))))))))(((((....)))))...................................((((.((((...(((.......(((((..(((.((((((.((((((((((.(((((((((.(.(((((((.......))))).)).).))))))))))))))))))).)))))).)))...))))).....................................)))..)))).))))....((((()))))..((((((....((((.(((((((.....)))(((.........)))................)))).))))....))))))....(((((((((((......)))))))))))....'
print(s[300],s[301])
print(d[300],d[301])

In [None]:
'''path = 'secondary_structure/contrafold/AMWY020333941_469-893_-_/AMWY020333941_469-893_-_.dot'
!RNAeval  {path} -T 20 -v'''; 

# Protein coding elimination

## get nr database

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz

## BLASTX 

In [None]:
!makeblastdb -in ./NR/nr -dbtype prot -out ./NR/nr_database

In [132]:
!head -n 100 ./Temp/extended_modified.txt > ./input_blastx.txt

In [133]:
!blastx -query ./input_blastx.txt \
        -db ./NR/nr_database \
        -out ./Temp/BlastX/blastx \
        -num_threads 20 \
        -evalue 1e-3 \
        -outfmt "6 qseqid sseqid qstart qend evalue bitscore score length frames qframe qcovs qcovhsp staxids"

In [166]:
blx = pd.read_csv('./Temp/BlastX/blastx', sep='\t', header=None)
blx.columns = 'qseqid sseqid qstart qend evalue bitscore score length frames qframe qcovs qcovhsp staxids'.split(' ')
coding_seq = blx['qseqid'].unique()

## DIAMOND

In [None]:
#!wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
#!tar xzf diamond-linux64.tar.gz

In [None]:
!./diamond makedb --in ./NR/nr -d ./DIAMOND/diamond_output

In [None]:
!./diamond blastx -d ./DIAMOND/diamond_output.dmnd\
                  -q ./blastx_query_13980507.txt\
                  -o ./matches.tsv

In [149]:
dmn = pd.read_csv("matches.tsv", sep='\t', header=None)
dmn.columns = 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'.split(' ')
coding_seq = dmn['qseqid'].unique()

array(['reverse::AMWY02100913.1:2598-2831(-)',
       'reverse::AMWY02021293.1:2174-2586(-)',
       'forward::AMWY02104056.1:317-729(+)', ...,
       'reverse::AMWY02028853.1:2318-2740(-)',
       'reverse::AMWY02009487.1:1-364(-)',
       'forward::AMWY02030457.1:5927-6350(+)'], dtype=object)

## filter

In [169]:
def clear(inp):
    if(inp[:9] == "reverse::"):
        return inp[9:]
    if(inp[:9] == "forward::"):
        return inp[9:]
    return inp
coding_seq = pd.Series(coding_seq).apply(lambda x : clear(x))

In [171]:
ext = fasta_to_df('./Temp/extended_modified.txt')
print(ext.shape)
ext = ext[~ext['tag'].isin(coding_seq)]
print(ext.shape)
df_to_fasta(ext,'./Temp/extended_modified_non_coding.txt')

(66445, 2)
(66428, 2)


# CTAnalizer

In [200]:
# only select those not ran before
base = "./secondary_structure/mfold/"
df = fasta_to_df('./Temp/extended_modified.txt')

index_list =[]
for index, row in df.iterrows():    
    tag = reformat(row['tag'])    
    if(len(glob.glob(f'{base + tag}/*.ct')) != 0):
        index_list.append(index)
df = df.iloc[index_list,:]
print(df.shape)

(2000, 2)


In [201]:
def get_tag_info(tag):    
    data = tag.split('|')
    sign = data[1]
    hit_start = int(data[3].split('-')[0]) - 1 
    hit_end = int(data[3].split('-')[1])    
    return [hit_start, hit_end, sign]

In [202]:
def get_deltaG(ct):
    ct_head = ct.split('\n')[0]
    if("dG = " in ct_head):    
        dG_patter = "dG = " 
    elif("dG= " in ct_head):    
        dG_patter = "dG= "
    elif("dG=" in ct_head):    
        dG_patter = "dG="
    elif("dG =" in ct_head):    
        dG_patter = "dG ="
    else:
        print('there is no dG')
    return float(ct_head.split(dG_patter)[-1].split(' ')[0])

In [203]:
def get_complementarity_in_hit_region(inc_srange, hit_len):    
    if(sum(inc_srange == 0) == hit_len):
        return ["no" , 0]
    elif(sum(inc_srange != 0) == hit_len):
        return ["fully_connected" , 1]
    else:    
        return ["yes",  round(sum(inc_srange != 0) / hit_len,2)]

In [204]:
def get_hit_self_complementarity(hit_start, hit_end, inc_srange):    
    if(((inc_srange <= hit_start) | (inc_srange > hit_end)).all()):
        return "no"
    return "yes"

In [205]:
def get_istar_min_max(inc_srange, hit_self_complementarity):
    nonzero_data_srange  = inc_srange[inc_srange!=0]
    if(hit_self_complementarity == 'yes'):
        return [np.nan, np.nan]
    return [nonzero_data_srange.min(), nonzero_data_srange.max()]

In [206]:
def get_continuous_pairing(hit_start, hit_end, istar_min, istar_max, hit_self_complementarity):    
    if(hit_self_complementarity == 'yes'):
        return "undifined"
    if(hit_end < istar_max and (hit_start+1) > istar_min):
        return "no"    
    return  "yes"

In [207]:
def get_mir_type(hit_start, hit_end, istar_min, istar_max, continuous_pairing, complementarity_in_hit_region, hit_self_complementarity):        
    if(continuous_pairing == "yes" and complementarity_in_hit_region != "no" and hit_self_complementarity == "no"):
        if( hit_end < istar_min):
            return "5p" 
        if( (hit_start+1) > istar_max):
            return "3p"     
    else:
        if(continuous_pairing == "no" and hit_self_complementarity == "yes"):
            return "discontinuous star strand and hit self complementarity"
        elif(continuous_pairing == "no"):
            return "discontinuous star strand"
        elif(hit_self_complementarity == "yes"):
            return "hit self complementarity"

    if(complementarity_in_hit_region == "no"):
        return "no complementarity in hit region"  
    print(hit_start, hit_end, istar_min, istar_max, continuous_pairing, complementarity_in_hit_region, hit_self_complementarity)

In [208]:
def get_star_start(hit_start, hit_end, values):
    c = 0
    i = hit_end - 3 - c
    while(values[i] == 0 and i >= 0):
        c += 1
        i = hit_end - 3 - c        
    if(values[i] - c < 1):                    
        return [max(values[i] - c,1), "negative value"]
    if(i < hit_start):
        return [values[i] - c, 'less than hit start']
    return [values[i] - c, '']

In [209]:
def get_star_end(hit_start, hit_end, values):
    if(hit_start - 2 >= 0 ):
        a = 0    
    else:
        a = abs(hit_start - 2)
    
    i = hit_start - 2 + a
    while(values[i] == 0 and i <= hit_end):
        a += 1
        i = hit_start - 2 + a
    
    if(i <= hit_end):        
        if((values[i] + a) > len(values)):
            return [len(values), "out of sequance range"]        
        return [values[i] + a, ""]
    return [np.nan, "some error happened"]

In [210]:
def get_num_of_linking_residues(hit_start,hit_end, star_start, star_end, mir_type):
    if(mir_type == '5p'):
        return str(star_start - hit_end - 1)
    elif(mir_type == '3p'):
        return str(hit_start - star_end)
    elif(mir_type == "discontinuous star strand"):
        return "discontinuous star strand"
    elif(mir_type == "no complementarity in hit region"):
        print('error')    

In [211]:
def get_star_branching(star_start, star_end, star_range, values):
    return not ((values[star_range-1] < star_start) | (values[star_range-1] > star_end)).all()

In [212]:
def getBOI_5p(hit_start, hit_end, values):
    # first calc latest non zero value
    for i in range(hit_end-1, 0, -1):
        if(values[i] != 0):
            last_v = values[i]
            last_i = i
            place = i
            break            
            
    for i in range(place-1, 0, -1):
        v = values[i]
        if(v == 0):
            continue
        if(v < last_v):
            if(last_i <= hit_start and last_i <= hit_end and last_v > hit_start and last_v >= hit_end):                
                return [last_i + 1, last_v]            
        
        if((v - last_v) >= 3):            
            s1 = set(range(last_v+1, v))
            s2 = set([values[ii-1] for ii in range(last_v+1, v)])                        
            if(len(s1.intersection(s2)) > 0):                                                
                if(last_i <= hit_start and last_i < hit_end and last_v > hit_start and last_v >= hit_end): #?????                     
                    return [last_i + 1, last_v]    
        last_v = v            
        last_i = i            
    for i in range(0,hit_start):
        if(values[i] != 0 ):
            if(last_i <= hit_start and last_i <= hit_end and last_v > hit_start and last_v >= hit_end):                    
                return [i + 1, values[i]]          
    return [np.nan, np.nan]    
    
                
def getBOI_3p(hit_start, hit_end, values):
    # first calc latest non zero value
    for i in range(hit_start, len(values)):    
        if(values[i] != 0):
            last_v = values[i]
            last_i = i
            place = i
            break            
            
    for i in range(place + 1, len(values)):
        v = values[i]
        if(v == 0):
            continue
        if(v > last_v):
            if((last_v-1) <= hit_start and (last_v-1) <= hit_end and (last_i+1)  > hit_start and (last_i+1)  >= hit_end):    
                return [last_v, last_i + 1]                                                                                    
        if((last_v - v) >= 3):
            s1 = set(range(v+1, last_v))
            s2 = set([values[ii-1] for ii in range(v+1, last_v)])
            if(len(s1.intersection(s2)) > 0):
                if((last_v-1) <= hit_start and (last_v-1) < hit_end and (last_i+1)  > hit_start and (last_i+1)  >= hit_end):
                    return [last_v, last_i + 1]    
        last_v = v            
        last_i = i            
    for i in range(len(values)-1, hit_end-2, -1):  # changed!        
        if(values[i] != 0 ):
            if((last_v-1) <= hit_start and (last_v-1) <= hit_end and (last_i+1)  > hit_start and (last_i+1)  >= hit_end):                    
                return [values[i], i + 1]    
    return [np.nan, np.nan]
    
    
def get_boi(hit_start, hit_end, values, mir_type):    
    if(mir_type == '5p'):
        return getBOI_5p(hit_start, hit_end, values)
    if(mir_type == '3p'):
        return getBOI_3p(hit_start, hit_end, values)

In [213]:
def get_terminal_structure_range(hit_start, hit_end, istar_min, istar_max, mir_type):
    if(mir_type == '5p'):
        return [i for i in range(hit_end, istar_min-1)]
    if(mir_type == '3p'):
        return [i for i in range(istar_max, hit_start)]
    print("Error in get_terminal_structure_range function")        

In [214]:
def get_number_of_terminal_structure(values, terminal_structure_range):    
    data = values[terminal_structure_range]
    data = data[data != 0].to_numpy()
    if(len(data) == 0):
        return 0           
    counter = 1            
    last = data[0]         
    for i in range(1,len(data)): 
        if(data[i] > last):
            counter += 1
        last = data[i]        
    return counter 

In [215]:
def get_branch_star_end_point(values, terminal_structure_range):        
    data = values[terminal_structure_range]    
    index = np.array(terminal_structure_range)[data != 0]    
    data = data[data != 0].to_numpy()                        
    branch_start_index = []
    branch_end_index = []
    branch_start_index.append(index[0])
    last = data[0]            
    for i in range(1,len(data)): 
        if(data[i] > last):
            branch_end_index.append(index[i-1])
            branch_start_index.append(index[i])
        last = data[i]        
    branch_end_index.append(index[-1])
    #
    branch_start_point = []
    branch_end_point = []
    for i in range(0, len(branch_start_index)):         
        i_s = branch_start_index[i]
        i_e = branch_end_index[i]
        v_s = values[i_s]
        v_e = values[i_e]
        if(v_s > i_s and v_s <= (i_e + 1)):
            branch_start_point.append(i_s + 1)
            branch_end_point.append(v_s)
        elif(v_e > i_s and v_e <= (i_e + 1)):
            branch_start_point.append(v_e)
            branch_end_point.append(i_e + 1)        
    return [branch_start_point, branch_end_point]

In [216]:
def get_branch_apical_loop_size(branch_start_point, branch_end_point, values):        
    branch_apical_loop_start = []
    branch_apical_loop_end = []
    branch_apical_loop_size = []
    for s,e in zip(branch_start_point, branch_end_point):
        data = values[s-1: e]
        index = np.array([i for i in range(s-1, e)])[data != 0]                    
        for i in range(len(index)-1):
            if(values[index[i+1]] == index[i]+1 and values[index[i]] == index[i+1]+1):
                branch_apical_loop_start.append(index[i]+1)
                branch_apical_loop_end.append(index[i+1]+1)
                branch_apical_loop_size.append(index[i+1] - index[i] - 1)                                                
    return [branch_apical_loop_start, branch_apical_loop_end, branch_apical_loop_size]

In [217]:
def get_stem_last_residue(branch_apical_loop_start,branch_apical_loop_end, mir_type):
    out = []
    for i in range(len(branch_apical_loop_start)):
        if(mir_type == '5p'):
            out.append(min(branch_apical_loop_start[i], branch_apical_loop_end[i]))
        if(mir_type == '3p'):
            out.append(max(branch_apical_loop_start[i], branch_apical_loop_end[i]))
    return out

In [218]:
def get_branch_stem_length(branch_start_point, branch_apical_loop_start):
    out = []
    for i in range(len(branch_start_point)):
        out.append(branch_apical_loop_start[i] - branch_start_point[i] + 1)
    return out

In [219]:
def get_primary_stem_end_point(branch_start_point, branch_end_point, stem_last_residue, hit_start, hit_end, istar_min, istar_max, values, number_of_terminal_structure, mir_type):    
    if(number_of_terminal_structure == 1):
        return stem_last_residue[0]
    if(mir_type == '5p'):
        if(number_of_terminal_structure == 0):                            
            for i in range(hit_end-1, hit_start-1,-1):
                if(values[i] != 0):
                    return i + 1
        else:
            a = -1                        
            for i in range(branch_start_point[0]-2, hit_end-1, -1):                
                if(values[i] != 0):
                    a = i + 1
                    break            
            b = -1            
            for i in range(branch_end_point[-1], istar_min - 1):
                if(values[i] != 0):
                    b = values[i]
                    break                   
            if(a == -1 or b == -1):
                return np.nan
            return min(a,b)
            
    if(mir_type == '3p'):
        if(number_of_terminal_structure == 0):                
            for i in range(hit_start, hit_end):
                if(values[i] != 0):
                    return i + 1 
        else:
            a = -1
            for i in range(branch_end_point[-1], hit_start):
                if(values[i] != 0):
                    a = i + 1
                    break            
            
            b = -1            
            for i in range(branch_start_point[0]-2, istar_max-1, -1):                
                if(values[i] != 0):
                    b = values[i]
                    break                    
            
            if(a == -1 or b == -1):
                return np.nan
            return max(a,b)

In [220]:
def get_primary_stem_length(primary_stem_end_point, branch_start_point, branch_end_point, stem_last_residue, hit_start, hit_end, values, number_of_terminal_structure ,mir_type):    
    if(number_of_terminal_structure == 0):
        return 0
    if(mir_type == '5p'):
        if(number_of_terminal_structure == 1):                
            return stem_last_residue[0] - branch_start_point[0] + 1
        else:                        
            return primary_stem_end_point - hit_end
            
    if(mir_type == '3p'):
        if(number_of_terminal_structure == 1):                
            return branch_end_point[0] - stem_last_residue[0] + 1
        else:                        
            return (hit_start+1) - primary_stem_end_point

In [221]:
def get_domain(primary_stem_end_point, boi_start, boi_end, stem_last_residue, hit_start, hit_end, mir_type):    
    if(mir_type == '5p'):        
        return range(boi_start-1, primary_stem_end_point)                                        
    if(mir_type == '3p'):        
        return range(primary_stem_end_point-1, boi_end)

In [222]:
def get_domain_star(primary_stem_end_point_star, boi_start, boi_end, stem_last_residue, hit_start, hit_end, mir_type):    
    if(mir_type == '5p'):        
        return range(primary_stem_end_point_star - 1, boi_end)                                        
    if(mir_type == '3p'):        
        return range(boi_start-1 , primary_stem_end_point_star)

In [223]:
def get_interfering_structures(domain, values):
    [c, d] = [min(domain[0], domain[-1]), max(domain[0], domain[-1])]    
    v = values[c-1:d]    
    return not ((v < c) | (v > d)).all()

In [381]:
def getLocation(start, end, hit_start, hit_end, mir_type):            
    def _location(point): # base location              
        if(mir_type == "5p"):                        
            if(point < (hit_start+1)):
                return ["loop distal", (hit_start+1) - point]
            if(point <= hit_end):
                return ["hit region", point - (hit_start+1) + 1]
            return ["loop proximal" , point - hit_end]
        
        if(mir_type == "3p"):    
            if(point > hit_end):
                return ["loop distal" , point - hit_end]
            if(point >= (hit_start+1)):
                return ["hit region", point - (hit_start+1) + 1]
            return ["loop proximal", (hit_start+1) - point]                                        
            
    [type1, loc1] = _location(start)
    [type2, loc2] = _location(end)    
    if(type1 == type2):                
        return [type1, min(loc1, loc2), max(loc1, loc2)]
    
    if((type1 == "loop distal" and type2 == "hit region") or
       (type2 == "loop distal" and type1 == "hit region")):                
        return ["distal border line", loc1, loc2]
        
    if((type1 == "loop proximal" and type2 == "hit region") or
       (type2 == "loop proximal" and type1 == "hit region")):        
        return ["proximal border line", loc1, loc2]
    
    raise exception("loop proximal and loop distal")

In [382]:
def get_mismatch(domain, values, MCMA, hit_start, hit_end, mir_type): #MCMA: maximum consecutive mismatch allowance    
    size = []
    location_type = []
    location_start = []
    location_end = []
    if(values[domain[0]] == 0 or values[domain[-1]] == 0):
        raise Exception("Domain start or end = 0")        
        return "Domain start or end = 0"
    mismatch_counter = 0 
    zero_counter = 0
    last = values[domain[0]]    
    lastI = domain[0]
    for d in domain[1:]:
        if(values[d] == 0):
            zero_counter += 1
        else:            
            current = values[d]
            if(current > last):
                return ["Increment series error", None, None, None, None]
            elif(current < last and zero_counter > 0):                
                if(last - current - 1 == zero_counter and zero_counter <= MCMA):
                    mismatch_counter += 1
                    size.append(zero_counter)                    
                    [loc_type, loc_start, loc_end ] = getLocation(lastI+2, d, hit_start, hit_end, mir_type)
                    location_type.append(loc_type)
                    location_start.append(loc_start)
                    location_end.append(loc_end)                    
                zero_counter = 0                                    
            last = current
            lastI = d
    if(mir_type == "3p"):
        size = size[::-1]
        location_type = location_type[::-1]
        location_start = location_start[::-1]
        location_end = location_end[::-1]
    return [mismatch_counter, size, location_type, location_start, location_end]

In [383]:
def get_bulge(domain, values, hit_start, hit_end, mir_type): 
    size = []  
    bulge_type = []
    location_type = []
    location_start = []
    location_end = []
    zero_counter = 0
    last = values[domain[0]]  
    lastI = domain[0]
    for d in domain[1:]:
        if(values[d] == 0):
            zero_counter += 1
        else:
            current = values[d]
            if(current > last):
                return ["Increment series error", None, None, None, None,None]
            
            if(last - current == 1 and zero_counter > 0):                
                size.append(zero_counter)                            
                [loc_type, loc_start, loc_end ] = getLocation(lastI+1, d+1, hit_start, hit_end, mir_type)
                bulge_type.append("zero")                                
                location_start.append(loc_start)
                location_end.append(loc_end)           
                [loc_type,_,_ ] = getLocation(lastI+2, d, hit_start, hit_end, mir_type)
                location_type.append(loc_type)
                
            if(last - current > 1 and zero_counter == 0):                
                size.append(last - current - 1)                
                [loc_type, loc_start, loc_end ] = getLocation(lastI+2, d, hit_start, hit_end, mir_type)
                if(loc_type == "distal border line"):
                    loc_type = "loop distal"
                if(loc_type == "proximal border line"):
                    loc_type = "loop proximal"
                bulge_type.append("jump")
                location_type.append(loc_type)
                location_start.append(loc_start)
                location_end.append(loc_end)                    
                                            
            zero_counter = 0                                                    
            last = current
            lastI = d
    if(mir_type == "3p"):
        size = size[::-1]
        bulge_type = bulge_type[::-1]
        location_type = location_type[::-1]
        location_start = location_start[::-1]
        location_end = location_end[::-1]
    return [len(size), size, location_type, location_start, location_end, bulge_type]

In [384]:
def get_internal_loop(domain, values, MCMA, hit_start, hit_end, mir_type): #MCMA: maximum consecutive mismatch allowance
    size_HSBL = []  # number of         
    size_SSBL = []     
    location_type = []
    location_start = []
    location_end = []
    zero_counter = 0
    last = values[domain[0]]    
    lastI = domain[0]
    for d in domain[1:]:
        if(values[d] == 0):
            zero_counter += 1
        else:
            current = values[d]            
            if(current > last):
                return ["Increment series error", None, None, None, None, None]        
            if(current < last and zero_counter > 0):                
                jump = last - current - 1
                if(jump == 0):
                    zero_counter = 0                                    
                elif(jump != zero_counter):                                        
                    size_HSBL.append(zero_counter)
                    size_SSBL.append(jump)
                    [loc_type, loc_start, loc_end ] = getLocation(lastI+2, d, hit_start, hit_end, mir_type)
                    location_type.append(loc_type)
                    location_start.append(loc_start)
                    location_end.append(loc_end)                    
                elif(zero_counter > MCMA):
                    size_HSBL.append(zero_counter)
                    size_SSBL.append(jump)
                    [loc_type, loc_start, loc_end ] = getLocation(lastI+2, d, hit_start, hit_end, mir_type)
                    location_type.append(loc_type)
                    location_start.append(loc_start)
                    location_end.append(loc_end)                    
                    
            zero_counter = 0                                    
            last = current
            lastI = d
    if(mir_type == "3p"):        
        size_SSBL = size_SSBL[::-1]
        size_HSBL = size_HSBL[::-1]
        location_type = location_type[::-1]
        location_start = location_start[::-1]
        location_end = location_end[::-1]
    return [len(size_HSBL), size_HSBL, size_SSBL, location_type, location_start, location_end]

In [385]:
def get_distance_info(inp_type, inp_bord_type, mis_loc_type, mismatch_size, mis_start, bulge_size, bulge_loc_type, bulge_start,bulge_end, bulge_type,internal_loop,size_HSBL,size_SSBL,intr_loc_type,intr_start,intr_end, mir_type):
    counter = 0 
    data = []
    # mismatch
    if(mis_loc_type != None):
        for i in range(0, len(mis_loc_type)):
            if(mis_loc_type[i] == inp_type):
                data.append({'start':mis_start[i],
                             'size': mismatch_size[i],
                             'type':"mismatch"})
    # bulge
    if(bulge_loc_type != None):
        for i in range(0, len(bulge_loc_type)):
            if(bulge_loc_type[i] == inp_type):
                data.append({'start':bulge_start[i],
                              'size': bulge_size[i],
                              'end': bulge_end[i],
                              'bulge_type': bulge_type[i],
                              'type':"bulge"})
            if(bulge_loc_type[i] == inp_bord_type and
                bulge_type[i] == "zero"):
                if(inp_bord_type == "distal border line"):
                    if(mir_type == "5p"):
                        counter = bulge_start[i] - 1 
                    if(mir_type == "3p"):
                        counter = bulge_end[i] - 1 
                if(inp_bord_type == "proximal border line"):
                    if(mir_type == "5p"):
                        counter = bulge_end[i] - 1 
                    if(mir_type == "3p"):
                        counter = bulge_start[i] - 1 
    # loop
    if(intr_loc_type != None):
        for i in range(0, len(intr_loc_type)):
            if(intr_loc_type[i] == inp_type):
                data.append({'start': intr_start[i],
                         'HSBL' : size_HSBL[i],
                         'SSBL' : size_SSBL[i],
                         'end'  : intr_end[i],
                         'type' : "loop"})
            if(intr_loc_type[i] == inp_bord_type and
               size_HSBL[i] > size_SSBL[i] and
               intr_end[i] > size_SSBL[i]):
                counter = intr_end[i] - size_SSBL[i]    
    if(len(data) == 0):
        return [[], [], counter]
    data.sort(key=lambda x: x['start'], reverse=False)
    #
    output = []
    outputhr = []
    for d in data:
        # todo
        dist = d['start'] - counter - 1
        if(d['type']== "mismatch"):
            output.append({'type':"mismatch",
                           'dist': dist,
                           'size': d['size']})
            outputhr.append(f"mismatch=dist:{dist}, size:{d['size']}")
        if(d['type']== "bulge"):
            dist = d['start'] - counter
            output.append({'type':"bulge",
                           'dist': dist,
                           'size': d['size']})
            outputhr.append(f"bulge=dist:{dist}, size:{d['size']}")
            if(d['bulge_type'] == "zero"):
                counter +=  d['size']
        if(d['type']== "loop"):
            dist = d['start'] - counter - 1
            _size = str(d['HSBL'])+ " + " + str(d['SSBL'])
            output.append({'type':"loop",
                           'dist': dist,
                           'size': _size})
            outputhr.append(f"loop=dist:{dist}, size:{_size}")
            if(d['HSBL'] > d['SSBL']):
                counter += d['HSBL'] - d['SSBL']
    return [output,outputhr, counter]

In [386]:
def closestto(data, datahr,number=15):
    for d in data:
        if(d['type'] == "mismatch"):
            data.remove(d)
    dist = []
    if(len(data)==0):
        return ""
    for d in data:
        dist.append(abs(d['dist'] - number))
    min_dist = min(dist)
    out = []
    for i in range(0,len(dist)):
        if(dist[i] == min_dist):
            out.append(data[i])
            
    output = []
    for o in out:
        output.append(f"{o['type']}=dist:{o['dist']}, size:{o['size']}")                
    return output

In [387]:
def get_gc_content(seq):
    freq = pd.Series([c.lower() for c in seq]).value_counts()
    for i in ['c','g', 's']:
        if i not in freq:
            freq[i] = 0    
    return round((freq['c'] + freq['g'] + freq['s']) / len(seq),2)

In [388]:
def get_boi_dist(boi_start, boi_end, hit_start, hit_end, mir_type, counter):    
    if(mir_type == '5p'):        
        return (hit_start + 1) - boi_start  - counter 
    if(mir_type == '3p'):
        return  boi_end - hit_end - counter 

def get_psep_dist(psep, mir_type, hit_start, hit_end, counter):
    if(mir_type == '5p'):
        return abs(psep - hit_end) - counter
    if(mir_type == '3p'):
        return  abs((hit_start + 1) - psep)  - counter

In [389]:
def get_junction_distance(data, dist, thresh_bulge, thresh_loop):
    distance = []
    for d in data:
        if(d['type'] == 'loop'):
            size = eval(d['size'])
            if(size >= thresh_loop):                
                distance.append(d['dist'])
        if(d['type'] == 'bulge'):
            if(d['size'] >= thresh_bulge):
                distance.append(d['dist'])
    distance.append(dist)
    return min(distance)

In [430]:
def get_ct2dot_bracket(nucleotide, index, values):
    text = ''.join(nucleotide) + "%5Cn"
    watch = []
    for i, v in zip(index,values):
        if(v == 0):
            text += '.'
        else:
            if( v not in watch):
                text += '('
                watch.append(i)
            if( v in watch):
                text += ')'
    return text

In [431]:
def get_visualization_link(dotbracket, color):
    base ="http://nibiru.tbi.univie.ac.at/forna/forna.html"        
    return f"{base}?id=fasta&file=%3Eheader%5Cn{dotbracket}&colors=%3Eheader%5Cnrange%5C%3Dwhite:blue{color}"

In [498]:
def visualization(nucleotide, index, values, hit_start, hit_end, boi_start, boi_end, star_start_real, start_end_real ):    
    dotbracket = get_ct2dot_bracket(nucleotide, index, values)
    colors = ""
    for i in range(0,len(index)):
        v = i + 1 
        if( (hit_start+1) <= v and v  <= hit_end):
            colors += "%5Cn1.5"
            continue
        if(star_start_real != None and start_end_real != None):
            if( star_start_real <= v and v <= start_end_real):
                colors += "%5Cn0.8"
                continue
        if(boi_start != None and boi_end != None):
            if( boi_start <= v and v <= boi_end):
                colors += "%5Cn0.3"
                continue        
        colors += "%5Cn0"
    path = get_visualization_link(dotbracket, colors)    
    return path
    #return f'=HYPERLINK("{path}","url")'

In [499]:
def get_trim_data(nucleotide, index, values, start, end):
    _n = nucleotide.copy()[start-1:end].reset_index(drop=True)
    _i = (index.copy()[start-1:end] - (start - 1)).reset_index(drop=True)
    _v = values.copy()[start-1:end].apply(lambda x: 0 if x==0 else  x - (start - 1)).reset_index(drop=True)   
    return [_n, _i, _v]

In [500]:
def get_precursor_seq(hit_start, hit_end,istar_min, istar_max, star_start_real, star_end_real, mir_type):
    if(mir_type == "3p"):
        return [star_start_real, hit_end, [hit_end-1, hit_end]]
    if(mir_type == "5p"):
        return [hit_start+1, star_end_real,[i+1 for i in range(istar_max, star_end_real)]]

In [501]:
server_url = "http://jupyter.sysmanager.ir/tree/plant_microRNA_prediction"
#MCMA: maximum consecutive mismatch allowance
def get_row(tag, path, extra, acceptable_terminal_structures = 5, MCMA=2, effective_bulge_size_in_Hit_vicinity_regions=4,            
effective_internal_loop_size_in_Hit_vicinity_regions=5):    
    result = {}    
    ct = reformatCT(path)
    result['seq name'] = tag
    result['ct name'] = "Fold " + path[-20:].split('SEQ_')[1].split('.ct')[0]
    result['ct'] = f'=HYPERLINK("{server_url + path[1:]}","ct")'
    result['pdf'] = f'=HYPERLINK("{server_url + path[1:-3] + ".pdf"}","pdf")'     
    #print(result['pdf'])
    [hit_start, hit_end, sign] = get_tag_info(tag)    
    result['hit start'] = hit_start + 1
    result['hit end'] =  hit_end
    result['sign'] = sign
    dg = get_deltaG(ct)
    result['delta G'] = dg
    [nucleotide, index, values] = get_ct_data(ct)
    result['full seq'] = ''.join(nucleotide)    
    hit_seq = ''.join(nucleotide[hit_start:hit_end])
    result['full seq visualization'] = visualization(nucleotide, index, values, hit_start, hit_end, None, None,None, None)
    result['hit seq'] = hit_seq
    hit_range = index[hit_start:hit_end]
    hit_len = len(hit_range)
    result['hit len'] = hit_len
    flanking_gc_content = get_gc_content(nucleotide)
    result['flanking GC content'] =  flanking_gc_content
    result['flanking MFEI'] = ((dg / len(nucleotide)) * 100) /  flanking_gc_content 
    result['hit GC content'] =  get_gc_content(hit_seq)
    inc_srange = values[hit_start:hit_end] # Incomplete_Star_range    
    [complementarity_in_hit_region, complementarity_in_hit_region_percentage] = get_complementarity_in_hit_region(inc_srange, hit_len)    
    result['complementarity in hit region'] = complementarity_in_hit_region 
    result['hit complementarity percentage']  = complementarity_in_hit_region_percentage
    if(complementarity_in_hit_region == "no"):        
        result['message'] = "no complementarity in hit region"        
        return pd.Series(result) 
    
    hit_self_complementarity = get_hit_self_complementarity(hit_start, hit_end, inc_srange)    
    result['hit self complementarity'] = hit_self_complementarity       
    if(hit_self_complementarity == "yes"):        
        result['message'] = "hit self complementarity"
        return pd.Series(result)     
    if(hit_start - extra < 0 or (len(values) - hit_end) < extra):        
        result['message'] = "Not enough flanking for hit region"                
        return pd.Series(result) 
    
    [flanking_istar_min, flanking_istar_max] = get_istar_min_max(values[(hit_start-extra):(hit_end+extra)], hit_self_complementarity)  
    result['flanking istar min']  = flanking_istar_min
    result['flanking istar max']  = flanking_istar_max    
    
    continuous_pairing = get_continuous_pairing(hit_start, hit_end, flanking_istar_min, flanking_istar_max, hit_self_complementarity)
    result['continuous pairing'] = continuous_pairing    
    if(continuous_pairing == "no"):
        result['message'] = "discontinuous star strand"
        return pd.Series(result) 
    
    [istar_min, istar_max] = get_istar_min_max(inc_srange, hit_self_complementarity)  
    result['istar min']  = istar_min
    result['istar max']  = istar_max
    
    mir_type = get_mir_type(hit_start, hit_end, istar_min, istar_max, continuous_pairing, complementarity_in_hit_region, hit_self_complementarity)
    result['mir type'] = mir_type    
    if(mir_type not in ['3p', '5p']):        
        result['message'] = mir_type
        return pd.Series(result) 
    try: 
        [star_start, star_start_msg] = get_star_start(hit_start, hit_end, values)
        [star_end, star_end_msg] = get_star_end(hit_start, hit_end, values)
        result['star start'] = star_start 
        result['star start msg'] = star_start_msg     
        result['star end'] = star_end    
        result['star end msg'] =  star_end_msg
    except:
        result['message'] = 'Error in calculation of star start and end'        
        return pd.Series(result) 
    
    star_start_real = star_start
    star_end_real = star_end
    star_start = istar_min ############################################
    star_end = istar_max   ############################################
    #set1 = set(range(star_start-1 , star_end))
    set1 = set(range(star_start-2 , star_end+1)) ######################
    set2 = set(range(hit_start, hit_end))            
    if(len(set1.intersection(set2)) > 0):        
        result['message'] = 'overlap between miRNA and miRNA*'        
        return pd.Series(result) 
    
    star_range = index[star_start - 1:star_end]
    star_seq = ''.join(nucleotide[star_start - 1:star_end])
    result['star seq'] = star_seq
    num_of_linking_residues = get_num_of_linking_residues(hit_start,hit_end, star_start, star_end, mir_type)
    result['num of linking residues'] = num_of_linking_residues
    #print(result)
    star_branching = get_star_branching(star_start, star_end, star_range, values)
    #star_branching = get_star_branching(istar_min, istar_max, inc_srange, values)
    result['star branching'] = "yes" if star_branching else "no"    
    [boi_start, boi_end] = get_boi(hit_start, hit_end, values, mir_type)                
    if(math.isnan(boi_start) or math.isnan(boi_end)):        
        result['message'] = 'unfit BOI structure'
        return pd.Series(result)    
    boi_seq = ''.join(nucleotide[boi_start-1: boi_end].tolist())
    result['boi start'] = boi_start
    result['boi end'] =  boi_end
    result['boi seq'] =  boi_seq    
    result['boi GC content'] =  get_gc_content(boi_seq)
    result['full seq visualization'] = visualization(nucleotide, index, values, hit_start, hit_end, boi_start,  boi_end, star_start_real, star_end_real)
    terminal_structure_range = get_terminal_structure_range(hit_start, hit_end, istar_min, istar_max, mir_type)
    [_n, _i, _v] = get_trim_data(nucleotide, index, values, boi_start, boi_end)
    result['boi dotbracket'] = get_ct2dot_bracket(_n, _i, _v).split("%5Cn")[1]
    result['boi visualization'] = visualization(_n, _i, _v, hit_start - (boi_start - 1), hit_end - (boi_start - 1), 1,  boi_end - (boi_start - 1), star_start_real- (boi_start - 1), star_end_real- (boi_start - 1))
    [s, e, zero] = get_precursor_seq(hit_start, hit_end,istar_min, istar_max, star_start_real, star_end_real, mir_type)
    [_n, _i, _v] = get_trim_data(nucleotide, index, values, s, e)
    for z in zero:
        _v[z - (s - 1) - 1] = 0
    result['precursor dotbracket'] = get_ct2dot_bracket(_n, _i, _v).split("%5Cn")[1]
    result['precursor seq visualization'] = visualization(_n, _i, _v, hit_start - (s - 1), hit_end - (s - 1), 1,  e - (s - 1),star_start_real- (s - 1), star_end_real- (s - 1))
    result['terminal structure range'] = [i+1 for i in [terminal_structure_range[0], terminal_structure_range[-1]]]                            
    if(len(terminal_structure_range) == 0):        
        result['number of terminal structures'] = "no residues between miR and miR*"         
    else:                
        number_of_terminal_structure = get_number_of_terminal_structure(values, terminal_structure_range)        
        if(number_of_terminal_structure == 0):
            result['number of terminal structures'] = 1                    
            #[branch_start_point, branch_end_point] = [[terminal_structure_range[0]+1], [terminal_structure_range[-1]+1]]
            [branch_start_point, branch_end_point] = [[], []]   
            stem_last_residue = []
        elif(number_of_terminal_structure == 1):
            result['number of terminal structures'] = 1        
            [branch_start_point, branch_end_point] = [[terminal_structure_range[0]+1], [terminal_structure_range[-1]+1]]            
            stem_last_residue = []
        else:
            result['number of terminal structures'] = number_of_terminal_structure
            [branch_start_point, branch_end_point]  = get_branch_star_end_point(values, terminal_structure_range)         
        if(number_of_terminal_structure != 0):
            #[branch_apical_loop_start, branch_apical_loop_end, branch_apical_loop_size] = [[branch_start_point[0]], [branch_end_point[0]], [abs(branch_end_point[0] - branch_start_point[0]) + 1]]                    
            [branch_apical_loop_start, branch_apical_loop_end, branch_apical_loop_size] = get_branch_apical_loop_size(branch_start_point, branch_end_point, values)
            stem_last_residue = get_stem_last_residue(branch_apical_loop_start,branch_apical_loop_end, mir_type)
            branch_stem_length = get_branch_stem_length(branch_start_point, branch_apical_loop_start)                        
        for i in range(acceptable_terminal_structures):
            if(i < len(branch_start_point)):
                result[f'branch#{i + 1} start point'] = branch_start_point[i]
                result[f'branch#{i + 1} end point'] = branch_end_point[i]
                result[f'branch#{i + 1} total length'] = abs(branch_end_point[i] - branch_start_point[i]) + 1                                                
                result[f'branch#{i + 1} apical loop start'] = branch_apical_loop_start[i]
                result[f'branch#{i + 1} apical loop end'] = branch_apical_loop_end[i]
                result[f'branch#{i + 1} apical loop size'] = branch_apical_loop_size[i]                    
                if(number_of_terminal_structure == 1):
                    result[f'branch#{i + 1} stem last residue'] = stem_last_residue[i]
                else:
                    result[f'branch#{i + 1} stem last residue'] = ""
                result[f'branch#{i + 1} stem length'] = branch_stem_length[i]                
            else:
                result[f'branch#{i + 1} start point'] = ""
                result[f'branch#{i + 1} end point'] = ""            
                result[f'branch#{i + 1} total length'] = ""
                result[f'branch#{i + 1} apical loop start'] = ""
                result[f'branch#{i + 1} apical loop end'] = ""
                result[f'branch#{i + 1} apical loop size'] = ""
                result[f'branch#{i + 1} stem last residue'] = ""
                result[f'branch#{i + 1} stem length']  = ""           
        
        
        primary_stem_end_point = get_primary_stem_end_point(branch_start_point, branch_end_point, stem_last_residue, hit_start, hit_end, istar_min, istar_max, values, number_of_terminal_structure, mir_type)                
        if(not np.isnan(primary_stem_end_point)):
            primary_stem_end_point_star = values[primary_stem_end_point-1]
            result['psep'] = primary_stem_end_point
            result['psep*'] = primary_stem_end_point_star
            if(number_of_terminal_structure == 0):
                result[f'branch#{1} apical loop start'] = primary_stem_end_point
                result[f'branch#{1} apical loop end'] = primary_stem_end_point_star
                result[f'branch#{1} apical loop size'] = primary_stem_end_point_star - primary_stem_end_point - 1 
                result[f'branch#{1} stem length']  = 0       
            primary_stem_length = get_primary_stem_length(primary_stem_end_point, branch_start_point, branch_end_point, stem_last_residue, hit_start, hit_end, values, number_of_terminal_structure, mir_type)
            result['primary stem length'] = primary_stem_length                            
            
            domain = get_domain(primary_stem_end_point, boi_start, boi_end, stem_last_residue, hit_start, hit_end, mir_type)
            result['domain'] = [domain[0]+1, domain[-1]+1]
            domain_star = get_domain_star(primary_stem_end_point_star, boi_start, boi_end, stem_last_residue, hit_start, hit_end, mir_type)
            result['domain*'] = [domain_star[0] + 1, domain_star[-1] + 1]
            interfering_structures_domain = get_interfering_structures(domain, values)
            result['domain interfering structures'] = "yes" if interfering_structures_domain else "no"
            
            interfering_structures_domain_star = get_interfering_structures(domain_star, values)
            result['domain* interfering structures'] = "yes" if interfering_structures_domain_star else "no"
                        
            [mismatch, mismatch_size, mis_loc_type, mis_start, mis_end] = get_mismatch(domain, values,MCMA, hit_start, hit_end, mir_type)            
            result['mismatch'] = mismatch
            result['mismatch size'] = mismatch_size
            result['mismatch type'] = mis_loc_type
            result['mismatch start'] = mis_start
            result['mismatch end'] = mis_end
            [bulge, bulge_size, bulge_loc_type, bulge_start, bulge_end,bulge_type] = get_bulge(domain, values, hit_start, hit_end, mir_type)
            result['bulge'] = bulge
            result['bulge size'] = bulge_size
            result['bulge type'] = bulge_loc_type
            result['bulge start'] = bulge_start
            result['bulge end'] = bulge_end
            [internal_loop, size_HSBL, size_SSBL, intr_loc_type, intr_start, intr_end] = get_internal_loop(domain, values, MCMA, hit_start, hit_end, mir_type)
            result['internal loop'] = internal_loop
            result['internal loop HSBL'] = size_HSBL
            result['internal loop SSBL'] = size_SSBL
            if(size_SSBL != None):
                result['internal loop total size'] = [size_SSBL[i] + size_HSBL[i] for i in range(len(size_SSBL))]
            else:
                result['internal loop total size'] = '-'
            result['internal type'] = intr_loc_type
            result['internal start'] = intr_start
            result['internal end'] = intr_end
            [proximal, proximal_hr, proximal_counter] = get_distance_info("loop proximal","proximal border line", mis_loc_type,mismatch_size, mis_start, bulge_size, bulge_loc_type, bulge_start,bulge_end, bulge_type,internal_loop,size_HSBL,size_SSBL,intr_loc_type,intr_start,intr_end,mir_type)
            result['proximal distance'] = proximal_hr
            [distal, distal_hr, distal_counter] = get_distance_info("loop distal","distal border line", mis_loc_type,mismatch_size, mis_start, bulge_size, bulge_loc_type, bulge_start,bulge_end, bulge_type,internal_loop,size_HSBL,size_SSBL,intr_loc_type,intr_start,intr_end, mir_type)
            result['distal distance'] = distal_hr
            boi_dist = get_boi_dist(boi_start, boi_end, hit_start, hit_end, mir_type, distal_counter)
            result['base structure corrected length'] = boi_dist
            psep_dist = get_psep_dist(primary_stem_end_point, mir_type, hit_start, hit_end, proximal_counter)
            result['primary stem corrected length'] = psep_dist  
            result['proximal closest to 15'] = closestto(proximal,15) 
            result['proximal closest to 21'] = closestto(proximal,21) 
            result['proximal closest to 36'] = closestto(proximal,36)
            result['distal closest to 15'] = closestto(distal,15)
            result['distal closest to 21'] = closestto(distal,21)
            result['distal closest to 36'] = closestto(distal,36)
            result['Loop distal junction distance'] = get_junction_distance(distal, boi_dist, effective_bulge_size_in_Hit_vicinity_regions, effective_internal_loop_size_in_Hit_vicinity_regions)            
            result['Loop proximal junction distance'] = get_junction_distance(proximal, psep_dist, effective_bulge_size_in_Hit_vicinity_regions, effective_internal_loop_size_in_Hit_vicinity_regions)            
        else:
            result['message'] = "immediate branching"                        
    return pd.Series(result)

In [502]:
def run(tag, path, extra):        
    return get_row(tag, path, extra)
    try:
        return get_row(tag, path,extra)
    except Exception as e:
        print(str(e), tag)        
        return pd.Series()
        
def get_df_by_tag(tag , extra=0):           
    ct_files = glob.glob(f'{base}{reformat(tag)}/SEQ_*.ct')    
    return pd.Series(ct_files).apply(lambda path: run(tag, path,extra))    

## apply on current data

In [503]:
#get_df_by_tag(df['tag'].iloc[8])#['ct'][1]
#get_df_by_tag("AMWY02000161.1|-|1642-2061|201-220", extra=0).to_csv('Result/d.csv')

In [505]:
get_df_by_tag("AMWY02000014.1|+|194-614|201-221", extra=0).iloc[0,:]['full seq visualization']

'http://nibiru.tbi.univie.ac.at/forna/forna.html?id=fasta&file=%3Eheader%5CnaaaaattttgtctccatatgtttgggttcatttttaaaaaattgggctccagtccacttaaatctaatgcaaaatatttgtattgaagtatgaaatatatttatggctcaaacaaaggaatatcaaatttatttaattatatataaatttttgaaatatgacattaaattaaaatggtctatttcaaatgtcatgtataaTTAATATTATTAAATTTGTTGcctattttttacaatattaattgtaaaatattttcccatataaaatttgccataacaaattgttaaaagcttttacaacataaaattttttaattaattgtaaaatattttcaacataccttttttactattaattgtagaaaataatagcaaatttgacaatgttattatacaatttttaaatagacgattttaattaa%5Cn........(((((((...((((((((.((((.(((((.....(((((....))))).)))))..(((((((((...)))))))))(((((((...)))))))))))))))))))..)))((.(((((...(((((........))))).))))).))..))))...(((((((((.((((((((.(((.....((((((((.((((((.((((((((((((..((((((((((((..(((((.((((((.(((((.......)))))(((......)))..((((((((..((((((((..((((....)))).....))))))))..)))).))))......)))))).))))))))))))))))).)))))))))))).))))))))))))))..))).)))))))).)))))))))..&colors=%3Eheader%5Cnrange%5C%3Dwhite:blue%5Cn0%5Cn0%5Cn0%5Cn0%5Cn0%5Cn0%5Cn0%

In [491]:
dfs = []
max_workers = mp.cpu_count() - 1
#max_workers = 1
for d in process_map(get_df_by_tag , df['tag'], tqdm_class=tqdm, max_workers=max_workers, chunksize=5):
    dfs.append(d)
df_result = pd.concat(dfs,axis=0)
#!zip -r Result/ct_analizer_result.csv.zip ./Result/ct_analizer_result.csv

  0%|          | 0/2000 [00:00<?, ?it/s]

TypeError: visualization() missing 2 required positional arguments: 'star_start_real' and 'start_end_real'

In [400]:
# clustr to cluster members cdhits
seq2cluster = pd.read_csv("Temp/seq2cluster.csv")
seq2cluster['tag'] = seq2cluster.groupby(['cluster'])['tag'].transform(lambda x: ','.join(x))
seq2cluster['seqid'] = seq2cluster.groupby(['cluster'])['seqid'].transform(lambda x: ','.join(x))
seq2cluster = seq2cluster.drop_duplicates()

tag2cluster = pd.read_csv('./Temp/pipe_seprated_location_list.csv',sep='\t')
tag2cluster['location_tag'] = tag2cluster['location_tag'].apply(lambda x : x[1:])

data = pd.merge(seq2cluster,tag2cluster,how='inner', left_on='cluster', right_on='qseqid')
data['Reference miRNA cluster'] = data['cluster']
data['Reference miRNA IDs'] = data['seqid']
data['Reference miRNA IDs and species'] = data['tag']
data = data[['location_tag','Reference miRNA cluster', 'Reference miRNA IDs', 'Reference miRNA IDs and species']]

In [401]:
df_result = pd.merge(data, df_result, how='right', left_on = 'location_tag', right_on ='seq name')
del df_result['location_tag']

In [402]:
orders = ["seq name", "ct name", "ct", "pdf", "Reference miRNA cluster", 'Reference miRNA IDs','Reference miRNA IDs and species']
columns_titles = [*orders, *[c for c in df_result.columns if c not in orders]]
df_result = df_result.reindex(columns=columns_titles)

In [403]:
df_result = df_result.replace(np.nan, '-').replace('', '-')
df_result.to_csv("Result/ct_analizer_result_extra=0.csv", index=False)

In [404]:
# Todo: add seq to file

In [405]:
# Todo: is high confidence

# Filters

## Level 1

In [406]:
level1 = df_result[df_result['message'] == '-']
level1 = level1[level1['star branching'] != 'yes']
level1 = level1[level1['domain interfering structures'] != 'yes']
level1 = level1[level1['domain* interfering structures'] != 'yes']
level1 = level1[level1['number of terminal structures'] != "no residues between miR and miR*"]                                                         
level1.to_csv("Result/result_level1_filter.csv", index=False)

## Level 2

In [407]:
DELTA_G_MIN = -1000
DELTA_G_MAX = 0
HIT_LEN_MIN = 16
HIT_LEN_MAX = 25
HIT_COMPLEMENTARITY_PERCENTAGE_MIN = 0.3 
HIT_COMPLEMENTARITY_PERCENTAGE_MAX = 1
NUMBER_OF_TERMINAL_STRUCTURE_MIN = 1
NUMBER_OF_TERMINAL_STRUCTURE_MAX = 5
BOI_GC_CONTENT_MIN = 0.01 
BOI_GC_CONTENT_MAX = 1
# PRECURSOR_GC_CONTENT_MIN
# PRECURSOR_GC_CONTENT_MAX
BORDER_LINE_MISMATCH_MAX = 3
BORDER_LINE_BULGE_MAX = 3
BORDER_LINE_INTERNAL_MAX = 3
TOTAL_NUM_OF_NONMATCHING_POSITIONS = 5
TOTAL_NUM_OF_MISMACHED_POSITIONS = 4
TOTAL_NUM_OF_POSITIONS_IN_BULGES_AND_LOOPS = 3
MAX_ALLOWED_BULGE_SIZE_IN_HIT_REGION = 2
MAX_ALLOWED_INTERNAL_LOOP_SIZE_IN_HIT_REGION = 3
MINIMUM_REQUIRED_CLEAR_REGION = 8
ACCEPTABLE_NUM_FOR_UNMATCHED_LOCATIONS_IN_HIT_REGION = 5
ACCEPTABLE_NUM_FOR_HIT_LOCATIONS_IN_BULGES_OR_LOOPS = 3

In [408]:
def is_allowed(row, type_str, size_str, limmit):
    mismatch_type = row[type_str]
    for i in range(len(mismatch_type)):
        if(mismatch_type[i] == "hit region" ):
            if(row[size_str][i] > limmit):
                return False
    return True

In [409]:
level1['delta G'] = level1['delta G'].apply(lambda x : float(x))
level2 = level1[level1['delta G'] >= DELTA_G_MIN]
print(level2.shape)
level2 = level2[level2['delta G'] <= DELTA_G_MAX]
print(level2.shape)
level2 = level2[level2['hit len'] >= HIT_LEN_MIN]
print(level2.shape)
level2 = level2[level2['hit len'] <= HIT_LEN_MAX]
print(level2.shape)
level2 = level2[level2['hit complementarity percentage'] >= HIT_COMPLEMENTARITY_PERCENTAGE_MIN]
print(level2.shape)
level2 = level2[level2['hit complementarity percentage'] <= HIT_COMPLEMENTARITY_PERCENTAGE_MAX]
print(level2.shape)
level2 = level2[level2['number of terminal structures'] >= NUMBER_OF_TERMINAL_STRUCTURE_MIN]
print(level2.shape)
level2 = level2[level2['number of terminal structures'] <= NUMBER_OF_TERMINAL_STRUCTURE_MAX]
print(level2.shape)
level2 = level2[level2['boi GC content'] >= BOI_GC_CONTENT_MIN]
print(level2.shape)
level2 = level2[level2['boi GC content'] <= BOI_GC_CONTENT_MAX]
print(level2.shape)

(1383, 121)
(1383, 121)
(1367, 121)
(1367, 121)
(1341, 121)
(1341, 121)
(1341, 121)
(1338, 121)
(1338, 121)
(1338, 121)


In [410]:
level2 = level2[level2.apply(lambda row: is_allowed(row,\
                                                       "bulge type",\
                                                       "bulge size",\
                                                       MAX_ALLOWED_BULGE_SIZE_IN_HIT_REGION),axis=1)]
print(level2.shape)

(1229, 121)


In [411]:
level2 = level2[level2.apply(lambda row: is_allowed(row,\
                                                       "internal type",\
                                                       "internal loop total size",\
                                                       MAX_ALLOWED_BULGE_SIZE_IN_HIT_REGION),axis=1)]
print(level2.shape)

(415, 121)


In [412]:
def is_allowed_clear(row):
    if(row['Loop distal junction distance'] >=  MINIMUM_REQUIRED_CLEAR_REGION or 
      row['Loop proximal junction distance'] >=  MINIMUM_REQUIRED_CLEAR_REGION):
        return True
    return False
    
level2 = level2[level2.apply(lambda row: is_allowed_clear(row),axis=1)]
print(level2.shape)

(240, 121)


In [413]:
def check_border_line(row,type_str, size_str, limmit):
    valid = True
    mismatch_type = row[type_str]
    for i in range(len(mismatch_type)):
        if(mismatch_type[i] == "distal border line" or mismatch_type == "proximal border line"):            
            if(row[size_str][i] > limmit):               
                valid = False
    return valid

In [414]:
level2 = level2[level2.apply(lambda row: check_border_line(row,\
                                                       "mismatch type",\
                                                       "mismatch size",\
                                                       BORDER_LINE_MISMATCH_MAX),axis=1)]
print(level2.shape)
level2 = level2[level2.apply(lambda row: check_border_line(row,\
                                                       "bulge type",\
                                                       "bulge size",\
                                                       BORDER_LINE_BULGE_MAX),axis=1)]
print(level2.shape)
level2 = level2[level2.apply(lambda row: check_border_line(row,\
                                                           "internal type",\
                                                           "internal loop total size",\
                                                           BORDER_LINE_INTERNAL_MAX),axis=1)]
print(level2.shape)

(240, 121)
(236, 121)
(209, 121)


In [415]:
def sum_of_size_in_hit(row, type_str, size_str):
    _sum = 0
    mismatch_type = row[type_str]
    for i in range(len(mismatch_type)):
        if(mismatch_type[i] == "hit region" ):
            _sum += row[size_str][i]
    return _sum 

sum_missmatch = level2.apply(lambda row: sum_of_size_in_hit(row, 'mismatch type', 'mismatch size'), axis=1)
sum_bulge = level2.apply(lambda row: sum_of_size_in_hit(row, 'bulge type', 'bulge size'), axis=1)
sum_internal = level2.apply(lambda row: sum_of_size_in_hit(row, 'internal type', 'internal loop total size'), axis=1)

In [416]:
def sum_of_size_in_border_line(row, border_type, type_str, size_str, start, end):
    _sum = 0
    _size = row[size_str]
    _start = row[start]
    _end = row[end]
    mir_type = row['mir type']
    mismatch_type = row[type_str]
    for i in range(len(mismatch_type)):
        if(mismatch_type[i] == border_type):
            if(border_type == "distal border line"):
                if(mir_type  == '5p'):
                    _sum += _size[i] - _start[i]
                if(mir_type  == '3p'):
                    _sum += _size[i] - _end[i]
            if(border_type == "proximal border line"):
                if(mir_type  == '5p'):
                    _sum += _size[i] - _end[i]
                if(mir_type  == '3p'):
                    _sum += _size[i] - _start[i]
    return _sum 

sum_missmatch_border_proximal = level2.apply(lambda row: sum_of_size_in_border_line(row,\
                                                                                    'proximal border line',\
                                                                                    'mismatch type',\
                                                                                    'mismatch size',\
                                                                                    'mismatch start',\
                                                                                    'mismatch end'), axis=1)
sum_missmatch_border_distal = level2.apply(lambda row: sum_of_size_in_border_line(row,\
                                                                                  'distal border line',\
                                                                                  'mismatch type',\
                                                                                  'mismatch size',\
                                                                                    'mismatch start',\
                                                                                    'mismatch end'), axis=1)
sum_bulge_border_proximal = level2.apply(lambda row: sum_of_size_in_border_line(row,\
                                                                                'proximal border line',\
                                                                                'bulge type',\
                                                                                'bulge size',\
                                                                                    'bulge start',\
                                                                                    'bulge end'), axis=1)
sum_bulge_border_distal = level2.apply(lambda row: sum_of_size_in_border_line(row,\
                                                                              'distal border line',\
                                                                              'bulge type',\
                                                                              'bulge size',\
                                                                                    'bulge start',\
                                                                                    'bulge end'), axis=1)
sum_internal_border_proximal = level2.apply(lambda row: sum_of_size_in_border_line(row,\
                                                                                   'proximal border line',\
                                                                                   'internal type',\
                                                                                   'internal loop total size',\
                                                                                    'internal start',\
                                                                                    'internal end'), axis=1)
sum_internal_border_distal = level2.apply(lambda row: sum_of_size_in_border_line(row,\
                                                                                 'distal border line',\
                                                                                 'internal type',\
                                                                                 'internal loop total size',\
                                                                                    'internal start',\
                                                                                    'internal end'), axis=1)

In [417]:
def number_of_residue(row):
    hit_end = row['hit end']
    hit_start = row['hit start']
    psep = row['psep']    
    mir_type = row['mir type']
    if( mir_type == '5p'):
        if(psep < hit_end):
            return hit_end - psep        
    if(mir_type == '3p'):
        if(psep > hit_start ):
            return psep - hit_start
    return 0


sum_of_residue = level2.apply(lambda row: number_of_residue(row), axis=1)

In [418]:
_sum = sum_bulge + sum_internal + sum_missmatch_border_distal * 2 + sum_bulge_border_proximal  + sum_bulge_border_distal + sum_internal_border_proximal + sum_internal_border_distal + sum_of_residue 
level2 = level2[_sum <= ACCEPTABLE_NUM_FOR_HIT_LOCATIONS_IN_BULGES_OR_LOOPS]
print(level2.shape)
_sum += sum_missmatch * 2 + sum_missmatch_border_proximal * 2
level2 = level2[_sum <= ACCEPTABLE_NUM_FOR_UNMATCHED_LOCATIONS_IN_HIT_REGION]
print(level2.shape)

(168, 121)
(80, 121)


  level2 = level2[_sum <= ACCEPTABLE_NUM_FOR_UNMATCHED_LOCATIONS_IN_HIT_REGION]


In [419]:
level2 = level2[(sum_missmatch + sum_bulge + sum_internal) <= TOTAL_NUM_OF_NONMATCHING_POSITIONS]
print(level2.shape)
level2 = level2[sum_missmatch <= TOTAL_NUM_OF_MISMACHED_POSITIONS]
print(level2.shape)
level2 = level2[(sum_bulge + sum_internal) <= TOTAL_NUM_OF_POSITIONS_IN_BULGES_AND_LOOPS]
print(level2.shape)

(80, 121)
(80, 121)
(80, 121)


  level2 = level2[(sum_missmatch + sum_bulge + sum_internal) <= TOTAL_NUM_OF_NONMATCHING_POSITIONS]
  level2 = level2[sum_missmatch <= TOTAL_NUM_OF_MISMACHED_POSITIONS]
  level2 = level2[(sum_bulge + sum_internal) <= TOTAL_NUM_OF_POSITIONS_IN_BULGES_AND_LOOPS]


In [420]:
level2.to_csv("Result/result_level2_filter.csv", index=False)
level1[~level1.index.isin(level2.index)].to_csv("Result/result_level2_filter_deleted.csv", index=False)

# psRNATarget

In [153]:
url = "https://www.zhaolab.org/psRNATarget/analysis"

payload='allowbulge=yes&curschema=s2&cutpos1=10&cutpos2=11&expect=5&function=3&gapextp=0.5&gapstartp=2&gup=0.5&hspsize=19&maxnummismatchinseed=2&misp=1&seedfactor=1.5&seedpos1=2&seedpos2=13&srna=&srna_content=%3Eath-miR156a%0D%0AUGACAGAAGAGAGUGAGCAC%0D%0A%3Eath-miR157a%0D%0AUUGACAGAAGAUAGAGAGCAC%0D%0A%3Eath-miR158a%0D%0AUCCCAAAUGUAGACAAAGCA%0D%0A%3Eath-miR398a%0D%0AUGUGUUCUCAGGUCACCCCUU%0D%0A%3Eath-miR398b%0D%0AUGUGUUCUCAGGUCACCCCUG%0D%0A%3Eath-miR398c%0D%0AUGUGUUCUCAGGUCACCCCUG%0D%0A%3Eath-miR834%0D%0AUGGUAGCAGUAGCGGUGGUAA%0D%0A%3Eath-miR390a%0D%0AAAGCUCAGGAGGGAUAGCGCC%0D%0A%3Eath-miR390b%0D%0AAAGCUCAGGAGGGAUAGCGCC&srna_uploaded=&target=&target_content=%3EAT1G27360.1%0D%0AAAGGTATCTATTTGCCTAGCCAGAGTTATATATAGGATTGATTGTCTAGTCTTTTCTTATATGATTTTTGTTCTCATTTACTAATCAAAGTTCTGCAAACTTGTAGTTGTTGTAGGATTTGTTGCTCTGGCTCTGGTGGTAGGTCTATGAAATCAACCCATATCGTGAATGGACTGCAACATGGTATCTTCGTCCCAGTGGGATTGGGAGCATTTGATCATGTCCAATCCGTCAAGGACTGAAGATGACAGCAAACAGCTACCTACTGAGTGGGAAATTGAAAAAGGTGAAGGAATTGAATCTATAGTTCCACATTTCTCAGGCCTTGAGAGAGTCAGTAGTGGCTCTGCCACCAGCTTCTGGCACACTGCTGTATCGAAAAGCTCACAGTCGACCTCTATCAACTCATCATCTCCCGAAGCCAAACGATGCAAGCTTGCATCAGAAAGTTCCCCTGGAGATTCTTGCAGCAACATAGACTTTGTCCAGGTGAAGGCTCCCACAGCTCTCGAGGTATCCGTTGCCTCAGCTGAATCAGATCTTTGTTTAAAACTAGGAAAGCGGACATACTCTGAAGAATACTGGGGTAGAAACAATAATGAAATTTCAGCGGTTTCTATGAAGTTGTTAACTCCATCTGTTGTCGCTGGGAAATCCAAATTGTGTGGTCAGAGCATGCCAGTCCCGCGTTGCCAAATTGATGGCTGTGAACTGGATCTCTCATCTGCTAAGGGTTATCATCGTAAGCACAAAGTCTGCGAAAAGCATTCAAAGTGCCCAAAAGTTAGCGTGAGTGGCCTGGAACGTCGGTTCTGCCAACAGTGTAGCAGGTTCCATGCTGTCTCTGAATTTGATGAGAAGAAACGAAGCTGCCGAAAACGTCTTTCTCATCATAATGCGAGGCGTCGTAAGCCACAAGGAGTATTTTCAATGAATCCCGAGAGGGTGTATGATCGAAGACAGCATACAAATATGTTGTGGAATGGGGTGTCCCTTAACGCGAGATCTGAAGAAATGTATGAATGGGGTAATAACACTTATGATACAAAGCCTAGACAAACGGAAAAAAGCTTTACTCTGAGCTTCCAGAGAGGTAATGGCTCTGAGGACCAGCTGGTTGCTAGTAGCAGCCGTATGTTCTCTACATCTCAAACCTCAGGTGGGTTCCCAGCAGGAAAGTCCAAGTTTCAACTTCATGGCGAAGATGTGGGAGAATACTCAGGAGTCCTCCATGAATCTCAAGATATCCACCGTGCTCTCTCTCTTCTGTCAACCTCTTCGGATCCCCTGGCCCAACCACATGTGCAGCCATTTTCTCTACTCTGTTCATATGATGTTGTACCAAAATAGATGAGTAAGTAATGTGTAATTTGTAAACCTGTTACTCAGTTGGTGGATACTTTTCCAAACCTATGATAAAAACCTCGTCCTAGATCCCGTTAAATGCCAAACTTTCGGCTACTATAACTATGTTATCGTTATCATTATCATTGTTTAACACCCT%0D%0A%3EAT1G27360.4%20%7C%20Symbols%3A%20%20%7C%20squamosa%20promoter-binding%20protein-like%2011%20(SPL11)%20%7C%20chr1%3A9501808-9503856%20FORWARD%0D%0ACTGGGTGAAACATAGAAAAGTTTCTCTTGCTCAAGTTAATGATAAAAGGGTGAGAGCAATAAACGCTGATAAGCCTTGTCTGGTCCTTGGAATTTTGAATTTTCTTTTTCTATCTTACTTATAGTATTGGTAGTTGAGGGTGTCGTCGATAAGTTGTTGTAGGATTTGTTGCTCTGGCTCTGGTGGTAGGTCTATGAAATCAACCCATATCGTGAATGGACTGCAACATGGTATCTTCGTCCCAGTGGGATTGGGAGCATTTGATCATGTCCAATCCGTCAAGGACTGAAGATGACAGCAAACAGCTACCTACTGAGTGGGAAATTGAAAAAGGTGAAGGAATTGAATCTATAGTTCCACATTTCTCAGGCCTTGAGAGAGTCAGTAGTGGCTCTGCCACCAGCTTCTGGCACACTGCTGTATCGAAAAGCTCACAGTCGACCTCTATCAACTCATCATCTCCCGAAGCCAAACGATGCAAGCTTGCATCAGAAAGTTCCCCTGGAGATTCTTGCAGCAACATAGACTTTGTCCAGGTGAAGGCTCCCACAGCTCTCGAGGTATCCGTTGCCTCAGCTGAATCAGATCTTTGTTTAAAACTAGGAAAGCGGACATACTCTGAAGAATACTGGGGTAGAAACAATAATGAAATTTCAGCGGTTTCTATGAAGTTGTTAACTCCATCTGTTGTCGCTGGGAAATCCAAATTGTGTGGTCAGAGCATGCCAGTCCCGCGTTGCCAAATTGATGGCTGTGAACTGGATCTCTCATCTGCTAAGGGTTATCATCGTAAGCACAAAGTCTGCGAAAAGCATTCAAAGTGCCCAAAAGTTAGCGTGAGTGGCCTGGAACGTCGGTTCTGCCAACAGTGTAGCAGGTTCCATGCTGTCTCTGAATTTGATGAGAAGAAACGAAGCTGCCGAAAACGTCTTTCTCATCATAATGCGAGGCGTCGTAAGCCACAAGGAGTATTTTCAATGAATCCCGAGAGGGTGTATGATCGAAGACAGCATACAAATATGTTGTGGAATGGGGTGTCCCTTAACGCGAGATCTGAAGAAATGTATGAATGGGGTAATAACACTTATGATACAAAGCCTAGACAAACGGAAAAAAGCTTTACTCTGAGCTTCCAGAGAGGTAATGGCTCTGAGGACCAGCTGGTTGCTAGTAGCAGCCGTATGTTCTCTACATCTCAAACCTCAGGTGGGTTCCCAGCAGGAAAGTCCAAGTTTCAACTTCATGGCGAAGATGTGGGAGAATACTCAGGAGTCCTCCATGAATCTCAAGATATCCACCGTGCTCTCTCTCTTCTGTCAACCTCTTCGGATCCCCTGGCCCAACCACATGTGCAGCCATTTTCTCTACTCTGTTCATATGATGTTGTACCAAAATAGATGAGTAAGTAATGTGTAATTTGTAAACCTGTTACTCAGTTGGTGGATACTTTTCCAAACCTATGATAAAAACCTCGTCCTAGATCCCGTTAAATGCCAAACTTTCGGCTACTATAACTATGTTATCGTTATCATTATCATTGTTTAACACCCT%0D%0A%3EAT1G32140.1%20%7C%20Symbols%3A%20%20%7C%20F-box%20family%20protein%20%7C%20chr1%3A11562722-11564813%20REVERSE%0D%0AATGACGATGATGTCCGACCTTTCACTTGATTTAGTCGAAGAGATATTGTGTAGGGTTCCGATAACTTCTCTTAAAGCAGTGAGATCTAGTTGCAAACTATGGAACGTTCTTTCCAAGAACCGGATTTTATGTAAAACAGAAGCTAGAAATCAGTTTTTAGGGTTCACGATAATGAATCATAGGCTTTATTCCATGAGATTCAATCTCCATGGAATCGGCCTCAATGAAAACAGTGAAGAGTTCATTGATCCATCTATAAAGCCAATAGGTAATTTACTTAATCAAGTCGAGATATCTAAAGTGTTTTATTGCGAAGGTTTATTGTTATGCGTCACAAGGAACCACTCAAGCAAGCTCGTAGTTTGGAACCCGTATTTGGGAGAAATTCGTTGGATCAAAACTAGGAATGATTACCACATAGGCGTTACATATGCTCTCGGGTACGACAACAACAAGAACCACATGATCTTGAGGTTTTTTTCTGAACAAGGCTACTACGAGATTTACGACATGAACTCTTCTGACTCATGGGATTGTTTTTATGGCATTCCCAACAAGGGGTTAAAATGTTATCAGCCCGGCGCGTCGTTAAATGGAAATGCTTATTTTTTGACTGAGGGAAGAGAAGTAATGGAAGGGTATGATTGCTTACTCGGTTTTGATTTTACAACAAAGAAATTTGGACCACTTCTTTCTTTGTCGTTTTCGCATGATTTTATAGAGACTGGGAGACTATCTTGTGTTAAAGGAGAGAAACTTGCGGTCTTATATCAGCGCTGCTATACCTATGAGATGGAGGTATGGGTGACAACTAAGATAGAGCCGAATGCGGTGTCATGGAGCAAATTCTTAGCAGTTGAAATGGAACCACTCACTAGTCTAAAGTTTAACGATGATTCTGGCAGTTTCTTCATTGACGAAGAGAAGAAAATCGTCGTGGTTTTTGATATAGACGAATCTGAACGCAACAATACGGCTTACATCATTGGAGATTATGGATGCTTGAAAGAAGTTGATCTTGATGAAGTTGTGAACCCACAAGAATCTGTGGAGGTCGGAGACCGCATTTATTCTTTTTCACCATTTGTGTGCTCTTGCTCTTATGTTCCAAGTTTAGTGAAATTTAAAGAAGATGCAGAACATGAAAGGAAAGATAAGAAGAGGAAGAGTAAGAGGAAGCGAACCAACAAGGATGGATATGATTTTATACTGTGTTTTGATTTTACAACCGAGAGATTTGGACAGATTCTTCCTCTGCCGTTTAAACATTCTTTTAGGGATACTTGGACTCTATCTTCTGTTAAAGAAGAGAAACTTGCAGTCGCAGTGTTATACTGGAAAAATACATGTGTGATGATAGAGATATGGATGACAATTAAGATTGATCCTAATGTCGAGTCGTGGAGCAAATTCTTAAGAGTTGATAGGAAACCATGCATTGATCTTCGCTTTGATGATCGTAATGACAGTTTTTTCATTGACGAAGAGAAGAAAGTTGTCGTGTTTTTTAGTTCAGACAAAGTTAAAACCTCTACGGCTTACGTCATTGGAGATAATAGATATTTGAGAACAGTGGATCTTGAAAAAGCTGCAAACTCCCAAGAATCTGTGGAGGTCGGAGAACGCGTGTATTGTTTTTCGCCGCTTGTGTGCTCTTGCTCTTATTATGTTCCAAGTTTGGTGAAAATCAACCACAATGCAGGACGCAAAAGGAAAGAGAAGAAGACGAAGCGCAAAAGTAAAGACAAGCAGATGAAACTAAGCAACAAGGTGTAA%0D%0A%3EAT2G42200.1%20%7C%20Symbols%3A%20%20%7C%20squamosa%20promoter-binding%20protein-like%209%20(SPL9)%20%7C%20chr2%3A17594485-17596708%20FORWARD%0D%0AACCACTCTCGTCTCTTTCTTTTTTCCTTCTGTTCTGTTTCTCTCTCTAAACCCAAAACAGTCAAAATCAGGGAAGCCGAAATTTTCTTTGCTTTCTTCTCCTTTGGTCCTTTCTTTAAACCCGAGACAGTTAGGTTTGTGTGAGAGAGAGAATGATGAGTAAAACCCTTTCTGTCTGAGTAAGAGGAAACCAACATGGAGATGGGTTCCAACTCGGGTCCGGGTCATGGTCCGGGTCAGGCAGAGTCGGGTGGTTCCTCCACTGAGTCATCCTCTTTCAGTGGAGGGCTCATGTTTGGCCAGAAGATCTACTTCGAGGACGGTGGTGGTGGATCCGGGTCTTCTTCCTCAGGTGGTCGTTCAAACAGACGTGTCCGTGGAGGCGGGTCGGGTCAGTCGGGTCAGATACCAAGGTGCCAAGTGGAAGGTTGTGGGATGGATCTAACCAATGCAAAAGGTTATTACTCGAGACACCGAGTTTGTGGAGTGCACTCTAAAACACCTAAAGTCACTGTGGCTGGTATCGAACAGAGGTTTTGTCAACAGTGCAGCAGGTTTCATCAGCTTCCGGAATTTGACCTAGAGAAAAGGAGTTGCCGCAGGAGACTCGCTGGTCATAATGAGCGACGAAGGAAGCCACAGCCTGCGTCTCTCTCTGTGTTAGCTTCTCGTTACGGGAGGATCGCACCTTCGCTTTACGAAAATGGTGATGCTGGAATGAATGGAAGCTTTCTTGGGAACCAAGAGATAGGATGGCCAAGTTCAAGAACATTGGATACAAGAGTGATGAGGCGGCCAGTGTCGTCACCGTCATGGCAGATCAATCCAATGAATGTATTTAGTCAAGGTTCAGTTGGTGGAGGAGGGACAAGCTTCTCATCTCCAGAGATTATGGACACTAAACTAGAGAGCTACAAGGGAATTGGCGACTCAAACTGTGCTCTCTCTCTTCTGTCAAATCCACATCAACCACATGACAACAACAACAACAACAACAACAACAACAACAACAACAACAATACATGGCGAGCTTCTTCAGGTTTTGGCCCGATGACGGTTACAATGGCTCAACCACCACCTGCACCTAGCCAGCATCAGTATCTGAACCCGCCTTGGGTATTCAAGGACAATGATAATGATATGTCTCCTGTTTTGAATTTAGGTCGATACACCGAGCCAGATAATTGTCAGATAAGTAGTGGCACGGCAATGGGTGAGTTCGAGTTATCTGATCACCATCATCAAAGTAGGAGACAGTACATGGAAGATGAGAACACAAGGGCTTATGACTCTTCTTCTCACCATACCAACTGGTCTCTCTGACTTGTCTTTGCATCAGAGAATCTTCTTACAATGAACGATTCTGCAATATCTTATCTTTTTGCTTCTTTGTTTATTCTGTTATCTGCTATCAATAAACCAGACAATTGTTGCCAGATAATGGCTTTTGATTTTGATTTGTTGTTTTATCTCCATGAAAATCCAAGTTATGAGATCAGATT%0D%0A%3EAT1G49910.1%20%7C%20Symbols%3A%20%20%7C%20WD-40%20repeat%20family%20protein%20%2F%20mitotic%20checkpoint%20protein%2C%20putative%20%7C%20chr1%3A18482693-18485143%20FORWARD%0D%0AATGACTTTGGTGCCGGCCATTGGTCGCGAGCTCTCGAATCCACCGTCCGATGGGATTTCTAATCTTCGATTTTCTAATAACAGCGATCATTTACTAGTCTCTTCATGGGATAAGAGTGTAAGATTGTATGATGCGAACGGCGATTTGATGAGAGGGGAGTTTAAACATGGTGGAGCGGTACTCGATTGCTGCTTCCATGATGATTCTTCTGGATTCAGTGTTTGCGCCGATACTAAAGTTAGAAGAATTGACTTCAATGCTGGCAAAGAAGACGTTTTGGGTACGCATGAGAAGCCAGTTCGATGTGTTGAGTATTCTTATGCTGCAGGGCAAGTGATCACTGGAAGTTGGGATAAAACGATTAAATGTTGGGATCCAAGAGGTGCAAGTGGGACTGAGCGCACACAGATTGGAACTTATATGCAACCTGAGCGTGTTAACTCTCTTTCTCTTGTTGGAAATCGTTTGGTAGTGGCAACAGCAGGAAGGCATGTCAACATTTATGATCTTAGAAATATGTCCCAGCCTGAGCAAAGAAGAGAGTCCTCACTTAAATACCAGACAAGATGTGTACGTTGTTATCCCAACGGAACAGGATATGCCCTTAGCTCTGTTGAAGGGAGGGTTTCAATGGAGTTTTTTGATCTATCAGAAGCTGCTCAGGCTAAAAAATATGCTTTCAAATGTCACCGGAAATCAGAGGATGGAAGGGACATTGTCTACCCTGTAAATGCAATTGCTTTCCATCCGATTTATGGCACTTTTGCTTCCGGAGGCTGTGATGGTTTTGTCAACATTTGGGACGGTAACAATAAGAAGAGGCTTTATCAGTACTCTAAGTATCCAACAAGTATTGCGGCGCTGTCATTCAGCCGAGATGGTGGATTACTGGCTGTTGCTTCTAGTTACACGTTTGAAGAGGGAGACAAACCGCATGAACCGGACGCCATCTTTGTTAGAAGTGTTAATGAAATTGAAGTGAAACCGAAACCCAAAGTATACCCAAATCCCCCGGTATAGTCAAGAAATAATGGAATGAGCAGAGTCAAATTCGACTTGTGTGTTGTTGTATTGTAGCACTTGAAAGTGAGTTATAAAATCTTATTTTGGCTGTAAAGTGAAATGTGAACGTTATAATGGCTTTCGAATCTGAGATGGTGTTCCATTTACTCTCTGGGTTGCCTCCAATTTTTCTTTTAGGACCAACTATCTTATTTTTACCTT%0D%0A%3EAT1G48460.1%20%7C%20Symbols%3A%20%20%7C%20similar%20to%20unknown%20protein%20%5BArabidopsis%20thaliana%5D%20(TAIR%3AAT5G63040.2)%3B%20similar%20to%20Os01g0704200%20%5BOryza%20sativa%20(japonica%20cultivar-group)%5D%20(GB%3ANP_001044004.1)%3B%20similar%20to%20hypothetical%20protein%20MtrDRAFT_AC124952g33v1%20%5BMedicago%20truncatula%5D%20(GB%3AABE93586.1)%3B%20contains%20domain%20Multidrug%20resistance%20ABC%20transporter%20MsbA%2C%20N-terminal%20domain%20(SSF90123)%20%7C%20chr1%3A17915014-17916968%20FORWARD%0D%0AAGAAAACAATTCCAAAAAATAAAATGTCAGAAAAGAATTTTCTTTTAGAATAAAGACAGTGAAGAGATTTATTTCAAAGCCTGGGTTTAAGCTGCTGAGAGAACACAAAAAACCCTAACAAAAATGGAATCGAAAGCAATTTGCTTAGGGTTTCTTCCTCCAAGACTTCGATTTTCATCTCCACGTTTACTCTCTCTTCCTCCTTCTCCTCCTGCTTCTTCCACATTTGCGACGCGTCACAAACTTGATTCCAGACAAACCCTCCTTTGGAACAAACCGCAATTGAGCCGAGTTCGTGTAGCGTGTTCTTCTTCTCAATCTGACTCAAGACCTGAGAAGAAGCAATCGGATAAGAGTAACTATGCTCGAGCTGAGCTGTTCCGTGGGAAATCAGGTTCTGTTTCTTTCAATGGTCTGACTCATCAGCTGGTTGAAGAAAGTAAACTGGTTTCAGCTCCGTTTCAAGAAGAGAAAGGTTCTTTCTTGTGGGTTTTGGCTCCTGTTGTTTTGATTTCTTCGTTGATTCTTCCTCAGTTCTTTCTAAGTGGTATCATTGAAGCTACCTTCAAAAACGACACTGTTGCTGAAATTGTTACTTCTTTTTGCTTTGAGACGGTGTTTTATGCTGGTCTTGCGATATTCCTGTCTGTGACTGACCGAGTGCAGAGGCCGTACTTAGACTTCAGCTCCAAGAGATGGGGTCTGATCACTGGACTGAGGGGATACCTTACGTCTGCATTCCTCACGATGGGTTTAAAAGTTGTAGTTCCCGTATTTGCTGTTTACATGACTTGGCCAGCTCTTGGAATAGATGCTTTGATTGCAGTGCTTCCTTTCTTGGTTGGCTGTGCAGTTCAAAGAGTTTTCGAGGCTCGGCTTGAAAGACGTGGCTCATCCTGTTGGCCCATTGTTCCAATAGTCTTTGAGGTGTATAGGCTGTATCAGGTGACAAGAGCAGCGACTTTTGTTCAGAGGCTGATGTTTATGATGAAAGATGCGGCAACGACTGCTGAAATAACAGAGCGAGGAGTTGCACTAGTTGGTTTGGTTGTGACTTTGCAGTTTCTAGCTGTTATGTGTCTCTGGTCGTTTATCACTTTTCTTATGCGCCTCTTTCCTTCTAGACCTGTAGGTGAAAACTACTAGATCTCAGTGTTTAGTGATTGTTAGATGTAGCCAAATCCCATCGGTTTTGTTTTGTTTCTGTGTTCATTTCAGTAGTAATGAATTGTATTAAGTCACTTTAAGAATTGGTTGATCATGTGAAATGAGAATTGGCTGGAAATGTTATAGAACG%0D%0A%3EAT5G50570.2%20%7C%20Symbols%3A%20%20%7C%20squamosa%20promoter-binding%20protein%2C%20putative%20%7C%20chr5%3A20599309-20601106%20REVERSE%0D%0AAAAAAGGACAAATCTTGATATTGCTTTGATTGCTGTTGTGTATGTATGTGTTTTTATAGTGAGAGAAGAAAAAAAAGCACAATCTTTGAATGGACTGGAATTTCAAACTTAGCTCTGGTTATTTATCTGGATTCGATCAAGAACCAGATTTATCACCAATGGATGGTTCGATCTCGTTTGGTGGGTCGTCACAGTCAAAAGCGGATTTTTCATTTGATCTAAAACTTGGAAGAAACATTGGAAACTCTTCCTCTGTTTTTGGTGATACAGAGCAAGTGATTAGTCTTAGTAAGTGGAAAGATAGTGCTTTAGCTAAACCAGAAGGTTCAAGAAGCTCGAGTTCAAAGAGAACAAGAGGGAATGGTGTTGGAACCAACCAGATGCCGATTTGTCTTGTTGATGGATGTGATTCTGATTTTAGTAATTGTAGAGAGTATCATAAGAGACATAAAGTTTGTGATGTTCATTCAAAAACTCCTGTGGTTACTATTAATGGTCATAAACAGAGGTTTTGTCAACAATGCAGCAGGTTTCATGCTTTGGAGGAGTTTGATGAAGGGAAGAGAAGTTGTAGGAAACGTCTTGATGGACATAATCGAAGACGACGGAAGCCGCAGCCTGAACATATCGGTCGTCCTGCCAACTTCTTTACGGGTTTTCAAGGTAGCAAATTGCTAGAGTTTTCTGGTGGTTCACATGTGTTTCCAACTACATCTGTGTTGAACCCGAGCTGGGGAAATAGTCTTGTAAGCGTTGCTGTAGCCGCCAATGGTTCGAGTTATGGGCAGAGCCAGAGCTATGTTGTTGGTTCTTCTCCTGCAAAGACAGGGATAATGTTTCCAATCTCTTCTTCTCCAAACAGTACCAGAAGCATAGCAAAACAATTCCCTTTCTTGCAAGAAGAAGAAAGCTCGAGAACCGCATCGTTGTGTGAGAGAATGACGAGTTGCATCCATGACTCTGATTGTGCTCTCTCTCTTCTGTCATCCTCCTCGTCGTCAGTCCCTCATTTGCTTCAACCACCACTTTCTTTGTCCCAAGAAGCAGTTGAGACAGTTTTTTACGGGTCGGGATTGTTTGAGAATGCGAGTGCAGTCTCTGATGGATCGGTTATATCCGGTAACGAGGCTGTCCGTCTTCCGCAGACATTCCCGTTTCATTGGGAGTAGTAGAAGAAGAAGTAGGTAGATAGATAGAATCAGAAAGATCTATTTGTGTCTCTTCTCTTCTCCCTCATTTTTCAATGTTCTTTATCATCATCATTGTTCTTGTTAACACTACAAGAAATATGGACATTCTTAACACACCGAAAACGCTATAATAACGTTTACATAGCGGATTCATAAACGCTGTGTTTGCCGGAGCTATCTTAGAGTGGTCACATACAATAGCGTTTCTTGCTATGCTATTAAATGTTCACATATTATGGCGTAAAAGAAATGCTTTGATTTCCTTTGTTATTGCACAATTTTGATGTTATACTTTTGTAACTCTTTTTAAGGGCTATAAACTATTATTTTGTAGCTATATTTTATAGGCTATGATCTGATATGTTGTAGCTTTATTTTTTTGGCTATAAATCTATAAACAGCCTATAAGTTTGGGACATTTTGTTACACATTTGCAAGAAACCTCTTTTTG%0D%0A&target_uploaded=&top=200'
headers = {
  'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"Windows"',
  'Upgrade-Insecure-Requests': '1',
  'Content-Type': 'application/x-www-form-urlencoded',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  'Sec-Fetch-Site': 'same-origin',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-User': '?1',
  'Sec-Fetch-Dest': 'document',
  'Cookie': 'session=eyJzaW1wbGVwYWdlIjpmYWxzZX0.YhEQeA.N4sPemlWsAA1q7EwbfdkVXJbqs8'
}

response = requests.request("POST", url, headers=headers, data=payload)
type(response)

requests.models.Response