# Common

In [1]:
# Experiment configuration: every path used by the notebook is derived from
# these three names.
experiment = "O.sativa_Test"
input_genome_name = "GCF_001433935.1.fna"
experiment_dir = "Experiment"

# <experiment_dir>/<experiment>/<genome file>
input_genome_path = "/".join([experiment_dir, experiment, input_genome_name])

# Working folders for intermediate files and final results.
temp_path = "/".join([experiment_dir, experiment, "Temp"])
result_path = "/".join([experiment_dir, experiment, "Result"])

In [2]:
import json
import time
from subprocess import Popen, PIPE, STDOUT
import math
import numpy as np
import pandas as pd
import hashlib
import requests
import os, sys, subprocess
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import urllib.parse
import glob
import os
import sys
import networkx
from networkx.algorithms.clique import find_cliques as maximal_cliques
from ast import literal_eval
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
sys.path.append("./src/")
from ct_analizer import get_row
from filter1 import filter1_run
from filter2 import filter2_run

In [3]:
# Create the Temp and Result folders. os.makedirs also creates missing parent
# folders (os.mkdir fails when Experiment/<experiment> does not exist yet) and
# exist_ok=True makes the cell safe to re-run.
os.makedirs(temp_path, exist_ok=True)
os.makedirs(result_path, exist_ok=True)

In [4]:
# Working directory at the time this cell runs.
current_path = os.getcwd()

In [5]:
def bracket_row(row):
    """Split ``row['data']`` into a sequence part and a dot-bracket part.

    The dot-bracket part starts at the first '.' or '(' found in the string.
    The prefix before it is written back to ``row['data']`` and the rest is
    stored in ``row['bracket']``. When neither character is present the whole
    string stays in ``row['data']`` and ``row['bracket']`` is empty.

    Args:
        row: mapping-like object (dict or pandas row) with a string under
            the key 'data'.

    Returns:
        The same ``row`` object, modified in place.
    """
    data = row['data']
    # str.find returns -1 for a missing character; taking min() over the raw
    # results would then cut at the last character, so only real hits count.
    starts = [pos for pos in (data.find('.'), data.find('(')) if pos != -1]
    index = min(starts) if starts else len(data)
    row['data'] = data[:index]
    row['bracket'] = data[index:]
    return row

In [6]:
def adjust(text, n=7):
    """Return ``str(text)`` right-aligned in a field of width ``n``.

    Strings that are already ``n`` characters or longer are returned unchanged.
    """
    return str(text).rjust(n)

In [7]:
def bracket_to_ct(tag, data, bracket, deltaG, negative_deltaG=True):
    """Render a dot-bracket structure as CT-format text.

    Args:
        tag: label written at the end of the header line.
        data: sequence string; ``data[i]`` is printed on row ``i``.
        bracket: dot-bracket string made of '.', '(' and ')'.
        deltaG: energy as a string, optionally wrapped in parentheses.
        negative_deltaG: when True, a positive energy is negated.

    Returns:
        str: a header line followed by one line per bracket position with the
        columns position, base, previous index, next index, pairing partner
        (0 when unpaired) and position again.

    Malformed structures are reported by printing 'structure error!'. An
    unmatched ')' is reported and left unpaired instead of raising IndexError.
    """
    deltaG = float(deltaG.replace('(', '').replace(')', ''))
    if deltaG > 0 and negative_deltaG:
        deltaG = -deltaG

    size = len(bracket)
    index = np.arange(1, size + 1)
    values = np.zeros(size, dtype=int)
    stack = []  # 0-based positions of '(' that are still open
    for i, symbol in enumerate(bracket):
        if symbol == '.':
            continue
        if symbol == '(':
            stack.append(i)
        elif symbol == ')':
            if not stack:
                # Nothing to pair with: report it and keep this base unpaired.
                print('structure error!')
                continue
            partner = stack.pop()
            values[partner] = i + 1
            values[i] = partner + 1
        else:
            print('structure error!')
    if stack:
        print('structure error!')

    # Collect the lines and join once instead of growing a string in a loop.
    rows = [f"{adjust(len(data),6)} dG ={adjust(deltaG,10)} {tag}\n"]
    for i in range(size):
        rows.append(
            f"{adjust(index[i],6)} {data[i]} {adjust(i,6)} {adjust((i+2)%(len(data)+1),6)} {adjust(values[i],6)} {adjust(index[i],7)}\n"
        )
    return ''.join(rows)

In [8]:
def fasta_to_df(path):
    """Load a FASTA file into a DataFrame with the columns 'tag' and 'data'.

    'tag' is the header line without its leading '>'. 'data' is the
    concatenation of the sequence lines that follow that header. Empty lines
    are skipped, and sequence lines before the first header are discarded.
    """
    tags = []
    chunks = []  # one list of sequence lines per record
    with open(path, 'r') as handle:
        for line in handle.read().split('\n'):
            if not line:
                continue
            if line[0] == '>':
                tags.append(line[1:])
                chunks.append([])
            elif chunks:
                chunks[-1].append(line)
    return pd.DataFrame({'tag': tags, 'data': [''.join(parts) for parts in chunks]})

In [9]:
def df_to_fasta(df, path):
    """Write ``df`` to ``path`` as FASTA, one '>tag' line and one 'data' line per row."""
    records = [f">{row['tag']}\n{row['data']}\n" for _, row in df.iterrows()]
    with open(path, 'w') as handle:
        handle.writelines(records)

In [10]:
def reformat(path):
    """Replace '(', ')' and ':' with '_' and drop every '.' in ``path``."""
    table = str.maketrans({'(': '_', ')': '_', ':': '_', '.': None})
    return path.translate(table)

In [11]:
def reformatCT(path):
    """Read a CT file and return its text with normalised whitespace.

    Tabs are treated as spaces, runs of spaces collapse to a single space,
    leading and trailing spaces are removed, and lines that end up empty are
    dropped. An empty file or a whitespace-only line no longer raises
    IndexError; an empty file yields ''.

    Args:
        path: path of the CT file to read.

    Returns:
        str: the cleaned lines joined with a newline.
    """
    with open(path, 'r') as file:
        raw_lines = file.read().split('\n')
    cleaned = []
    for line in raw_lines:
        # Splitting on single spaces and dropping the empty pieces collapses
        # repeated spaces and trims both ends in one step.
        fields = [field for field in line.replace('\t', ' ').split(' ') if field]
        if fields:
            cleaned.append(' '.join(fields))
    return '\n'.join(cleaned)

In [12]:
def get_ct_data(ct):
    """Parse space-separated CT text, ignoring its first (header) line.

    Returns:
        list: ``[nucleotide, index, values]`` taken from columns 1, 5 and 4
        of the parsed table, in that order.
    """
    _header, _newline, body = ct.partition('\n')
    table = pd.read_csv(StringIO(body), sep=" ", header=None)
    return [table.iloc[:, column] for column in (1, 5, 4)]

In [13]:
def ct2dot_bracket(path):
    """Convert the CT file at ``path`` to sequence plus dot-bracket text.

    Returns:
        str: the nucleotides on the first line, then one structure character
        per CT row: '.' for an unpaired base (partner 0), '(' when the partner
        has not been seen yet, ')' when the partner was opened earlier.
    """
    [nucleotide, index, values] = get_ct_data(reformatCT(path))
    pieces = [''.join(nucleotide) + "\n"]
    # A set gives constant-time membership tests; the original list made the
    # loop quadratic in the sequence length. The output is unchanged.
    opened = set()
    for position, partner in zip(index, values):
        if partner == 0:
            pieces.append('.')
            continue
        if partner not in opened:
            pieces.append('(')
            opened.add(position)
        if partner in opened:
            pieces.append(')')
    return ''.join(pieces)

In [14]:
def is_nested(index, values):
    """Return False as soon as a partner value exceeds the current limit, else True.

    ``index`` and ``values`` are walked in parallel. The limit starts at
    ``max(index) + 10`` (used as "infinity"), is lowered to any smaller
    non-zero partner value, and is reset to "infinity" once the position
    reaches it.
    """
    sentinel = max(index) + 10  # stands in for infinity
    limit = sentinel
    for position, partner in zip(index, values):
        if partner != 0 and partner < limit:
            limit = partner
        if position >= limit:
            limit = sentinel
        if partner > limit:
            return False
    return True

# Download data from miRBase

In [15]:
# Local folder holding the miRBase files read by the following cells
# (mature.fa, mature_high_conf.fa, organisms.txt).
directory = "./miRBase_driven_data"
# miRBase FTP location; no cell visible in this part of the notebook uses it.
base = "https://www.mirbase.org/ftp/CURRENT"

In [16]:
# Load all mature miRNAs and the high-confidence subset.
mature = fasta_to_df(f'{directory}/mature.fa')
mature_high_conf = fasta_to_df(f'{directory}/mature_high_conf.fa')
# 'trim tag' keeps the first two space-separated fields of the header; a record
# is flagged as confident when that trimmed header appears among the
# high-confidence headers.
mature['trim tag'] = mature['tag'].str.split(' ').str[:2].str.join(' ')
mature['confidence'] = mature['trim tag'].isin(mature_high_conf['tag'])

In [17]:
# The organism code is the part of the header before the first '-'
# (e.g. 'cel' in 'cel-let-7-5p ...'). A fixed 3-character slice would truncate
# codes that are longer than three characters, so split on '-' instead.
mature['organism'] = mature['tag'].str.split('-').str[0]
print(mature.shape)
mature.head(2)

(48885, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
0,cel-let-7-5p MIMAT0000001 Caenorhabditis elega...,UGAGGUAGUAGGUUGUAUAGUU,cel-let-7-5p MIMAT0000001,True,cel
1,cel-let-7-3p MIMAT0015091 Caenorhabditis elega...,CUAUGCAAUUUUCUACCUUACC,cel-let-7-3p MIMAT0015091,True,cel


In [18]:
# Read the tab-separated organism table and strip '#' from its column names.
organism = pd.read_csv(f'./{directory}/organisms.txt', sep='\t')
organism = organism.rename(columns=lambda name: name.replace('#', ''))
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [164]:
# Distinct taxonomy strings, shortest first.
items = sorted(organism['tree'].unique(), key=len)
items

['Viruses;',
 'Mycetozoa;',
 'Alveolata;',
 'Metazoa;Porifera;',
 'Metazoa;Cnidaria;',
 'Viridiplantae;Chlorophyta;',
 'Viridiplantae;Embryophyta;',
 'Viridiplantae;Coniferophyta;',
 'Viridiplantae;Magnoliophyta;',
 'Metazoa;Bilateria;Deuterostoma;',
 'Chromalveolata;Heterokontophyta;',
 'Metazoa;Bilateria;Ecdysozoa;Nematoda;',
 'Metazoa;Bilateria;Lophotrochozoa;Annelida;',
 'Metazoa;Bilateria;Lophotrochozoa;Nemertea;',
 'Metazoa;Bilateria;Lophotrochozoa;Mollusca;',
 'Viridiplantae;Magnoliophyta;monocotyledons;',
 'Metazoa;Bilateria;Deuterostoma;Hemichordata;',
 'Metazoa;Bilateria;Deuterostoma;Echinodermata;',
 'Metazoa;Bilateria;Lophotrochozoa;Brachiopoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Hexapoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Crustacea;',
 'Metazoa;Bilateria;Lophotrochozoa;Platyhelminthes;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Chelicerata;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Mandibulata;',
 'Viridiplantae;Magnoliophyta;eudicotyledons;Poaceae;',
 'M

In [165]:
# Keep the organisms whose taxonomy string contains 'Viridiplantae;'.
is_viridiplantae = organism['tree'].str.contains("Viridiplantae;", regex=False)
selectedTree = organism[is_viridiplantae]
print(selectedTree.shape)
selectedTree.head(5)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352
68,ppt,PPT,Physcomitrella patens,Viridiplantae;Embryophyta;,3218
69,smo,SMO,Selaginella moellendorffii,Viridiplantae;Embryophyta;,88036
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702


In [166]:
# Drop the organism named "Oryza sativa" from the selection.
selectedTree = selectedTree.loc[selectedTree['name'].ne("Oryza sativa")]

In [167]:
# Mature miRNAs that belong to one of the selected organisms.
in_selected_tree = mature['organism'].isin(selectedTree['organism'])
selected = mature.loc[in_selected_tree]
print(selected.shape)
selected.head(1)

(9676, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
316,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,ath-miR156a-5p MIMAT0000166,False,ath


In [168]:
#selected = selected[selected['confidence']]
#selected.shape

In [169]:
# Write the selected mature miRNAs as FASTA; this file is the cd-hit-est input below.
df_to_fasta(selected, f"./{temp_path}/mature_microRNA_queries.fasta")

In [170]:
# Attach the organism metadata to every selected miRNA.
selected = selected.merge(selectedTree, how="inner", on="organism")
selected.shape

(9676, 9)

In [124]:
# Distinct high-confidence sequences, with U replaced by T.
confident_rna = selected.loc[selected['confidence'], 'data']
confidence = set(confident_rna.str.replace("U", "T", regex=False))
print(len(confidence))
confidence

63


{'AAGCTCAGGAGGGATAGCGCC',
 'AATCGACGGCCTCAGTCAGGG',
 'AGAAGCTGCAGCTGTCAGAAGCTC',
 'AGAATCTTGATGATGCTGCAT',
 'AGATCATGTTGCAGCTTCACT',
 'AGCTTCTGACAGCTGCAGTTTCTC',
 'AGGTATTGGCGTGCCTCAATC',
 'ATAGTTCAAGAAAGTCCTTGGAAA',
 'ATCATGCATGACAGCCTCATTT',
 'ATGGTTCAAGAAAGCCCATGGAAA',
 'CAGCAAGAACTGGATCTTAAT',
 'CAGGGATGAGGCAGAGCATGG',
 'CGACAGAAGAGAGTGAGCATA',
 'CGCTATCTATCCTGAGCTCC',
 'CTGCACTGCCTCTTCCCTGGC',
 'CTGGCCGAGGCCGTCGATTCT',
 'CTTCGGGGGAGGAGAGAAGC',
 'GATCCCGCCTTGCACCAAGTGAAT',
 'GCAGCACCATCAAGATTCAC',
 'GCGTGCAAGGAGCCAAGCATG',
 'GCGTGCACGGAGCCAAGCATA',
 'GCTAGAGGTGGCAACTGCATA',
 'GCTCACTCTCTATCTGTCAGC',
 'GCTCACTTCTCTCTCTGTCAGC',
 'GCTCACTTCTCTTTCTGTCAGC',
 'GCTCGCTCCTCTTTCTGTCAGC',
 'GGAATGTTGGCTGGCTCGAGG',
 'GGAATGTTGTCTGGCTCGAGG',
 'GGAATGTTGTCTGGCTCGGGG',
 'GGAATGTTGTCTGGTCCGAG',
 'GGAATGTTGTCTGGTTCAAGG',
 'GGATATTGGTGCGGTTCAATC',
 'GGATTGAGCCGCGTCAATATC',
 'GGATTGTTGTCTGGTTCAAGG',
 'GGTCAAGAAAGCTGTGGGAAG',
 'GGTTTGTTGTCTGGCTCGAGG',
 'GTAATATACTAATCCGTGCAT',
 'GTTGCACGGGTTTGTATGTTG

In [172]:
# Hard-coded set of 63 sequences; it appears to be a pasted copy of the set
# printed by the previous cell (TODO confirm). Running this cell replaces the
# computed `confidence` value with this literal.
confidence = {'AAGCTCAGGAGGGATAGCGCC',
 'AATCGACGGCCTCAGTCAGGG',
 'AGAAGCTGCAGCTGTCAGAAGCTC',
 'AGAATCTTGATGATGCTGCAT',
 'AGATCATGTTGCAGCTTCACT',
 'AGCTTCTGACAGCTGCAGTTTCTC',
 'AGGTATTGGCGTGCCTCAATC',
 'ATAGTTCAAGAAAGTCCTTGGAAA',
 'ATCATGCATGACAGCCTCATTT',
 'ATGGTTCAAGAAAGCCCATGGAAA',
 'CAGCAAGAACTGGATCTTAAT',
 'CAGGGATGAGGCAGAGCATGG',
 'CGACAGAAGAGAGTGAGCATA',
 'CGCTATCTATCCTGAGCTCC',
 'CTGCACTGCCTCTTCCCTGGC',
 'CTGGCCGAGGCCGTCGATTCT',
 'CTTCGGGGGAGGAGAGAAGC',
 'GATCCCGCCTTGCACCAAGTGAAT',
 'GCAGCACCATCAAGATTCAC',
 'GCGTGCAAGGAGCCAAGCATG',
 'GCGTGCACGGAGCCAAGCATA',
 'GCTAGAGGTGGCAACTGCATA',
 'GCTCACTCTCTATCTGTCAGC',
 'GCTCACTTCTCTCTCTGTCAGC',
 'GCTCACTTCTCTTTCTGTCAGC',
 'GCTCGCTCCTCTTTCTGTCAGC',
 'GGAATGTTGGCTGGCTCGAGG',
 'GGAATGTTGTCTGGCTCGAGG',
 'GGAATGTTGTCTGGCTCGGGG',
 'GGAATGTTGTCTGGTCCGAG',
 'GGAATGTTGTCTGGTTCAAGG',
 'GGATATTGGTGCGGTTCAATC',
 'GGATTGAGCCGCGTCAATATC',
 'GGATTGTTGTCTGGTTCAAGG',
 'GGTCAAGAAAGCTGTGGGAAG',
 'GGTTTGTTGTCTGGCTCGAGG',
 'GTAATATACTAATCCGTGCAT',
 'GTTGCACGGGTTTGTATGTTG',
 'TAGCCAAGGATGACTTGCCTG',
 'TAGCCAAGGATGATTTGCCTG',
 'TAGGATTCAATCCTTGCTGCT',
 'TCAGTGCAATCCCTTTGGAAT',
 'TCCAAAGGGATCGCATTGATCT',
 'TCCACAGGCTTTCTTGAACTG',
 'TCGCTTGGTGCAGATCGGGAC',
 'TCGGACCAGGCTTCAATCCCT',
 'TCGGACCAGGCTTCATTCCCC',
 'TCGGACCAGGCTTCATTCCTC',
 'TCTCCACAGGCTTTCTTGAACT',
 'TCTCTCTCTCCCTTGAAGGC',
 'TGAAGCTGCCAGCATGATCTA',
 'TGAAGCTGCCAGCATGATCTG',
 'TGACAGAAGAGAGTGAGCAC',
 'TGAGTCGCTCTTATCACTCATG',
 'TGATTGAGCCGTGCCAATATC',
 'TGCAGTTGCTGCCTCAAGCTT',
 'TGCCTGGCTCCCTGTATGCCA',
 'TGGCAAGTCTCCTCGGCTACC',
 'TGGTGATAAGGGTGTAGCTCTG',
 'TGTTGGCCCGGCTCACTCAGA',
 'TGTTGGCTCGGCTCACTCAGA',
 'TTCCACAGCTTTCTTGAACTT',
 'TTGCTGCCTCAAGCTTGCTGC'}

# Remove redundant

## cdhit-est

In [142]:
# Cluster the query FASTA with cd-hit-est. The result is written to
# NR_mature_microRNA_queries.fasta, and the accompanying
# NR_mature_microRNA_queries.fasta.clstr report is parsed in the next cell.
# NOTE(review): "-c 1" presumably requests 100% identity; confirm against the cd-hit manual.
!./Software/cdhit/cd-hit-est -i ./{temp_path}/mature_microRNA_queries.fasta  -o ./{temp_path}/NR_mature_microRNA_queries.fasta \
    -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS 0 \
    -AS 99999999 -s 0 -S 0

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 23 2021, 21:45:39
Command: ./Software/cdhit/cd-hit-est -i
         ./Experiment/O.sativa_Test/Temp/mature_microRNA_queries.fasta
         -o
         ./Experiment/O.sativa_Test/Temp/NR_mature_microRNA_queries.fasta
         -c 1 -r 0 -G 1 -g 1 -b 30 -l 10 -aL 0 -AL 99999999 -aS
         0 -AS 99999999 -s 0 -S 0

Started: Fri Jun  3 19:05:25 2022
                            Output                              
----------------------------------------------------------------
total seq: 9676
longest and shortest : 28 and 17
Total letters: 206953
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 1 X 12M = 12M
Table           : 1 X 16M = 16M
Miscellaneous   : 0M
Total           : 30M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 96168110

comparing sequences from          0  to       9676
.........
     9676  finished       

## reformat

In [143]:
# Parse the cd-hit .clstr report into a (seqid, cluster) table.
with open(f'./{temp_path}/NR_mature_microRNA_queries.fasta.clstr','r') as file:
    text = file.read()
lines = [line for line in text.split('\n') if len(line) > 0]
cluster, seqid = [], []
last_cluster = ""
for l in lines:
    if l.startswith('>'):
        # Header line: '>Cluster 12' becomes 'C12'.
        last_cluster = l.replace('>Cluster ', "C")
        continue
    # Member line: the sequence id sits between ', >' and '...'.
    cluster.append(last_cluster)
    seqid.append(l.split(', >')[1].split('...')[0])
seq2cluster = pd.DataFrame({'seqid': seqid, 'cluster': cluster})
print(seq2cluster.shape)
seq2cluster.head(2)

(9676, 2)


Unnamed: 0,seqid,cluster
0,cst-miR11332,C0
1,stu-miR7994b-5p,C1


In [144]:
# Map every query record to its cluster, add the confidence flag from
# `mature`, and save the table.
df = fasta_to_df(f"./{temp_path}/mature_microRNA_queries.fasta")
# The accession is the first space-separated field of the FASTA header.
df['accession'] = df['tag'].str.split(' ').str[0]
seq2cluster = df.merge(seq2cluster, how="inner", left_on='accession', right_on="seqid")
seq2cluster = seq2cluster.merge(mature, how="inner", on="tag")
seq2cluster = seq2cluster[['cluster','seqid','tag', 'confidence']]
print(seq2cluster.shape)
display(seq2cluster.head(2))
seq2cluster.to_csv(f'./{temp_path}/seq2cluster.csv',index=False)

(9676, 4)


Unnamed: 0,cluster,seqid,tag,confidence
0,C5049,ath-miR156a-5p,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...,False
1,C1074,ath-miR156a-3p,ath-miR156a-3p MIMAT0031865 Arabidopsis thalia...,False


In [145]:
# TODO: sort first by cluster, then by seqid (currently sorted by cluster only).
# Preview of the table ordered by cluster label; seq2cluster itself is not modified.
seq2cluster.sort_values("cluster").head(2)

Unnamed: 0,cluster,seqid,tag,confidence
8708,C0,cst-miR11332,cst-miR11332 MIMAT0044622 Cucumis sativus miR1...,False
6293,C1,stu-miR7994b-5p,stu-miR7994b-5p MIMAT0031188 Solanum tuberosum...,False


In [146]:
# Write the cluster representatives as FASTA, using the cluster label as header.
df = fasta_to_df(f"./{temp_path}/NR_mature_microRNA_queries.fasta")
df['tag'] = df['tag'].str.split(' ').str[0]
df = df.merge(seq2cluster, how="inner", left_on='tag', right_on="seqid")[['cluster','data']]

lines = [f">{row['cluster']}\n{row['data']}\n" for _, row in df.iterrows()]
print(df.shape)
# NOTE(review): later cells read BLASTn_Subject.fasta and BLASTn_queries.fasta,
# which no visible cell creates; check how they relate to this file.
with open(f'./{temp_path}/BLASTn_Viridi','w') as file:
    file.writelines(lines)

(5544, 2)


# BlastN

!sudo apt-get install ncbi-blast+


In [34]:
!makeblastdb -in {temp_path}/BLASTn_Subject.fasta -dbtype nucl -out ./{temp_path_f}/blastn_database



Building a new DB, current time: 05/29/2022 21:07:42
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
New DB title:  Experiment/O.sativa_Test/Temp/BLASTn_Subject.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 5544 sequences in 0.22806 seconds.


In [36]:
# Run blastn with the queries in BLASTn_queries.fasta against the
# blastn_database files in the Temp folder, using one thread per CPU reported
# by mp.cpu_count(). The result goes to BLASTn_result with the 27 columns
# listed after -outfmt; the next cell reads it as a tab-separated table.
!blastn -query ./{temp_path}/BLASTn_queries.fasta \
        -out ./{temp_path}/BLASTn_result \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [62]:
# Column names of the BLAST table, in the order requested with -outfmt in the
# blastn cell. `header` is not defined in any visible earlier cell, so it is
# defined here to keep the cell runnable on a fresh kernel.
header = ('qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score '
          'length pident nident mismatch positive gapopen gaps ppos frames qframe '
          'sframe sstrand qcovs qcovhsp qlen slen')
df_blastn = pd.read_csv(f'./{temp_path}/BLASTn_result', sep='\t',header=None)
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(30514, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C533,C3879,1,20,2,21,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,2e-06,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,21
1,C533,C1612,1,20,3,22,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,2e-06,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,22


In [86]:
# Keep only the hits whose e-value is at most 0.001.
df_blastn = df_blastn.loc[df_blastn["evalue"].le(0.001)]

In [38]:
# Number of distinct query ids present in df_blastn.
df_blastn['qseqid'].drop_duplicates().shape

(576,)

In [63]:
# Nonconformity = query bases outside the aligned span + gaps + mismatches.
# The column is added with assign(), which returns a new frame, so no value is
# set on a filtered slice (that raises SettingWithCopyWarning).
threshold = 3
df_blastn = df_blastn.assign(
    Nonconformity=df_blastn['qlen']
    - ((df_blastn['qend'] - df_blastn['qstart']).abs() + 1)
    + df_blastn['gaps']
    + df_blastn['mismatch']
)
df_blastn = df_blastn[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(2400, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C533,C3879,1,20,2,21,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,2e-06,37.4,...,100.0,1/1,1,1,plus,100,100,20,21,0
1,C533,C1612,1,20,3,22,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,2e-06,37.4,...,100.0,1/1,1,1,plus,100,100,20,22,0


In [64]:
# Number of distinct query ids left after the Nonconformity filter.
df_blastn['qseqid'].drop_duplicates().shape

(159,)

In [90]:
# Largest e-value among the hits currently in df_blastn.
df_blastn['evalue'].max()

0.000525

In [91]:
# Show the hits whose e-value equals the largest e-value.
worst_evalue = df_blastn['evalue'].max()
df_blastn.loc[df_blastn['evalue'].eq(worst_evalue)]

Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
746,C1078,NC_029261.1,1,22,27674759,27674780,TGAAGCTGCCAGCATGATCTGG,TGAAGCTGCCAGCATGATCTGG,0.000525,41.0,...,100.0,1/1,1,1,plus,100,100,22,31248787,0
749,C1078,NC_029257.1,1,22,3742503,3742482,TGAAGCTGCCAGCATGATCTGG,TGAAGCTGCCAGCATGATCTGG,0.000525,41.0,...,100.0,1/-1,1,-1,minus,100,100,22,35937250,0
1001,C1080,NC_029256.1,1,22,2309253,2309232,TCCAAAGGGATCGCATTGATCC,TCCAAAGGGATCGCATTGATCC,0.000525,41.0,...,100.0,1/-1,1,-1,minus,100,100,22,43270923,0
3277,C1083,NC_029263.1,1,22,21478393,21478414,GCTCACTTCTCTTTCTGTCAGC,GCTCACTTCTCTTTCTGTCAGC,0.000525,41.0,...,100.0,1/1,1,1,plus,100,100,22,28443022,0
3278,C1083,NC_029263.1,1,22,21491395,21491416,GCTCACTTCTCTTTCTGTCAGC,GCTCACTTCTCTTTCTGTCAGC,0.000525,41.0,...,100.0,1/1,1,1,plus,100,100,22,28443022,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196060,C1795,NC_029260.1,1,22,5230734,5230755,TGCTCACTTCTCTTTCTGTCAG,TGCTCACTTCTCTTTCTGTCAG,0.000525,41.0,...,100.0,1/1,1,1,plus,100,100,22,29958434,0
204012,C842,NC_029258.1,3,24,30392819,30392798,TTCGGACCAGGCTTCATTCCCC,TTCGGACCAGGCTTCATTCCCC,0.000525,41.0,...,100.0,1/-1,1,-1,minus,100,92,24,36413819,2
215190,C2004,NC_029265.1,1,22,14483325,14483304,TCGGACCAGGCTTCATTCCCCT,TCGGACCAGGCTTCATTCCCCT,0.000525,41.0,...,100.0,1/-1,1,-1,minus,100,100,22,23207287,0
215191,C2004,NC_029265.1,1,22,19987244,19987265,TCGGACCAGGCTTCATTCCCCT,TCGGACCAGGCTTCATTCCCCT,0.000525,41.0,...,100.0,1/1,1,1,plus,100,100,22,23207287,0


In [40]:
# Save the current df_blastn table (row index included) to the Temp folder.
df_blastn.to_csv(f'./{temp_path}/filtered_out_blastn.csv')

# Convert the BLASTn result to a BED file

In [41]:
# Number of bases added on each side of a hit when it is extended.
flanking_value = 200
# .copy() makes df an independent frame, so adding columns below does not
# raise SettingWithCopyWarning (seen in this cell's recorded output).
df = df_blastn[['qseqid', 'sseqid', 'sstart', 'send', 'sstrand','slen']].copy()
# Constant column; it is written as the fifth column of the BED file later.
df['ones'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ones'] = 1


In [42]:
def switch(row):
    """Return ``row`` with 'sstart' and 'send' swapped when sstart > send."""
    if row['sstart'] > row['send']:
        row['sstart'], row['send'] = row['send'], row['sstart']
    return row
# Make sstart <= send on every row.
df = df.apply(switch, axis=1)

In [43]:
def convert(inp):
    """Map 'plus' to 'forward' and 'minus' to 'reverse'; raise on anything else."""
    names = {"plus": "forward", "minus": "reverse"}
    if inp in names:
        return names[inp]
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['strand'] = df['sstrand'].apply(convert)

In [44]:
def convert2sign(inp):
    """Map 'plus' to '+' and 'minus' to '-'; raise on anything else."""
    signs = {"plus": "+", "minus": "-"}
    if inp in signs:
        return signs[inp]
    raise Exception('Error, sstrand contains illegal word! only "plus" and "minus" are allowed')
df['sign'] = df['sstrand'].apply(convert2sign)

In [45]:
# Length of the hit on the subject, both ends included.
df['hit_length'] = (df['send'] - df['sstart']).abs() + 1

## Convert sstart and send from locations to an index range (sstart is shifted down by one)

In [46]:
# Shift sstart down by one. NOTE: re-running this cell shifts it again.
df['sstart'] = df['sstart'] - 1

In [47]:
# Flank length before the hit: flanking_value, or sstart when sstart is smaller.
df['downstream_flanking'] = df['sstart'].clip(upper=flanking_value)

In [48]:
# Flank length after the hit: flanking_value, or the distance from send to
# the end of the subject (slen) when that is smaller.
df['upstream_flanking'] = (df['slen'] - df['send']).clip(upper=flanking_value)

In [49]:
# Hit start inside the extended region: the downstream flank on '+' rows,
# the upstream flank on all other rows.
df['hit_start'] = df['downstream_flanking'].where(df['sign'] == "+", df['upstream_flanking'])

In [50]:
# Hit end inside the extended region: the strand-dependent flank plus the hit length.
leading_flank = df['downstream_flanking'].where(df['sign'] == "+", df['upstream_flanking'])
df['hit_end'] = leading_flank + df['hit_length']

In [51]:
# Extend both ends by flanking_value, clamped to the range [0, slen].
df['sstart'] = (df['sstart'] - flanking_value).clip(lower=0)
df['send'] = np.minimum(df['send'] + flanking_value, df['slen'])

In [52]:
# Tag of the form '>sseqid:sstart-send(sign)', plus a reformatted variant.
df['tag'] = (
    ">" + df['sseqid'].astype(str)
    + ":" + df['sstart'].astype(str)
    + "-" + df['send'].astype(str)
    + "(" + df['sign'].astype(str) + ")"
)
df['reformated_tag'] = df['tag'].map(reformat)
# The row index is written as well; a later cell reads this file back.
df[['tag', 'reformated_tag', 'hit_start', 'hit_end']].to_csv(f'./{temp_path}/hit_index_info.csv')#, index=False)

In [53]:
# Pipe-separated location tag: '>sseqid|sign|sstart+1-send|hit_start+1-hit_end'.
df['location_tag'] = (
    ">" + df['sseqid'].astype(str)
    + "|" + df['sign'].astype(str)
    + "|" + (df['sstart'] + 1).astype(str) + "-" + df['send'].astype(str)
    + "|" + (df['hit_start'] + 1).astype(str) + "-" + df['hit_end'].astype(str)
)
df[['location_tag','qseqid']].to_csv(f'./{temp_path}/pipe_seprated_location_list.csv',index=False,sep='\t')

In [54]:
# Write the extended regions as a headerless, tab-separated file with the
# columns sseqid, sstart, send, strand, ones, sign. It is passed to
# `bedtools getfasta -bed` in a later cell.
df[['sseqid','sstart','send','strand','ones', 'sign']].to_csv(f'./{temp_path}/extension_index.bed', 
        index=False, header=False, sep="\t")

# Extension


In [55]:
# !sudo apt-get install bedtools

In [56]:
!bedtools getfasta -fi {input_genome_path} -fo ./{temp_path}/extended_original.txt -s -bed ./{temp_path}/extension_index.bed
!rm input_genome.fna.fai

index file Experiment/O.sativa_Test/GCF_001433935.1.fna.fai not found, generating...
rm: cannot remove 'input_genome.fna.fai': No such file or directory


In [39]:
# TODO: remove duplicated records.
# The snippet below is disabled: it sits inside a string literal, so it is
# never executed, and the trailing ';' hides the string from the cell output.
'''
df = fasta_to_df("./Temp/extended.txt")
df = df.drop_duplicates(subset=['tag'], keep='first')
df_to_fasta(df,"./Temp/extended.txt")
len(df['tag'].unique())
''';

# Convert hit region to upper case and other region to lower case

In [57]:
# Load the extracted sequences and the hit index table written earlier.
ext = fasta_to_df(f'./{temp_path}/extended_original.txt')
info = pd.read_csv(f'./{temp_path}/hit_index_info.csv')
# Drop the first character of every stored tag (the leading '>').
info['tag'] = info['tag'].str[1:]
print(info.shape)
info.head(2)

(218408, 5)


Unnamed: 0.1,Unnamed: 0,tag,reformated_tag,hit_start,hit_end
0,0,NC_029264.1:15065055-15065475(+),>NC_0292641_15065055-15065475_+_,200,220
1,1,NC_029264.1:18288982-18289402(-),>NC_0292641_18288982-18289402_-_,200,220


In [58]:
# Pair every extracted sequence with its hit information. Both tables are
# sorted by tag and given a helper key made of the tag plus the row position
# after sorting, so the k-th row of one table matches the k-th row of the other
# even when the same tag occurs more than once.
ext = ext.sort_values(by=['tag']).reset_index()
ext['help_tag'] = ext.apply(lambda r: r['tag'] + str(r.name),axis=1)
del ext['tag']

info = info.sort_values(by=['tag']).reset_index()
info['help_tag'] = info.apply(lambda row: row['tag']+ str(row.name),axis=1)
def redefined_tag(row):
    """Rewrite 'name:start-end(sign)' as 'name|sign|start+1-end|hit_start+1-hit_end'."""
    tag = row['tag']
    # Text between the last ':' and the first '(' following it is 'start-end'.
    [sstart, send] = tag.split(':')[-1].split('(')[0].split('-')
    sstart = int(sstart) + 1
    # The sign is the text inside the last pair of parentheses.
    sign = tag.split('(')[-1].split(')')[0]    
    return f"{tag.split(':')[0]}|{sign}|{sstart}-{send}|{row['hit_start']+1}-{row['hit_end']}"
info['tag'] = info.apply(lambda row: redefined_tag(row),axis=1)
ext = pd.merge(ext,info,how='inner', on='help_tag')

def emphasis_hit(row):
    """Return row['data'] in lower case, with [hit_start:hit_end] in upper case."""
    seq = list(row['data'].lower())            
    s = row['hit_start']
    e = row['hit_end']
    seq[s:e] = list(''.join(seq[s:e]).upper())    
    return ''.join(seq)
    
ext['data'] = ext.apply(lambda row: emphasis_hit(row),axis=1)
# Save the re-tagged, case-marked sequences as FASTA.
df_to_fasta(ext[['tag','data']],f"./{temp_path}/extended_modified.txt")

# BLAST extended modified sequences against O. sativa microRNA

In [47]:
!makeblastdb -in {temp_path}/extended_modified.txt -dbtype nucl -out ./{temp_path_f}/blastn_database1



Building a new DB, current time: 05/29/2022 20:44:01
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database1
New DB title:  Experiment/O.sativa_Test/Temp/extended_modified.txt
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database1
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 218408 sequences in 11.2246 seconds.


In [48]:
# Run blastn with the queries in BLASTn_queries.fasta against the
# blastn_database1 files in the Temp folder, using one thread per CPU reported
# by mp.cpu_count(). The result goes to BLASTn_result1 with the 27 columns
# listed after -outfmt; the next cell reads it as a tab-separated table.
!blastn -query ./{temp_path}/BLASTn_queries.fasta \
        -out ./{temp_path}/BLASTn_result1 \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database1 \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [65]:
# Column names of the BLAST table, in the order requested with -outfmt in the
# blastn cell. `header` is not defined in any visible earlier cell, so it is
# defined here to keep the cell runnable on a fresh kernel.
header = ('qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score '
          'length pident nident mismatch positive gapopen gaps ppos frames qframe '
          'sframe sstrand qcovs qcovhsp qlen slen')
df_blastn = pd.read_csv(f'./{temp_path}/BLASTn_result1', sep='\t',header=None)
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(150214, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C533,NC_029264.1|+|15065147-15065568|201-222,1,20,110,129,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.002,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,422
1,C533,NC_029264.1|+|15065146-15065568|201-223,1,20,111,130,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.002,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,423


In [66]:
# Keep only the hits whose e-value is at most 0.001.
df_blastn = df_blastn.loc[df_blastn["evalue"].le(0.001)]

In [67]:
# Number of distinct query ids present in df_blastn.
df_blastn['qseqid'].drop_duplicates().shape

(282,)

In [63]:
# Nonconformity = query bases outside the aligned span + gaps + mismatches.
# The column is added with assign(), which returns a new frame, so no value is
# set on a filtered slice (that raises SettingWithCopyWarning).
threshold = 3
df_blastn = df_blastn.assign(
    Nonconformity=df_blastn['qlen']
    - ((df_blastn['qend'] - df_blastn['qstart']).abs() + 1)
    + df_blastn['gaps']
    + df_blastn['mismatch']
)
df_blastn = df_blastn[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(16343, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
550,C226,NC_029256.1|+|4666250-4666670|201-221,1,21,201,221,GCTCACTCTCTATCTGTCAGC,GCTCACTCTCTATCTGTCAGC,0.000432,39.2,...,100.0,1/1,1,1,plus,100,100,21,421,0
551,C226,NC_029256.1|+|4666250-4666670|201-221,1,21,201,221,GCTCACTCTCTATCTGTCAGC,GCTCACTCTCTATCTGTCAGC,0.000432,39.2,...,100.0,1/1,1,1,plus,100,100,21,421,0


In [64]:
# Number of distinct query ids left after the Nonconformity filter.
df_blastn['qseqid'].drop_duplicates().shape

(281,)

# Diamond

In [111]:
# Load the headerless, tab-separated Diamond matches and name the 12 columns.
dmn = pd.read_csv(f"./{temp_path}/diamond_matches.tsv", sep='\t', header=None)
dmn.columns = [
    'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
]
# Distinct query ids that have a Diamond match.
coding_seq = pd.unique(dmn['qseqid'])

In [112]:
def clear(inp):
    """Strip a leading 'reverse::' or 'forward::' from ``inp``, if present."""
    for prefix in ("reverse::", "forward::"):
        if inp.startswith(prefix):
            return inp[len(prefix):]
    return inp
coding_seq = pd.Series(coding_seq).apply(clear)

# Drop every extended sequence whose tag has a Diamond match and save the rest.
ext = fasta_to_df(f'./{temp_path}/extended_modified.txt')
print(ext.shape)
has_match = ext['tag'].isin(coding_seq)
ext = ext[~has_match]
print(ext.shape)
df_to_fasta(ext,f'./{temp_path}/extended_modified_non_coding.txt')

(218408, 2)
(181872, 2)


In [113]:
!makeblastdb -in {temp_path}/extended_modified_non_coding.txt -dbtype nucl -out ./{temp_path_f}/blastn_database2



Building a new DB, current time: 05/28/2022 22:44:03
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database2
New DB title:  Experiment/O.sativa_Test/Temp/extended_modified_non_coding.txt
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 181872 sequences in 8.77493 seconds.


In [114]:
# Run blastn with the queries in BLASTn_queries.fasta against the
# blastn_database2 files in the Temp folder, using one thread per CPU reported
# by mp.cpu_count(). The result goes to BLASTn_result2 with the 27 columns
# listed after -outfmt; the next cell reads it as a tab-separated table.
!blastn -query ./{temp_path}/BLASTn_queries.fasta \
        -out ./{temp_path}/BLASTn_result2 \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database2 \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [184]:
# Column names of the BLAST table, in the order requested with -outfmt in the
# blastn cell. `header` is not defined in any visible earlier cell, so it is
# defined here to keep the cell runnable on a fresh kernel.
header = ('qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score '
          'length pident nident mismatch positive gapopen gaps ppos frames qframe '
          'sframe sstrand qcovs qcovhsp qlen slen')
df_blastn = pd.read_csv(f'./{temp_path}/BLASTn_result2', sep='\t',header=None)
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(127402, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C533,NC_029264.1|+|15065147-15065568|201-222,1,20,110,129,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,422
1,C533,NC_029264.1|+|15065059-15065479|201-221,1,20,198,217,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,421


In [185]:
# Keep only the hits whose e-value is at most 0.001.
df_blastn = df_blastn.loc[df_blastn["evalue"].le(0.001)]

In [186]:
# Number of distinct query ids present in df_blastn.
df_blastn['qseqid'].drop_duplicates().shape

(281,)

In [131]:
# Nonconformity = query bases outside the aligned span + gaps + mismatches.
# The column is added with assign(), which returns a new frame, so no value is
# set on a filtered slice (that raises SettingWithCopyWarning).
threshold = 4
df_blastn = df_blastn.assign(
    Nonconformity=df_blastn['qlen']
    - ((df_blastn['qend'] - df_blastn['qstart']).abs() + 1)
    + df_blastn['gaps']
    + df_blastn['mismatch']
)
df_blastn = df_blastn[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(74881, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C533,NC_029264.1|+|15065147-15065568|201-222,1,20,110,129,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,37.4,...,100.0,1/1,1,1,plus,100,100,20,422,0
1,C533,NC_029264.1|+|15065059-15065479|201-221,1,20,198,217,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.001,37.4,...,100.0,1/1,1,1,plus,100,100,20,421,0


In [132]:
# Number of distinct query ids left after the Nonconformity filter.
df_blastn['qseqid'].drop_duplicates().shape

(504,)

In [71]:
# Load the level-1 filter table from the Result folder and show its shape.
result = pd.read_csv(f"{result_path}/result_level1_filter.csv")
result.shape

(82318, 134)

In [73]:
# Header = 'seq name' followed by 'ct name'; sequence = 'precursor seq'.
result = result.assign(
    tag=result['seq name'] + result['ct name'],
    data=result['precursor seq'],
)
# The output is FASTA text even though the file name ends in .csv.
df_to_fasta(result[['tag', 'data']], f"{temp_path}/filter1_to_blast.csv")

In [74]:
!makeblastdb -in {temp_path}/filter1_to_blast.csv -dbtype nucl -out ./{temp_path_f}/blastn_database_result1



Building a new DB, current time: 05/29/2022 21:19:39
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database_result1
New DB title:  Experiment/O.sativa_Test/Temp/filter1_to_blast.csv
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database_result1
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 82318 sequences in 3.75987 seconds.


In [75]:
# Run blastn with the queries in BLASTn_queries.fasta against the
# blastn_database_result1 files in the Temp folder, using one thread per CPU
# reported by mp.cpu_count(). The result goes to BLASTn_result_filter1 with the
# 27 columns listed after -outfmt; the next cell reads it as a tab-separated table.
!blastn -query ./{temp_path}/BLASTn_queries.fasta \
        -out ./{temp_path}/BLASTn_result_filter1 \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database_result1 \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [80]:
# Column names of the BLAST table, in the order requested with -outfmt in the
# blastn cell. `header` is not defined in any visible earlier cell, so it is
# defined here to keep the cell runnable on a fresh kernel.
header = ('qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score '
          'length pident nident mismatch positive gapopen gaps ppos frames qframe '
          'sframe sstrand qcovs qcovhsp qlen slen')
df_blastn = pd.read_csv(f'./{temp_path}/BLASTn_result_filter1', sep='\t',header=None)
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(78960, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C533,NC_029263.1|+|21491035-21491455|201-221Fold,1,20,1,20,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.000134,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,182
1,C533,NC_029263.1|+|21491035-21491455|201-221Fold,1,20,1,20,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.000134,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,182


In [180]:
# Keep only hits with an e-value of at most 0.001. The explicit copy makes df_blastn an
# independent frame, so assigning the 'Nonconformity' column to it afterwards does not
# raise pandas' SettingWithCopyWarning.
df_blastn = df_blastn[df_blastn["evalue"] <= 0.001].copy()

In [181]:
df_blastn['qseqid'].unique().shape

(184,)

In [81]:
threshold = 0
# Nonconformity = query bases left outside the alignment + gaps + mismatches.
df_blastn['Nonconformity'] = (
    df_blastn['qlen']
    - ((df_blastn['qend'] - df_blastn['qstart']).abs() + 1)
    + df_blastn['gaps']
    + df_blastn['mismatch']
)
# Keep only the hits whose score does not exceed the threshold.
df_blastn = df_blastn.loc[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(4928, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C533,NC_029263.1|+|21491035-21491455|201-221Fold,1,20,1,20,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.000134,37.4,...,100.0,1/1,1,1,plus,100,100,20,182,0
1,C533,NC_029263.1|+|21491035-21491455|201-221Fold,1,20,1,20,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,0.000134,37.4,...,100.0,1/1,1,1,plus,100,100,20,182,0


In [79]:
# Manually recorded counts of unique query ids with hits — presumably at
# Nonconformity thresholds 3 (nc3) and 0 (nc0); TODO confirm.
nc3: 246
nc0: 132

In [82]:
df_blastn['qseqid'].unique().shape

(132,)

In [97]:
# Load result_level2_filter.csv from result_path and display its (rows, columns) shape.
result = pd.read_csv(f"{result_path}/result_level2_filter.csv")
result.shape

(9946, 134)

In [98]:
# Build the BLAST subject records: tag = '<seq name><ct name>', data = precursor sequence.
result['tag'] = (result['seq name'] + result['ct name'])
result['data'] = result['precursor seq']
# df_to_fasta is defined elsewhere in the notebook; makeblastdb later reads this
# file as FASTA even though it carries a .csv extension.
df_to_fasta(result[['tag', 'data']], f"{temp_path}/filter2_to_blast.csv")

In [99]:
!makeblastdb -in {temp_path}/filter2_to_blast.csv -dbtype nucl -out ./{temp_path_f}/blastn_database_result2



Building a new DB, current time: 05/29/2022 21:24:51
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database_result2
New DB title:  Experiment/O.sativa_Test/Temp/filter2_to_blast.csv
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database_result2
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 9946 sequences in 0.455464 seconds.


In [100]:
# Search BLASTn_queries.fasta against the blastn_database_result2 database and write
# the hits to BLASTn_result_filter2 in tabular form (-outfmt 6) with the 27 fields
# listed on the last line; the thread count is the number of CPUs.
!blastn -query ./{temp_path}/BLASTn_queries.fasta \
        -out ./{temp_path}/BLASTn_result_filter2 \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database_result2 \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [114]:
# Load the tab-separated blastn hits; the file has no header row.
df_blastn = pd.read_csv(f'./{temp_path}/BLASTn_result_filter2', sep='\t',header=None)
# NOTE(review): `header` is not defined in this cell — presumably the space-separated
# field list passed to blastn's -outfmt; confirm it is defined before this cell runs.
df_blastn.columns = header.replace("  "," ").split(" ")
print(df_blastn.shape)
df_blastn.head(2)

(35484, 27)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,gaps,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen
0,C533,NC_029261.1|+|26554702-26555124|201-223Fold,1,20,3,22,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.9e-05,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,91
1,C533,NC_029261.1|+|26554702-26555124|201-223Fold,1,20,3,22,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.9e-05,37.4,...,0,100.0,1/1,1,1,plus,100,100,20,91


In [175]:
# Keep only hits with an e-value of at most 0.001. The explicit copy makes df_blastn an
# independent frame, so assigning the 'Nonconformity' column to it afterwards does not
# raise pandas' SettingWithCopyWarning.
df_blastn = df_blastn[df_blastn["evalue"] <= 0.001].copy()

In [176]:
df_blastn['qseqid'].unique().shape

(114,)

In [115]:
threshold = 0
# Nonconformity = query bases left outside the alignment + gaps + mismatches.
df_blastn['Nonconformity'] = (
    df_blastn['qlen']
    - ((df_blastn['qend'] - df_blastn['qstart']).abs() + 1)
    + df_blastn['gaps']
    + df_blastn['mismatch']
)
# Keep only the hits whose score does not exceed the threshold.
df_blastn = df_blastn.loc[df_blastn['Nonconformity'] <= threshold]
print(df_blastn.shape)
df_blastn.head(2)

(1137, 28)


Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C533,NC_029261.1|+|26554702-26555124|201-223Fold,1,20,3,22,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.9e-05,37.4,...,100.0,1/1,1,1,plus,100,100,20,91,0
1,C533,NC_029261.1|+|26554702-26555124|201-223Fold,1,20,3,22,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.9e-05,37.4,...,100.0,1/1,1,1,plus,100,100,20,91,0


# Clean code

In [45]:
def Blast(output, subject, query):
    """Build a nucleotide BLAST database from `subject` and search `query` against it.

    Both steps run as IPython shell commands, so this function only works inside
    an IPython/Jupyter kernel. The database is always written to the fixed prefix
    ./{temp_path}/blastn_database, so each call reuses that same location.

    Parameters
    ----------
    output : path of the file that receives the tabular (-outfmt 6) blastn hits.
    subject : path of the sequence file indexed by makeblastdb.
    query : path of the sequence file searched against the database.

    Relies on the notebook globals `temp_path` and `mp` (multiprocessing).
    Returns None; the hits are only written to `output`.
    """
    # Step 1: index the subject sequences as a nucleotide database.
    !makeblastdb -in {subject} -dbtype nucl -out ./{temp_path}/blastn_database
    # Step 2: run blastn with one thread per CPU and the 27 output fields listed below.
    !blastn -query {query} \
        -out {output} \
        -num_threads {mp.cpu_count()} \
        -db ./{temp_path}/blastn_database \
        -word_size 7 \
        -penalty -3 \
        -reward 2 \
        -gapopen 5 \
        -gapextend 2 \
        -outfmt '6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'       

In [46]:
def getBlast(path, threshold=3):
    """Load tabular blastn output and keep the hits with a low Nonconformity score.

    The file at ``path`` must be tab-separated, have no header row and contain
    exactly the 27 fields named in ``header`` below, in that order.

    For every hit the score is::

        Nonconformity = qlen - (|qend - qstart| + 1) + gaps + mismatch

    i.e. the query bases left outside the alignment plus the gaps and mismatches
    reported for it. No strand or e-value filtering is applied.

    Parameters
    ----------
    path : str
        Location of the blastn result file.
    threshold : int, default 3
        Largest Nonconformity value that is still kept.

    Returns
    -------
    pandas.DataFrame
        The retained hits (original row index preserved) with the extra
        ``Nonconformity`` column. An empty input file yields an empty frame
        with the same 28 columns.

    Side effects
    ------------
    Prints the number of distinct ``qseqid`` values among the retained hits.
    """
    header = 'qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'
    columns = header.split()
    try:
        df_blastn = pd.read_csv(path, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty result file (e.g. a search that produced no hits) is not an error:
        # report zero matches instead of crashing the notebook.
        no_hits = pd.DataFrame(columns=columns + ['Nonconformity'])
        print(f'NC: {threshold} result: {no_hits["qseqid"].unique().shape}')
        return no_hits
    df_blastn.columns = columns
    df_blastn['Nonconformity'] = (
        df_blastn['qlen']
        - (abs(df_blastn['qend'] - df_blastn['qstart']) + 1)
        + df_blastn['gaps']
        + df_blastn['mismatch']
    )
    # Copy only the retained rows so callers get an independent frame.
    hits = df_blastn[df_blastn['Nonconformity'] <= threshold].copy()
    print(f'NC: {threshold} result: {hits["qseqid"].unique().shape}')
    return hits

# query Viridi, subject o.sativa genome

In [47]:
# query: the BLASTn_Viridi sequences, subject: the input genome (input_genome_path)
output = f"{temp_path}/blast1"
query = f"{temp_path}/BLASTn_Viridi" 
subject = input_genome_path
Blast(output=output,query=query,subject=subject)



Building a new DB, current time: 06/04/2022 21:35:09
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
New DB title:  Experiment/O.sativa_Test/GCF_001433935.1.fna
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 58 sequences in 5.09097 seconds.


In [49]:
temp = getBlast(output)

NC: 3 result: (3747,)


In [50]:
# Load the O. sativa sequences and replace every U with T so they can be compared
# with the sequences reported in the blastn hits.
o_sativa = fasta_to_df(f"{temp_path}/BLASTn_O_Sativa")
print(o_sativa.shape)
# Vectorised literal replacement; unlike apply(lambda ...), a missing value stays
# NaN instead of raising AttributeError.
o_sativa['data'] = o_sativa['data'].str.replace("U", "T", regex=False)
o_sativa.head(2)

(577, 2)


Unnamed: 0,tag,data
0,C533,TGACAGAAGAGAGTGAGCAC
1,C226,GCTCACTCTCTATCTGTCAGC


In [53]:
len(temp[temp['sseq'].isin(o_sativa['data'])]['sseq'].unique())

131

# query O.sativa, subject: Viridi

In [22]:
# query: the BLASTn_O_Sativa sequences, subject: the BLASTn_Viridi sequences
output = f"{temp_path}/blast1"
query = f"{temp_path}/BLASTn_O_Sativa"
subject = f"{temp_path}/BLASTn_Viridi" 
Blast(output=output,query=query,subject=subject)



Building a new DB, current time: 06/04/2022 21:00:38
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
New DB title:  Experiment/O.sativa_Test/Temp/BLASTn_Viridi
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 5544 sequences in 0.239723 seconds.


In [23]:
temp = getBlast(output)

NC: 3 result: (159,)


In [29]:
# Load the O. sativa sequences and replace every U with T so they can be compared
# with the sequences reported in the blastn hits.
o_sativa = fasta_to_df(f"{temp_path}/BLASTn_O_Sativa")
print(o_sativa.shape)
# Vectorised literal replacement; unlike apply(lambda ...), a missing value stays
# NaN instead of raising AttributeError.
o_sativa['data'] = o_sativa['data'].str.replace("U", "T", regex=False)
o_sativa.head(2)

(577, 2)


Unnamed: 0,tag,data
0,C533,TGACAGAAGAGAGTGAGCAC
1,C226,GCTCACTCTCTATCTGTCAGC


In [42]:
temp

Unnamed: 0,qseqid,sseqid,qstart,qend,sstart,send,qseq,sseq,evalue,bitscore,...,ppos,frames,qframe,sframe,sstrand,qcovs,qcovhsp,qlen,slen,Nonconformity
0,C533,C3879,1,20,2,21,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.900000e-06,37.4,...,100.00,1/1,1,1,plus,100,100,20,21,0
1,C533,C1612,1,20,3,22,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.900000e-06,37.4,...,100.00,1/1,1,1,plus,100,100,20,22,0
2,C533,C3526,1,20,1,20,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.900000e-06,37.4,...,100.00,1/1,1,1,plus,100,100,20,21,0
3,C533,C3375,1,20,2,21,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.900000e-06,37.4,...,100.00,1/1,1,1,plus,100,100,20,21,0
4,C533,C2439,1,20,1,20,TGACAGAAGAGAGTGAGCAC,TGACAGAAGAGAGTGAGCAC,1.900000e-06,37.4,...,100.00,1/1,1,1,plus,100,100,20,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25841,C206,C2055,2,22,2,22,TTGTTTTTCTCCAATATCTCA,TTGATTTCCTCCAATATCTCA,3.630000e-04,30.1,...,90.48,1/1,1,1,plus,95,95,22,22,3
25842,C206,C1724,2,22,2,22,TTGTTTTTCTCCAATATCTCA,TTGGTTTCCTCCAATATCTCA,3.630000e-04,30.1,...,90.48,1/1,1,1,plus,95,95,22,22,3
25843,C206,C1217,2,22,2,22,TTGTTTTTCTCCAATATCTCA,TTGTTTTCCTCTAATATCTCA,3.630000e-04,30.1,...,90.48,1/1,1,1,plus,95,95,22,22,3
26453,C212,C250,1,22,1,21,AATGTTTGTATGGATCGTTTGT,AATGTTTGTA-GAATAGTTTGT,5.400000e-02,23.8,...,86.36,1/1,1,1,plus,100,100,22,24,3


In [39]:
len(temp['qseq'].unique())

446

In [35]:
len(temp[temp['qseq'].isin(o_sativa['data'])]['qseq'].unique())

134

In [231]:
len(set(temp['qseq']).intersection(conf))

42

In [221]:
len(set(temp['qseq']).intersection(not_conf))

24

In [183]:
# Label every row of `selected` as '<name> => <tree>'.
# NOTE(review): `selected` is not created in this part of the notebook — confirm it is
# defined before this cell runs.
selected['full name'] = selected['name'] + " => "+selected['tree']

In [80]:
# Map every distinct query sequence in the BLAST hits to the unique 'full name'
# labels of the `selected` rows whose 'data' equals that sequence. A sequence with
# no matching row maps to an empty array. The comprehension keeps loop temporaries
# out of the notebook namespace.
conf2organism = {
    seq: selected.loc[selected['data'].isin([seq]), 'full name'].unique()
    for seq in temp['qseq'].unique()
}

In [83]:
# Dump the mapping as plain text: each sequence on its own line, then one line per
# associated name, then a blank line that closes the entry.
with open('./out.txt','w') as file:
    for seq, names in conf2organism.items():
        # Write entry by entry instead of growing one large string with +=,
        # which re-copies the accumulated text on every append.
        file.write("".join(line + "\n" for line in (seq, *names)) + "\n")

# query O.sativa, subject Level1 hit seq

In [15]:
# Load result_level1_filter.csv from result_path and display its (rows, columns) shape.
result = pd.read_csv(f"{result_path}/result_level1_filter.csv")
result.shape

(82318, 134)

In [16]:
#577 mir O.s unique, 63 conf unique
#level1 92 mir O.s unique,  33 conf unique

In [17]:
# Build the BLAST subject records: tag = '<seq name><ct name>', data = hit sequence.
result['tag'] = (result['seq name'] + result['ct name'])
result['data'] = result['hit seq']
# df_to_fasta is defined elsewhere in the notebook; the file is later indexed by
# makeblastdb even though it carries a .csv extension.
df_to_fasta(result[['tag', 'data']], f"{temp_path}/filter1_to_blast.csv")

In [18]:
# query: the BLASTn_O_Sativa sequences, subject: the level-1 records in filter1_to_blast.csv
# NOTE: `Blast` must already be defined when this cell runs; the saved output of this
# cell is a NameError from running it before the definition was executed.
output = f"{temp_path}/blast1"
query = f"{temp_path}/BLASTn_O_Sativa"
subject = f"{temp_path}/filter1_to_blast.csv"
Blast(output=output,query=query,subject=subject)

NameError: name 'Blast' is not defined

In [30]:
getBlast(output)

NC:0  result: (92,)
NC:3  result: (152,)


# query O.sativa, subject Level1 precursor

In [31]:
# Load result_level1_filter.csv from result_path and display its (rows, columns) shape.
result = pd.read_csv(f"{result_path}/result_level1_filter.csv")
result.shape

(82318, 134)

In [32]:
# Build the BLAST subject records: tag = '<seq name><ct name>', data = precursor sequence.
result['tag'] = (result['seq name'] + result['ct name'])
result['data'] = result['precursor seq']
# df_to_fasta is defined elsewhere in the notebook; the file is later indexed by
# makeblastdb even though it carries a .csv extension.
df_to_fasta(result[['tag', 'data']], f"{temp_path}/filter1_to_blast.csv")

In [33]:
# query: BLASTn_O.sativa.fasta, subject: the level-1 records in filter1_to_blast.csv
# NOTE(review): other cells in this notebook read the queries from BLASTn_O_Sativa
# instead — confirm both files hold the same sequences.
output = f"{temp_path}/blast1"
query = f"{temp_path}/BLASTn_O.sativa.fasta"
subject = f"{temp_path}/filter1_to_blast.csv"
Blast(output=output,query=query,subject=subject)



Building a new DB, current time: 05/31/2022 21:05:21
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
New DB title:  Experiment/O.sativa_Test/Temp/filter1_to_blast.csv
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 82318 sequences in 3.70684 seconds.


In [34]:
getBlast(output)

NC:0  result: (122,)
NC:3  result: (225,)


# query O.sativa, subject Level2 hit

In [106]:
# Load result_level2_filter.csv from result_path and display its (rows, columns) shape.
result = pd.read_csv(f"{result_path}/result_level2_filter.csv")
result.shape

(9946, 134)

In [107]:
len(result['hit seq'].unique())

951

In [108]:
sum(pd.Series(result['hit seq'].unique()).isin(pd.Series(confidence)))

19

In [36]:
# Build the BLAST subject records: tag = '<seq name><ct name>', data = hit sequence.
result['tag'] = (result['seq name'] + result['ct name'])
result['data'] = result['hit seq']
# df_to_fasta is defined elsewhere in the notebook; the file is later indexed by
# makeblastdb even though it carries a .csv extension.
df_to_fasta(result[['tag', 'data']], f"{temp_path}/filter2_to_blast.csv")

In [37]:
# query: BLASTn_O.sativa.fasta, subject: the level-2 records in filter2_to_blast.csv
# The hits go to the same {temp_path}/blast1 path that the other searches in this notebook use.
output = f"{temp_path}/blast1"
query = f"{temp_path}/BLASTn_O.sativa.fasta"
subject = f"{temp_path}/filter2_to_blast.csv"
Blast(output=output,query=query,subject=subject)



Building a new DB, current time: 05/31/2022 21:05:46
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
New DB title:  Experiment/O.sativa_Test/Temp/filter2_to_blast.csv
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 9946 sequences in 0.427442 seconds.


In [38]:
getBlast(output)

NC:0  result: (40,)
NC:3  result: (99,)


# query O.sativa, subject Level2 precursor

In [51]:
# Load result_level2_filter.csv from result_path and display its (rows, columns) shape.
result = pd.read_csv(f"{result_path}/result_level2_filter.csv")
result.shape

(9946, 134)

In [40]:
# Build the BLAST subject records: tag = '<seq name><ct name>', data = precursor sequence.
result['tag'] = (result['seq name'] + result['ct name'])
result['data'] = result['precursor seq']
# df_to_fasta is defined elsewhere in the notebook; the file is later indexed by
# makeblastdb even though it carries a .csv extension.
df_to_fasta(result[['tag', 'data']], f"{temp_path}/filter2_to_blast.csv")

In [41]:
# query: BLASTn_O.sativa.fasta, subject: the level-2 records in filter2_to_blast.csv
# The hits go to the same {temp_path}/blast1 path that the other searches in this notebook use.
output = f"{temp_path}/blast1"
query = f"{temp_path}/BLASTn_O.sativa.fasta"
subject = f"{temp_path}/filter2_to_blast.csv"
Blast(output=output,query=query,subject=subject)



Building a new DB, current time: 05/31/2022 21:05:53
New DB name:   /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
New DB title:  Experiment/O.sativa_Test/Temp/filter2_to_blast.csv
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/jupyter/plant_microRNA_prediction/Experiment/O.sativa_Test/Temp/blastn_database
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 9946 sequences in 0.447794 seconds.


In [42]:
getBlast(output)

NC:0  result: (52,)
NC:3  result: (131,)
