# Imports

In [1]:
from __future__ import division
import warnings
warnings.filterwarnings('ignore')
import sys
from numpy.random import randint
from numpy.random import rand
import multiprocessing as mp
import random
import math
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from tqdm.contrib.concurrent import process_map
sys.path.append("./src/")
from filter2 import convert, filter2_run
import os

# Configs & paths

In [2]:
# Name of the experiment and the root folder that holds all experiments.
experiment = "O.sativa_Test"
experiment_dir = "Experiment"

# Per-experiment working folders: <experiment_dir>/<experiment>/{Temp,Result}.
temp_path = "/".join((experiment_dir, experiment, "Temp"))
result_path = "/".join((experiment_dir, experiment, "Result"))

# Functions

In [3]:
def df_to_fasta(df, path):
    """Write a table of sequences to ``path`` in FASTA format.

    Every row becomes one record: a header line ``>{tag}`` followed by one
    line containing ``data``. Records are written in the frame's row order
    and any existing file at ``path`` is overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'tag'`` and a ``'data'`` column.
    path : str or path-like
        Destination file.

    Raises
    ------
    KeyError
        If a non-empty ``df`` lacks the ``'tag'`` or ``'data'`` column.
    """
    if df.empty:
        # An empty frame (no rows or no columns) yields an empty file.
        text = ''
    else:
        # Walk the two columns directly rather than calling ``DataFrame.apply``
        # for its side effects: apply is not guaranteed to invoke the callback
        # exactly once per row, and a row-wise Series coerces mixed numeric
        # columns to a common dtype (an int tag would be rendered as "1.0").
        text = ''.join(
            f">{tag}\n{data}\n" for tag, data in zip(df['tag'], df['data'])
        )
    # The text is built before opening so a failure above cannot truncate
    # an existing file.
    with open(path, 'w') as file:
        file.write(text)

In [7]:
def fasta_to_df(path):
    """Read a FASTA file into a DataFrame with ``tag`` and ``data`` columns.

    A line starting with ``>`` opens a new record; its text without the
    leading ``>`` becomes ``tag``. All following non-empty lines up to the
    next header are concatenated (no separator) into ``data``. Empty lines
    are skipped, and any text that appears before the first header is
    discarded.

    Parameters
    ----------
    path : str or path-like
        FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        One row per header, in file order.
    """
    with open(path, 'r') as handle:
        content = handle.read()

    headers = []
    sequences = []
    pieces = []  # sequence fragments collected since the last header
    for record_line in content.split('\n'):
        if not record_line:
            continue
        if record_line[0] != '>':
            pieces.append(record_line)
            continue
        # A header closes whatever was collected so far. For the first header
        # that is the pre-header text, which is dropped below.
        headers.append(record_line[1:])
        sequences.append(''.join(pieces))
        pieces = []
    # Flush the sequence belonging to the last header.
    sequences.append(''.join(pieces))

    # sequences[0] is the text seen before the first header, not a record.
    return pd.DataFrame({'tag': headers, 'data': sequences[1:]})

In [4]:
class DotDict(dict):
    """A ``dict`` whose keys can also be read, set and deleted as attributes.

    Reading an attribute that is not a key returns ``None`` (it does not
    raise), while deleting one raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; missing keys
        # yield None.
        return dict.get(self, name)

    def __setattr__(self, name, value):
        # Attribute assignment stores into the mapping itself.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the key; KeyError if it is absent.
        dict.__delitem__(self, name)

# Load dataset

In [5]:
# Read the level-1 filter results and pass every row through `convert`
# (imported from filter2) in one chained expression.
level1 = pd.read_csv(f"{result_path}/result_level1_filter.csv").apply(convert, axis=1)

In [12]:
# Parse the FASTA file from the Temp folder into a tag/data table.
o_sativa = fasta_to_df(f"{temp_path}/BLASTn_O_Sativa")
print(o_sativa.shape)
# Rewrite every "U" as "T" in the sequences (literal replacement, no regex).
o_sativa['data'] = o_sativa['data'].str.replace("U", "T", regex=False)
o_sativa.head(2)

(577, 2)


Unnamed: 0,tag,data
0,C533,TGACAGAAGAGAGTGAGCAC
1,C226,GCTCACTCTCTATCTGTCAGC


In [18]:
# Number of distinct 'hit seq' values in level1 that also occur in o_sativa['data'].
level1.loc[level1['hit seq'].isin(o_sativa['data']), 'hit seq'].unique().shape

(90,)

In [13]:
# Hard-coded set of sequences, written as a set literal with one entry per line.
confidence = {
    'TGACAGAAGAGAGTGAGCAC',
    'GCTCACTCTCTATCTGTCAGC',
    'GCTCACTTCTCTCTCTGTCAGC',
    'GCTCACTTCTCTTTCTGTCAGC',
    'GCTCGCTCCTCTTTCTGTCAGC',
    'TGCCTGGCTCCCTGTATGCCA',
    'GCGTGCAAGGAGCCAAGCATG',
    'GCGTGCACGGAGCCAAGCATA',
    'GGAATGTTGTCTGGTTCAAGG',
    'TCGGACCAGGCTTCATTCCCC',
    'GGAATGTTGTCTGGCTCGGGG',
    'GGAATGTTGTCTGGTCCGAG',
    'GGAATGTTGTCTGGCTCGAGG',
    'TGAAGCTGCCAGCATGATCTA',
    'ATCATGCATGACAGCCTCATTT',
    'TTCCACAGCTTTCTTGAACTT',
    'GGTCAAGAAAGCTGTGGGAAG',
    'CGACAGAAGAGAGTGAGCATA',
    'GGTTTGTTGTCTGGCTCGAGG',
    'TCGGACCAGGCTTCAATCCCT',
    'GGATTGTTGTCTGGTTCAAGG',
    'TGAAGCTGCCAGCATGATCTG',
    'AGATCATGTTGCAGCTTCACT',
    'TCGCTTGGTGCAGATCGGGAC',
    'GATCCCGCCTTGCACCAAGTGAAT',
    'TGGTGATAAGGGTGTAGCTCTG',
    'TAGCCAAGGATGACTTGCCTG',
    'TGAGTCGCTCTTATCACTCATG',
    'GGATATTGGTGCGGTTCAATC',
    'TGATTGAGCCGTGCCAATATC',
    'TGTTGGCCCGGCTCACTCAGA',
    'TGTTGGCTCGGCTCACTCAGA',
    'GGAATGTTGGCTGGCTCGAGG',
    'TCGGACCAGGCTTCATTCCTC',
    'TCCAAAGGGATCGCATTGATCT',
    'TCAGTGCAATCCCTTTGGAAT',
    'CAGGGATGAGGCAGAGCATGG',
    'CTGCACTGCCTCTTCCCTGGC',
    'GCAGCACCATCAAGATTCAC',
    'AGAATCTTGATGATGCTGCAT',
    'AGGTATTGGCGTGCCTCAATC',
    'GGATTGAGCCGCGTCAATATC',
    'AAGCTCAGGAGGGATAGCGCC',
    'CGCTATCTATCCTGAGCTCC',
    'TCCACAGGCTTTCTTGAACTG',
    'ATGGTTCAAGAAAGCCCATGGAAA',
    'GCTAGAGGTGGCAACTGCATA',
    'TGCAGTTGCTGCCTCAAGCTT',
    'TTGCTGCCTCAAGCTTGCTGC',
    'TAGGATTCAATCCTTGCTGCT',
    'CAGCAAGAACTGGATCTTAAT',
    'GTAATATACTAATCCGTGCAT',
    'GTTGCACGGGTTTGTATGTTG',
    'TAGCCAAGGATGATTTGCCTG',
    'TGGCAAGTCTCCTCGGCTACC',
    'TCTCCACAGGCTTTCTTGAACT',
    'ATAGTTCAAGAAAGTCCTTGGAAA',
    'TCTCTCTCTCCCTTGAAGGC',
    'CTTCGGGGGAGGAGAGAAGC',
    'AATCGACGGCCTCAGTCAGGG',
    'CTGGCCGAGGCCGTCGATTCT',
    'AGCTTCTGACAGCTGCAGTTTCTC',
    'AGAAGCTGCAGCTGTCAGAAGCTC',
}

In [18]:
# Display the dimensions of level1 as the cell output.
level1.shape

(82318, 134)

In [16]:
# Print every column label of level1 on its own line; iterating a DataFrame
# yields its column labels.
for c in level1:
    print(c)

Reference miRNA cluster
Reference miRNA IDs
Reference miRNA IDs and species
confidence
seq name
ct name
ct
pdf
hit start
hit end
sign
chromosome
hit position on chromosome
delta G
full seq
hit seq
hit len
flanking GC content
flanking MFEI
hit GC content
complementarity in hit region
hit complementarity percentage
hit self complementarity
continuous pairing
istar min
istar max
mir type
star start
star start msg
star end
star end msg
star seq
num of linking residues
star branching
boi start
boi end
boi seq
boi name
boi GC content
boi dotbracket
boi delta G
boi AMFE
boi MFEI
boi visualization
precursor gc content
precursor dotbracket
precursor delta G
precursor AMFE
precursor MFEI
precursor name
precursor seq
precursor seq visualization
terminal structure range
number of terminal structures
branch#1 start point
branch#1 end point
branch#1 total length
branch#1 apical loop start
branch#1 apical loop end
branch#1 apical loop size
branch#1 stem last residue
branch#1 stem length
branch#2 star