# Import

In [267]:
from __future__ import division
import warnings
warnings.filterwarnings('ignore')
import sys
from numpy.random import randint
from numpy.random import rand
import multiprocessing as mp
import random
import math
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from tqdm.contrib.concurrent import process_map
sys.path.append("./src/")
from filter2 import convert, filter2_run
import os

In [268]:
def df_to_fasta(df, path):
    """Write the rows of *df* to *path* as FASTA records.

    Every row becomes a header line ``>{tag}`` followed by a sequence line
    ``{data}``.

    Args:
        df: DataFrame providing a ``tag`` column and a ``data`` column.
        path: Destination file; it is created or overwritten.
    """
    # Iterate the two columns directly rather than using
    # ``df.apply(..., axis=1)`` with a side-effecting lambda: on an empty
    # frame ``DataFrame.apply`` probes the callback with a placeholder row of
    # NaN values, which used to emit a bogus ``>nan`` record.
    records = [f">{tag}\n{data}\n" for tag, data in zip(df['tag'], df['data'])]
    with open(path, 'w') as file:
        file.write(''.join(records))

In [269]:
class DotDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Reading an attribute that is not a key yields ``None`` (the behaviour of
    ``dict.get``); deleting one that is not a key raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]

In [270]:
# Name of the experiment and the folder that contains all experiments.
experiment = "O.sativa_Test"
experiment_dir = "Experiment"
# Both working folders live below <experiment_dir>/<experiment>.
temp_path = "/".join([experiment_dir, experiment, "Temp"])
result_path = "/".join([experiment_dir, experiment, "Result"])

# Algorithms

## PSO

In [263]:
class Particle:
    """A single particle of the swarm.

    Positions are clamped to ``[0, 1]`` in every dimension (see
    ``update_position``); initial velocities are drawn uniformly from
    ``[-1, 1]``.
    """

    def __init__(self, x0, num_dimensions):
        """Create a particle at *x0* with a random initial velocity.

        Args:
            x0: Initial coordinates; the first ``num_dimensions`` entries
                are copied.
            num_dimensions: Number of coordinates of the search space.
        """
        self.num_dimensions = num_dimensions
        self.position_i = []          # particle position
        self.velocity_i = []          # particle velocity
        self.pos_best_i = []          # best position individual
        self.err_best_i = -1          # best error individual (-1 until evaluated)
        self.err_i = -1               # error individual (-1 until evaluated)
        # Explicit "has a personal best" flag: comparing ``err_best_i`` with
        # -1 is ambiguous because a cost function may legitimately return -1.
        self._has_best = False

        for i in range(num_dimensions):
            self.velocity_i.append(random.uniform(-1, 1))
            self.position_i.append(x0[i])

    # evaluate current fitness
    def evaluate(self, costFunc):
        """Compute the cost of the current position and update the personal best.

        Args:
            costFunc: Callable taking the position list and returning a
                comparable cost (lower is better).
        """
        self.err_i = costFunc(self.position_i)

        # check to see if the current position is an individual best
        if not self._has_best or self.err_i < self.err_best_i:
            # Store a snapshot. ``update_position`` mutates ``position_i`` in
            # place, so keeping a reference would make the personal best
            # follow the particle and cancel the cognitive velocity term.
            self.pos_best_i = list(self.position_i)
            self.err_best_i = self.err_i
            self._has_best = True

    # update new particle velocity
    def update_velocity(self, pos_best_g):
        """Update the velocity from inertia, personal best and swarm best.

        Args:
            pos_best_g: Best position found by the whole swarm so far.
        """
        w = 0.7         # constant inertia weight (how much to weigh the previous velocity)
        c1 = 1.4        # cognitive constant
        c2 = 1.4        # social constant

        for i in range(self.num_dimensions):
            r1 = random.random()
            r2 = random.random()
            vel_cognitive = c1 * r1 * (self.pos_best_i[i] - self.position_i[i])
            vel_social = c2 * r2 * (pos_best_g[i] - self.position_i[i])
            self.velocity_i[i] = w * self.velocity_i[i] + vel_cognitive + vel_social

    # update the particle position based off new velocity updates
    def update_position(self):
        """Move the particle by its velocity and clamp each coordinate to [0, 1]."""
        for i in range(self.num_dimensions):
            self.position_i[i] = self.position_i[i] + self.velocity_i[i]

            # adjust maximum position if necessary
            if self.position_i[i] > 1:
                self.position_i[i] = 1

            # adjust minimum position if necessary
            if self.position_i[i] < 0:
                self.position_i[i] = 0


def evaluation(particle):
    """Evaluate *particle* with the module-level ``cost_function``.

    The particle's ``evaluate`` method updates it in place; the same object
    is returned so the call can be used in map-style code.
    """
    particle.evaluate(cost_function)
    return particle


class PSO:
    """Particle-swarm minimiser over the unit hypercube ``[0, 1]^num_dimensions``.

    The whole optimisation runs inside ``__init__``. Progress is printed every
    iteration, and the outcome is kept on the instance as ``pos_best_g``
    (best position), ``err_best_g`` (its cost) and ``swarm`` (final particles).
    """

    def __init__(self, costFunc, num_particles, maxiter, num_dimensions):
        """Run the optimisation.

        Args:
            costFunc: Callable ``costFunc(position)`` returning the cost to
                minimise. It is also called once per iteration as
                ``costFunc(position, show=True)`` for diagnostics, so it must
                accept a ``show`` keyword.
            num_particles: Number of particles in the swarm.
            maxiter: Number of iterations.
            num_dimensions: Length of every position vector.
        """
        err_best_g = -1        # best cost of the swarm (-1 until one is found)
        pos_best_g = []        # position that produced ``err_best_g``
        # Explicit flag instead of the ``== -1`` sentinel test, which breaks
        # when the cost function can return negative values.
        have_best = False

        # Each particle starts at a uniformly random point of the unit cube.
        swarm = [
            Particle([random.uniform(0, 1) for _ in range(num_dimensions)],
                     num_dimensions)
            for _ in range(num_particles)
        ]

        ############## main loop #######################
        for _ in range(maxiter):
            for particle in swarm:
                # Evaluate with the cost function that was passed in rather
                # than a module-level global, so the argument is honoured.
                particle.evaluate(costFunc)
                if not have_best or particle.err_i < err_best_g:
                    pos_best_g = list(particle.position_i)
                    err_best_g = float(particle.err_i)
                    have_best = True

            # cycle through swarm and update velocities and position
            for particle in swarm:
                particle.update_velocity(pos_best_g)
                particle.update_position()
            print(err_best_g)
            if have_best:
                # Diagnostic re-evaluation of the current best position.
                print(costFunc(pos_best_g, show=True))

        # Keep the outcome available to the caller.
        self.swarm = swarm
        self.pos_best_g = pos_best_g
        self.err_best_g = err_best_g

        # print final results
        print(pos_best_g)
        print(err_best_g)

## GA

In [5]:
class Individual:
    """Candidate solution of the genetic algorithm.

    Holds a ``position`` (one value per entry of ``bounds``) and the ``cost``
    that ``costFunc`` assigns to it.
    """

    def __init__(self, costFunc, bounds, num_dimensions, r_mut, new=True):
        """Store the settings and, unless ``new`` is false, draw and score a position.

        Args:
            costFunc: Callable mapping a position list to its cost.
            bounds: Sequence of ``(low, high)`` pairs, one per dimension.
            num_dimensions: Number of genes visited by ``mutation``.
            r_mut: Per-gene mutation probability.
            new: When false, ``position`` and ``cost`` are left unset
                (``copy`` fills them in afterwards).
        """
        self.costFunc = costFunc
        self.bounds = bounds
        self.num_dimensions = num_dimensions
        self.r_mut = r_mut
        if not new:
            return
        self.random_solution()
        self.evaluate()

    def copy(self):
        """Return an independent individual with the same position and cost."""
        duplicate = Individual(
            self.costFunc, self.bounds, self.num_dimensions, self.r_mut, new=False
        )
        duplicate.position = self.position.copy()
        duplicate.cost = self.cost
        return duplicate

    def random_solution(self):
        """Replace ``position`` with a uniform random draw inside every bound."""
        drawn = []
        for bound in self.bounds:
            drawn.append(random.uniform(bound[0], bound[1]))
        self.position = drawn

    def evaluate(self):
        """Recompute ``cost`` for the current position."""
        self.cost = self.costFunc(self.position)

    def mutation(self):
        """Redraw each gene inside its bound with probability ``r_mut``."""
        for gene in range(self.num_dimensions):
            if not (rand() < self.r_mut):
                continue
            low = self.bounds[gene][0]
            high = self.bounds[gene][1]
            self.position[gene] = random.uniform(low, high)

class Genetic_Algorithm:
    """Genetic-algorithm minimiser built on ``Individual``.

    The optimisation runs inside ``__init__``. Afterwards the best individual
    found is available as ``best`` (also ``best_cost`` / ``best_position``).

    NOTE(review): offspring are produced in worker processes through
    ``process_map``; numpy's global random state is presumably inherited
    unchanged by forked workers, so draws made by different workers may
    coincide. Confirm, and reseed per worker if that matters.
    """

    def create_individual(self, inp):
        """Return a new random, evaluated ``Individual``.

        *inp* is ignored; it only lets the method be mapped over a range.
        """
        return Individual(self.costFunc, self.bounds, self.num_dimensions, self.r_mut)

    def selection(self, scores, k=3):
        """Tournament selection: the lowest-scoring of ``k`` random members.

        Args:
            scores: Cost of each member of ``self.pop``, in the same order.
            k: Tournament size.
        """
        selection_ix = randint(len(self.pop))
        for ix in randint(0, len(self.pop), k-1):
            if scores[ix] < scores[selection_ix]:
                selection_ix = ix
        return self.pop[selection_ix]

    def crossover_one_point(self, p1, p2):
        """Return two children made by swapping the tails of the parents.

        With probability ``1 - r_cross``, or when the parents have fewer than
        two genes, the children are plain copies.
        """
        c1 = p1.copy()
        c2 = p2.copy()
        n_genes = len(p1.position)
        # A cut needs at least one gene on each side.
        if n_genes >= 2 and rand() < self.r_cross:
            # numpy's upper bound is exclusive, so this yields 1 .. n_genes-1.
            # The former ``randint(1, n_genes-2)`` raised for vectors of three
            # or fewer genes and never cut near the end of longer ones.
            pt = randint(1, n_genes)
            c1.position = p1.position[:pt] + p2.position[pt:]
            c2.position = p2.position[:pt] + p1.position[pt:]
        return [c1, c2]

    def crossover_uniform(self, p1, p2):
        """Return two children that swap each gene independently with probability 0.5."""
        c1 = p1.copy()
        c2 = p2.copy()
        for i in range(len(p1.position)):
            if rand() < 0.5:
                c1.position[i] = p2.position[i]
                c2.position[i] = p1.position[i]
        return [c1, c2]

    def apply_operations(self, index):
        """Breed the parents at ``index`` and ``index + 1`` of ``self.selected``.

        One of the two crossover operators is chosen at random; both children
        are then mutated and re-evaluated.
        """
        p1 = self.selected[index]
        # Wrap around so that an odd number of selected parents still yields a
        # complete pair instead of an IndexError.
        p2 = self.selected[(index + 1) % len(self.selected)]
        if rand() > 0.5:
            c1, c2 = self.crossover_one_point(p1, p2)
        else:
            c1, c2 = self.crossover_uniform(p1, p2)
        c1.mutation()
        c2.mutation()
        c1.evaluate()
        c2.evaluate()
        return [c1, c2]

    def _fittest(self, incumbent):
        """Return a copy of the cheapest member of ``self.pop`` if it beats *incumbent*."""
        for individual in self.pop:
            if individual.cost < incumbent.cost:
                incumbent = individual.copy()
        return incumbent

    def __init__(self, costFunc, bounds, num_individual, maxiter, num_dimensions,
                 r_cross, r_mut, max_workers=22, chunksize=5):
        """Run the genetic algorithm.

        Args:
            costFunc: Callable mapping a position list to the cost to minimise.
            bounds: Sequence of ``(low, high)`` pairs, one per dimension.
            num_individual: Population size.
            maxiter: Number of generations.
            num_dimensions: Number of genes per individual.
            r_cross: Probability of applying one-point crossover.
            r_mut: Per-gene mutation probability.
            max_workers: Worker processes handed to ``process_map``.
            chunksize: Chunk size handed to ``process_map``.
        """
        self.costFunc = costFunc
        self.bounds = bounds
        self.num_individual = num_individual
        self.maxiter = maxiter
        self.num_dimensions = num_dimensions
        self.r_cross = r_cross
        self.r_mut = r_mut

        best = self.create_individual(None)

        self.pop = list(process_map(
            self.create_individual, range(num_individual),
            tqdm_class=tqdm, max_workers=max_workers, chunksize=chunksize))

        for gen in tqdm(range(maxiter)):
            scores = [individual.cost for individual in self.pop]
            best = self._fittest(best)

            self.selected = [self.selection(scores) for _ in range(num_individual)]
            self.pop = list()
            offspring = []
            for c1, c2 in process_map(
                    self.apply_operations, range(0, num_individual, 2),
                    tqdm_class=tqdm, max_workers=max_workers, chunksize=chunksize):
                offspring.append(c1)
                offspring.append(c2)
            # An odd population size produces one surplus child; drop it so the
            # population keeps its configured size.
            self.pop = offspring[:num_individual]
            print(best.cost)
            print(best.position)

        # The offspring of the last generation have not been compared yet.
        best = self._fittest(best)

        # ``__init__`` must return None (returning a list raises TypeError),
        # so the result is exposed as attributes instead.
        self.best = best
        self.best_cost = best.cost
        self.best_position = best.position

# Main

In [300]:
# Read the level-1 filter output, then pass every row through `convert`
# (imported from filter2).
level1 = pd.read_csv(f"{result_path}/result_level1_filter.csv")
level1 = level1.apply(convert, axis=1)

In [283]:
# Keep only the rows selected by the 'confidence' column, used here as a
# row mask (presumably boolean; confirm against the CSV).
df = level1[level1['confidence']]

In [152]:
# Search interval (low, high) of every optimised filter parameter, listed in
# the order in which cost_function reads them. Each trailing comment names the
# config key and repeats the value that was noted next to it.
bounds = [
    (-1000, -999),  # delta_g_min: -1000
    (0, 1),         # delta_g_max: 0
    (19, 22),       # hit_len_min: 19
    (21, 26),       # hit_len_max: 24
    (0, 0.5),       # hit_complementarity_percentage_min: 0.3
    (0.5, 1),       # hit_complementarity_percentage_max: 1
    (0, 2),         # number_of_terminal_structure_min: 1
    (0, 6),         # number_of_terminal_structure_max: 1
    (0, 50),        # boi_gc_content_min: 20
    (50, 100),      # boi_gc_content_max: 80
    (0, 50),        # num_of_linking_residues_min: 5
    (20, 1000),     # num_of_linking_residues_max: 80
    (0, 50),        # hit_gc_content_percentage_min: 20
    (50, 100),      # hit_gc_content_percentage_max: 80
    (0, 1.5),       # precursor_mfei_min: 0.5
    (0.5, 5),       # precursor_mfei_max: 3
    (0, 7),         # border_line_mismatch_max: 1
    (0, 7),         # border_line_bulge_max: 0
    (0, 7),         # border_line_internal_max: 0
    (0, 20),        # total_num_of_nonmatching_positions: 5
    (0, 20),        # total_num_of_mismached_positions: 5
    (0, 20),        # total_num_of_positions_in_bulges_and_loops: 3
    (0, 20),        # max_allowed_mismatch_size_in_hit_region: 2
    (0, 20),        # max_allowed_bulge_size_in_hit_region: 1
    (0, 20),        # max_allowed_internal_loop_size_in_hit_region: 0
    (0, 20),        # max_allowed_hsbl_ssbl_size: 4
    (0, 50),        # minimum_required_clear_region: 13
    (0, 10),        # acceptable_num_for_hit_locations_in_bulges_or_loops: 3
    (0, 10),        # acceptable_num_for_unmatched_locations_in_hit_region: 5
    (0, 1),         # delete_if_mature_duplex_involvement_in_apical_loop: "yes"
    (0, 2),         # border_line_structure_allowance: "1 end only"
]

In [1]:
# Reference sequences; cost_function counts how many BLAST-confirmed
# sequences fall inside this set.
confidence = {
    'TGACAGAAGAGAGTGAGCAC',
    'GCTCACTCTCTATCTGTCAGC',
    'GCTCACTTCTCTCTCTGTCAGC',
    'GCTCACTTCTCTTTCTGTCAGC',
    'GCTCGCTCCTCTTTCTGTCAGC',
    'TGCCTGGCTCCCTGTATGCCA',
    'GCGTGCAAGGAGCCAAGCATG',
    'GCGTGCACGGAGCCAAGCATA',
    'GGAATGTTGTCTGGTTCAAGG',
    'TCGGACCAGGCTTCATTCCCC',
    'GGAATGTTGTCTGGCTCGGGG',
    'GGAATGTTGTCTGGTCCGAG',
    'GGAATGTTGTCTGGCTCGAGG',
    'TGAAGCTGCCAGCATGATCTA',
    'ATCATGCATGACAGCCTCATTT',
    'TTCCACAGCTTTCTTGAACTT',
    'GGTCAAGAAAGCTGTGGGAAG',
    'CGACAGAAGAGAGTGAGCATA',
    'GGTTTGTTGTCTGGCTCGAGG',
    'TCGGACCAGGCTTCAATCCCT',
    'GGATTGTTGTCTGGTTCAAGG',
    'TGAAGCTGCCAGCATGATCTG',
    'AGATCATGTTGCAGCTTCACT',
    'TCGCTTGGTGCAGATCGGGAC',
    'GATCCCGCCTTGCACCAAGTGAAT',
    'TGGTGATAAGGGTGTAGCTCTG',
    'TAGCCAAGGATGACTTGCCTG',
    'TGAGTCGCTCTTATCACTCATG',
    'GGATATTGGTGCGGTTCAATC',
    'TGATTGAGCCGTGCCAATATC',
    'TGTTGGCCCGGCTCACTCAGA',
    'TGTTGGCTCGGCTCACTCAGA',
    'GGAATGTTGGCTGGCTCGAGG',
    'TCGGACCAGGCTTCATTCCTC',
    'TCCAAAGGGATCGCATTGATCT',
    'TCAGTGCAATCCCTTTGGAAT',
    'CAGGGATGAGGCAGAGCATGG',
    'CTGCACTGCCTCTTCCCTGGC',
    'GCAGCACCATCAAGATTCAC',
    'AGAATCTTGATGATGCTGCAT',
    'AGGTATTGGCGTGCCTCAATC',
    'GGATTGAGCCGCGTCAATATC',
    'AAGCTCAGGAGGGATAGCGCC',
    'CGCTATCTATCCTGAGCTCC',
    'TCCACAGGCTTTCTTGAACTG',
    'ATGGTTCAAGAAAGCCCATGGAAA',
    'GCTAGAGGTGGCAACTGCATA',
    'TGCAGTTGCTGCCTCAAGCTT',
    'TTGCTGCCTCAAGCTTGCTGC',
    'TAGGATTCAATCCTTGCTGCT',
    'CAGCAAGAACTGGATCTTAAT',
    'GTAATATACTAATCCGTGCAT',
    'GTTGCACGGGTTTGTATGTTG',
    'TAGCCAAGGATGATTTGCCTG',
    'TGGCAAGTCTCCTCGGCTACC',
    'TCTCCACAGGCTTTCTTGAACT',
    'ATAGTTCAAGAAAGTCCTTGGAAA',
    'TCTCTCTCTCCCTTGAAGGC',
    'CTTCGGGGGAGGAGAGAAGC',
    'AATCGACGGCCTCAGTCAGGG',
    'CTGGCCGAGGCCGTCGATTCT',
    'AGCTTCTGACAGCTGCAGTTTCTC',
    'AGAAGCTGCAGCTGTCAGAAGCTC',
}

63

In [265]:
def Blast(output, subject, query):
    """Build a nucleotide BLAST database from *subject* and align *query* to it.

    Args:
        output: Path of the tabular (``-outfmt 6``) result file written by
            ``blastn``. Previously this argument was ignored and the file was
            always written to ``./{temp_path}/blast_temp``.
        subject: FASTA file turned into the database
            ``./{temp_path}/blastn_database``.
        query: FASTA file containing the query sequences.

    Raises:
        subprocess.CalledProcessError: If ``makeblastdb`` or ``blastn`` exits
            with a non-zero status. Failing loudly avoids silently scoring a
            stale result file.
    """
    import subprocess  # local import: only this function needs it

    database = f"./{temp_path}/blastn_database"
    outfmt = ('6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore '
              'score length pident nident mismatch positive gapopen gaps ppos '
              'frames qframe sframe sstrand qcovs qcovhsp qlen slen')

    # Argument lists (no shell) keep paths with spaces or shell metacharacters
    # from being re-interpreted.
    subprocess.run(
        ["makeblastdb", "-in", subject, "-dbtype", "nucl", "-out", database],
        check=True,
    )
    subprocess.run(
        [
            "blastn",
            "-query", query,
            "-out", output,
            "-num_threads", str(mp.cpu_count()),
            "-db", database,
            "-word_size", "7",
            "-penalty", "-3",
            "-reward", "2",
            "-gapopen", "5",
            "-gapextend", "2",
            "-outfmt", outfmt,
        ],
        check=True,
    )
    
    
def getBlast(path):
    """Return the distinct query sequences that align perfectly on the plus strand.

    A row is kept when its ``Nonconformity`` is at most zero, i.e. when
    ``qlen - (|qend - qstart| + 1) + gaps + mismatch <= 0``.

    Args:
        path: Tab-separated BLAST result file with the 27 columns listed in
            ``header`` and no header row.

    Returns:
        Array of the unique ``qseq`` values of the kept rows; empty when the
        file contains no rows.
    """
    header = 'qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'
    try:
        df_blastn = pd.read_csv(path, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty result file (e.g. no alignment reported) is not an error.
        return pd.Series([], dtype=object).unique()
    df_blastn.columns = header.split()

    # Work on a copy so that adding a column does not write into a slice.
    plus = df_blastn[df_blastn['sstrand'] == "plus"].copy()
    aligned_len = (plus['qend'] - plus['qstart']).abs() + 1
    plus['Nonconformity'] = plus['qlen'] - aligned_len + plus['gaps'] + plus['mismatch']
    return plus.loc[plus['Nonconformity'] <= 0, "qseq"].unique()
    
def cost_function(solution, alpha=0.92, beta=0.45, show=False):
    """Score a normalised parameter vector; the optimisers minimise this value.

    Each entry of *solution* is scaled linearly into the matching interval of
    the module-level ``bounds`` and turned into a filter configuration. The
    configuration is applied to ``level1`` with ``filter2_run``, the surviving
    hits are written as FASTA, BLASTed against the query file, and the cost is

        (1000 - matched) * alpha + (1 - alpha) * fp + (1000 - confident) * beta

    where ``matched`` is the number of sequences returned by ``getBlast``,
    ``confident`` is how many of them are in ``confidence`` and ``fp`` is the
    number of distinct hit sequences not among the matched ones.

    Args:
        solution: One value per entry of ``bounds`` (0 maps to the lower end
            of the interval, 1 to the upper end).
        alpha: Weight of the missing-match term; ``1 - alpha`` weighs ``fp``.
        beta: Weight of the missing-confidence term.
        show: When true, print the counts, the configuration and *solution*.

    Returns:
        The cost, or ``10**6`` when the filter leaves no rows.
    """
    scaled = [
        interval[0] + (solution[idx] * (interval[1] - interval[0]))
        for idx, interval in enumerate(bounds)
    ]
    yes_no = ("NO", "YES")
    allowance = ("NOT ACCEPTED", "1 END ONLY", "2 SIDE")
    config = DotDict({
        "delta_g_min": scaled[0],
        "delta_g_max": scaled[1],
        "hit_len_min": round(scaled[2]),
        "hit_len_max": round(scaled[3]),
        "hit_complementarity_percentage_min": scaled[4],
        "hit_complementarity_percentage_max": scaled[5],
        "number_of_terminal_structure_min": round(scaled[6]),
        "number_of_terminal_structure_max": round(scaled[7]),
        "boi_gc_content_min": round(scaled[8]),
        "boi_gc_content_max": round(scaled[9]),
        "num_of_linking_residues_min": round(scaled[10]),
        "num_of_linking_residues_max": round(scaled[11]),
        "hit_gc_content_percentage_min": round(scaled[12]),
        "hit_gc_content_percentage_max": round(scaled[13]),
        "precursor_mfei_min": scaled[14],
        "precursor_mfei_max": scaled[15],
        "border_line_mismatch_max": round(scaled[16]),
        "border_line_bulge_max": round(scaled[17]),
        "border_line_internal_max": round(scaled[18]),
        "total_num_of_nonmatching_positions": round(scaled[19]),
        "total_num_of_mismached_positions": round(scaled[20]),
        "total_num_of_positions_in_bulges_and_loops": round(scaled[21]),
        "max_allowed_mismatch_size_in_hit_region": round(scaled[22]),
        "max_allowed_bulge_size_in_hit_region": round(scaled[23]),
        "max_allowed_internal_loop_size_in_hit_region": round(scaled[24]),
        "max_allowed_hsbl_ssbl_size": round(scaled[25]),
        "minimum_required_clear_region": round(scaled[26]),
        "acceptable_num_for_hit_locations_in_bulges_or_loops": round(scaled[27]),
        "acceptable_num_for_unmatched_locations_in_hit_region": round(scaled[28]),
        "delete_if_mature_duplex_involvement_in_apical_loop": yes_no[round(scaled[29])],
        "border_line_structure_allowance": allowance[round(scaled[30])],
    })

    result = filter2_run(level1.copy(), config)
    if result.shape[0] == 0:
        # Nothing survived the filter: return a large fixed penalty.
        return 10**6

    subject = f"./{temp_path}/filter_level2_temp.csv"
    output = f"./{temp_path}/blast_temp"
    query = f"./{temp_path}/BLASTn_O.sativa.fasta"

    # Export the surviving hits as FASTA (header: seq name + ct name).
    result['tag'] = result['seq name'] + result['ct name']
    result['data'] = result['hit seq']
    df_to_fasta(result[['tag', 'data']], subject)

    Blast(output=output, query=query, subject=subject)
    matched = set(getBlast(output))
    confident = len(matched.intersection(confidence))
    fp = len(set(result['hit seq']) - matched)
    if show:
        print(f'conf: {confident}, find: {len(matched)}, fp: {fp}')
        print(config)
        print(solution)
    return (1000 - len(matched)) * alpha + (1-alpha) * fp + (1000 - confident) * beta

In [297]:
# ---- new: revised Blast / getBlast / cost_function; these definitions replace the ones above ----
def Blast(output, subject, query):
    """Build a nucleotide BLAST database from *subject* and align *query* to it.

    Args:
        output: Path of the tabular (``-outfmt 6``) result file written by
            ``blastn``. Previously this argument was ignored and the file was
            always written to ``./{temp_path}/blast_temp``.
        subject: FASTA file turned into the database
            ``./{temp_path}/blastn_database``.
        query: FASTA file containing the query sequences.

    Raises:
        subprocess.CalledProcessError: If ``makeblastdb`` or ``blastn`` exits
            with a non-zero status. Failing loudly avoids silently scoring a
            stale result file.
    """
    import subprocess  # local import: only this function needs it

    database = f"./{temp_path}/blastn_database"
    outfmt = ('6 qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore '
              'score length pident nident mismatch positive gapopen gaps ppos '
              'frames qframe sframe sstrand qcovs qcovhsp qlen slen')

    # Argument lists (no shell) keep paths with spaces or shell metacharacters
    # from being re-interpreted.
    subprocess.run(
        ["makeblastdb", "-in", subject, "-dbtype", "nucl", "-out", database],
        check=True,
    )
    subprocess.run(
        [
            "blastn",
            "-query", query,
            "-out", output,
            "-num_threads", str(mp.cpu_count()),
            "-db", database,
            "-word_size", "7",
            "-penalty", "-3",
            "-reward", "2",
            "-gapopen", "5",
            "-gapextend", "2",
            "-outfmt", outfmt,
        ],
        check=True,
    )
    
    
def getBlast(path):
    """Return the distinct query sequences that align perfectly on the plus strand.

    A row is kept when its ``Nonconformity`` is at most zero, i.e. when
    ``qlen - (|qend - qstart| + 1) + gaps + mismatch <= 0``.

    Args:
        path: Tab-separated BLAST result file with the 27 columns listed in
            ``header`` and no header row.

    Returns:
        Array of the unique ``qseq`` values of the kept rows; empty when the
        file contains no rows.
    """
    header = 'qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen'
    try:
        df_blastn = pd.read_csv(path, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty result file (e.g. no alignment reported) is not an error.
        return pd.Series([], dtype=object).unique()
    df_blastn.columns = header.split()

    # Work on a copy so that adding a column does not write into a slice.
    plus = df_blastn[df_blastn['sstrand'] == "plus"].copy()
    aligned_len = (plus['qend'] - plus['qstart']).abs() + 1
    plus['Nonconformity'] = plus['qlen'] - aligned_len + plus['gaps'] + plus['mismatch']
    return plus.loc[plus['Nonconformity'] <= 0, "qseq"].unique()
    
def cost_function(solution, alpha=0.925, show=False):
    """Score a normalised parameter vector; the optimisers minimise this value.

    Each entry of *solution* is scaled linearly into the matching interval of
    the module-level ``bounds`` and turned into a configuration. In this
    version the configuration is only printed: the ``filter2_run`` call is
    bypassed and the module-level ``df`` is scored as it is. Its hit sequences
    are written as FASTA, BLASTed against the query file, and the cost is

        -1 * (matched * alpha) + (1 - alpha) * fp

    where ``matched`` is the number of sequences returned by ``getBlast`` and
    ``fp`` is the number of distinct hit sequences not among them.

    Args:
        solution: One value per entry of ``bounds`` (0 maps to the lower end
            of the interval, 1 to the upper end).
        alpha: Weight of the matched term; ``1 - alpha`` weighs ``fp``.
        show: When true, print the counts, the configuration and *solution*.

    Returns:
        The cost, or ``10**6`` when ``df`` has no rows.
    """
    scaled = [
        interval[0] + (solution[idx] * (interval[1] - interval[0]))
        for idx, interval in enumerate(bounds)
    ]
    yes_no = ("NO", "YES")
    allowance = ("NOT ACCEPTED", "1 END ONLY", "2 SIDE")
    config = DotDict({
        "delta_g_min": scaled[0],
        "delta_g_max": scaled[1],
        "hit_len_min": round(scaled[2]),
        "hit_len_max": round(scaled[3]),
        "hit_complementarity_percentage_min": scaled[4],
        "hit_complementarity_percentage_max": scaled[5],
        "number_of_terminal_structure_min": round(scaled[6]),
        "number_of_terminal_structure_max": round(scaled[7]),
        "boi_gc_content_min": round(scaled[8]),
        "boi_gc_content_max": round(scaled[9]),
        "num_of_linking_residues_min": round(scaled[10]),
        "num_of_linking_residues_max": round(scaled[11]),
        "hit_gc_content_percentage_min": round(scaled[12]),
        "hit_gc_content_percentage_max": round(scaled[13]),
        "precursor_mfei_min": scaled[14],
        "precursor_mfei_max": scaled[15],
        "border_line_mismatch_max": round(scaled[16]),
        "border_line_bulge_max": round(scaled[17]),
        "border_line_internal_max": round(scaled[18]),
        "total_num_of_nonmatching_positions": round(scaled[19]),
        "total_num_of_mismached_positions": round(scaled[20]),
        "total_num_of_positions_in_bulges_and_loops": round(scaled[21]),
        "max_allowed_mismatch_size_in_hit_region": round(scaled[22]),
        "max_allowed_bulge_size_in_hit_region": round(scaled[23]),
        "max_allowed_internal_loop_size_in_hit_region": round(scaled[24]),
        "max_allowed_hsbl_ssbl_size": round(scaled[25]),
        "minimum_required_clear_region": round(scaled[26]),
        "acceptable_num_for_hit_locations_in_bulges_or_loops": round(scaled[27]),
        "acceptable_num_for_unmatched_locations_in_hit_region": round(scaled[28]),
        "delete_if_mature_duplex_involvement_in_apical_loop": yes_no[round(scaled[29])],
        "border_line_structure_allowance": allowance[round(scaled[30])],
    })

    # The level-2 filter is not applied in this version; the pre-selected
    # rows in ``df`` are scored directly.
    result = df.copy()
    if result.shape[0] == 0:
        return 10**6

    subject = f"./{temp_path}/filter_level2_temp.csv"
    output = f"./{temp_path}/blast_temp"
    query = f"./{temp_path}/BLASTn_O_Sativa"

    # Export the rows as FASTA (header: seq name + ct name).
    result['tag'] = result['seq name'] + result['ct name']
    result['data'] = result['hit seq']
    df_to_fasta(result[['tag', 'data']], subject)

    Blast(output=output, query=query, subject=subject)
    matched = set(getBlast(output))
    fp = len(set(result['hit seq']) - matched)
    if show:
        print(f'conf:{len(matched)}, fp: {fp}')
        print(config)
        print(solution)
    return -1 * (len(matched) * alpha) + (1-alpha) * fp

In [None]:
# Run the particle-swarm search on cost_function: 250 particles, 100
# iterations, 31 dimensions (one per entry of `bounds`). The whole run happens
# inside the constructor, which prints its progress and the final best.
PSO(cost_function , num_particles = 250, maxiter=100, num_dimensions=31)

# Fine-tuning

In [None]:
def get_junction_distance(data, dist, thresh_bulge, thresh_loop):
    """Return the smallest distance among qualifying structures and *dist*.

    A ``'loop'`` entry qualifies when its size is at least *thresh_loop*, a
    ``'bulge'`` entry when its size is at least *thresh_bulge*; entries of any
    other type are ignored. *dist* always takes part in the minimum.

    Args:
        data: Iterable of mappings with ``'type'``, ``'size'`` and ``'dist'``.
        dist: Fallback distance, returned when nothing qualifies closer.
        thresh_bulge: Minimum size of a bulge to be considered.
        thresh_loop: Minimum size of a loop to be considered.
    """
    candidates = []
    for entry in data:
        kind = entry['type']
        if kind == 'loop':
            # NOTE(review): loop sizes go through eval(), so they may be
            # expressions; never feed this untrusted text.
            qualifies = float(eval(entry['size'])) >= thresh_loop
        elif kind == 'bulge':
            qualifies = float(entry['size']) >= thresh_bulge
        else:
            qualifies = False
        if qualifies:
            candidates.append(float(entry['dist']))
    # The fallback goes last so that ties resolve as before.
    candidates.append(dist)
    return min(candidates)


def _convert_hr(x):
    """Parse the textual form of a list of structure descriptions.

    *x* is evaluated to a list of strings shaped like
    ``"<type>=dist:<dist>, size:<size>"``; each becomes a dict with the keys
    ``'type'``, ``'dist'`` and ``'size'`` (all values stay strings).

    NOTE(review): the input goes through eval(); only use it on trusted data.
    """
    return [
        {
            'type': item.split("=")[0],
            'dist': item.split("dist:")[1].split(', ')[0],
            'size': item.split("size:")[1],
        }
        for item in eval(x)
    ]

# Fixed filter settings evaluated in the fine-tuning run below.
config = {
    # free energy
    "delta_g_min": -999,
    "delta_g_max": 1,
    # hit length and complementarity
    "hit_len_min": 21,
    "hit_len_max": 21,
    "hit_complementarity_percentage_min": 0.5,
    "hit_complementarity_percentage_max": 1.0,
    # terminal structures, GC content, linking residues
    "number_of_terminal_structure_min": 0,
    "number_of_terminal_structure_max": 5,
    "boi_gc_content_min": 45,
    "boi_gc_content_max": 94,
    "num_of_linking_residues_min": 5,
    "num_of_linking_residues_max": 159,
    "hit_gc_content_percentage_min": 37,
    "hit_gc_content_percentage_max": 86,
    # precursor MFEI
    "precursor_mfei_min": 0.87,
    "precursor_mfei_max": 1.3695556794836854,
    # border-line structures
    "border_line_mismatch_max": 0,
    "border_line_bulge_max": 0,
    "border_line_internal_max": 0,
    # mismatch / bulge / loop budgets
    "total_num_of_nonmatching_positions": 5,
    "total_num_of_mismached_positions": 5,
    "total_num_of_positions_in_bulges_and_loops": 2,
    "max_allowed_mismatch_size_in_hit_region": 2,
    "max_allowed_bulge_size_in_hit_region": 1,
    "max_allowed_internal_loop_size_in_hit_region": 3,
    "max_allowed_hsbl_ssbl_size": 2,
    "minimum_required_clear_region": 0,
    "acceptable_num_for_hit_locations_in_bulges_or_loops": 2,
    "acceptable_num_for_unmatched_locations_in_hit_region": 5,
    # categorical switches
    "delete_if_mature_duplex_involvement_in_apical_loop": "YES",
    "border_line_structure_allowance": "NOT ACCEPTED",
}


# Work on a copy of the complete level-1 table (this rebinds `df`).
df = level1.copy()

'''
effective_bulge_size_in_Hit_vicinity_regions = 6
effective_internal_loop_size_in_Hit_vicinity_regions = 8

df["distal"] = df["distal distance"].apply(lambda x : _convert_hr(x))
df["proximal"] = df["proximal distance"].apply(lambda x : _convert_hr(x))


def junc_distal(row):
    return get_junction_distance(row['distal'], row['base structure corrected length'], effective_bulge_size_in_Hit_vicinity_regions, effective_internal_loop_size_in_Hit_vicinity_regions)      

def junc_prox(row):
    return get_junction_distance(row['proximal'], row['primary stem corrected length'], effective_bulge_size_in_Hit_vicinity_regions, effective_internal_loop_size_in_Hit_vicinity_regions)      

df["Loop distal junction distance"] = df[['distal','base structure corrected length']].apply(lambda row: junc_distal(row), axis=1)
df["Loop proximal junction distance"] = df[['proximal', 'primary stem corrected length']].apply(lambda row: junc_prox(row), axis=1)
'''


# Wrap the fixed settings for attribute access and run the level-2 filter once.
config = DotDict(config)
result = filter2_run(df, config)
    
# Files used for the BLAST round trip.
subject = f"./{temp_path}/filter_level2_temp.csv"
output  = f"./{temp_path}/blast_temp"
query   = f"./{temp_path}/BLASTn_O.sativa.fasta"    
    
# Export the filtered hits as FASTA (header: seq name + ct name).
result['tag'] = (result['seq name'] + result['ct name'])        
result['data'] = result['hit seq']
df_to_fasta(result[['tag', 'data']], subject)        
    
# Align the query file against the exported hits and collect the sequences
# that getBlast reports.
Blast(output=output, query=query, subject=subject)
out = set(getBlast(output))    
# NOTE(review): `missing` and `missing_conf` are computed but never used
# below; 92 and 33 are hard-coded totals - confirm what they refer to.
missing = 92 - len(out)    
find_conf = len(out.intersection(confidence))    
missing_conf = 33 - find_conf
# Distinct hit sequences that are not among the reported ones.
fp = len(set(result['hit seq']) - out)
print(out.intersection(confidence))
print()
print()
print(f'conf: {find_conf}, find: {len(out)}, fp: {fp}')
print(config)