# Imports

In [13]:
from __future__ import division
import warnings
warnings.filterwarnings('ignore')
import sys
from numpy.random import randint
from numpy.random import rand
import multiprocessing as mp
import random
import math
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from tqdm.contrib.concurrent import process_map
sys.path.append("./src/")
from filter2 import convert, filter2_run
import os

# Configs & paths

In [8]:
# Name of the experiment; selects the sub-directory used under `experiment_dir`.
experiment = "O.sativa_Test"
# Root directory (a relative path) that holds every experiment's folder.
experiment_dir = "Experiment"
# Folder of this experiment's intermediate files (the BLASTn FASTA is read from here).
temp_path = f"{experiment_dir}/{experiment}/Temp"
# Folder of this experiment's result files (result_level1_filter.csv is read from here).
result_path = f"{experiment_dir}/{experiment}/Result"

# Functions

In [3]:
def df_to_fasta(df, path):
    """Write the sequences held in ``df`` to ``path`` in FASTA format.

    Each row becomes one record: a ``>`` header line built from the ``tag``
    column followed by a single line holding the ``data`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``'tag'`` and ``'data'``.
    path : str or path-like
        Destination file; it is created or overwritten.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``'tag'`` or ``'data'`` column.
    """
    # Iterate the two columns directly instead of abusing DataFrame.apply for
    # its side effects: apply builds a Series per row (slow) and gives no
    # guarantee that the callback runs exactly once per row.
    records = [
        f">{tag}\n{seq}\n"
        for tag, seq in zip(df['tag'], df['data'])
    ]
    with open(path, 'w') as file:
        file.writelines(records)

In [4]:
def fasta_to_df(path):
    """Read a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    path : str or path-like
        FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``'tag'`` (the header line without
        its leading ``>``) and ``'data'`` (the record's sequence lines joined
        into one string). Empty lines are skipped, and any text before the
        first header is discarded.
    """
    with open(path, 'r') as handle:
        content = handle.read()

    headers = []
    sequences = []
    chunks = None  # sequence pieces of the current record; None until a header is seen
    for raw in content.split('\n'):
        if not raw:
            continue
        if raw.startswith('>'):
            # A new header closes the record that was being collected.
            if chunks is not None:
                sequences.append(''.join(chunks))
            headers.append(raw[1:])
            chunks = []
        elif chunks is not None:
            chunks.append(raw)
    if chunks is not None:
        sequences.append(''.join(chunks))

    return pd.DataFrame({'tag': headers, 'data': sequences})

In [5]:
class DotDict(dict):
    """A ``dict`` whose items can also be read, set and deleted as attributes.

    Attribute access is forwarded to the dictionary items, so:

    * reading an attribute that is not a key yields ``None`` (the ``dict.get``
      default) rather than raising ``AttributeError``;
    * deleting an attribute that is not a key raises ``KeyError``.
    """

    def __getattr__(self, key, default=None):
        # Only reached when normal attribute lookup fails.
        return dict.get(self, key, default)

    def __setattr__(self, key, value):
        dict.__setitem__(self, key, value)

    def __delattr__(self, key):
        dict.__delitem__(self, key)

# Load dataset

In [6]:
# Read the level-1 filter results and pass every row through `convert`
# (imported from the project's filter2 module).
level1 = pd.read_csv(f"{result_path}/result_level1_filter.csv").apply(convert, axis=1)

In [18]:
# Parse the BLASTn FASTA file into a frame with 'tag' and 'data' columns.
o_sativa = fasta_to_df(f"{temp_path}/BLASTn_O_Sativa")
print(o_sativa.shape)
# Rewrite every "U" as "T" in the sequences (literal match, not a regex).
o_sativa['data'] = o_sativa['data'].str.replace("U", "T", regex=False)
o_sativa.head(2)

(577, 2)


Unnamed: 0,tag,data
0,C533,TGACAGAAGAGAGTGAGCAC
1,C226,GCTCACTCTCTATCTGTCAGC


In [26]:
# Label vector: 1 when the row's 'hit seq' occurs among the o_sativa
# sequences, 0 otherwise.
Y = level1['hit seq'].isin(o_sativa['data']).astype('int64')

565

# Preprocess: convert ct-analizer result to feature vector

In [97]:
# Presumably the categorical column(s) of `level1`, judging by the name; it is
# not used in the cells visible here. TODO: confirm how it is consumed downstream.
cat_cols = ['mir type']

In [98]:
# Columns of `level1` that make up the feature matrix.
cols = ['hit len',
        'flanking GC content',
        'flanking MFEI',
        'hit GC content',
        'hit complementarity percentage',
        'num of linking residues',
        'boi GC content',
        'boi delta G',
        'boi AMFE',
        'boi MFEI',
        'precursor gc content',
        'precursor delta G',
        'precursor AMFE',
        'precursor MFEI',
        'number of terminal structures',
        'primary stem length',
        'mismatch',
        'bulge',
        'internal loop',
        'base structure corrected length',
        'primary stem corrected length',
        'Loop distal junction distance',
        'Loop proximal junction distance']
# Take an explicit copy: X is modified by the following cells, and a bare
# selection from `level1` is flagged by pandas as a possible view, which makes
# later writes ambiguous (SettingWithCopyWarning, hidden here because warnings
# are globally ignored).
X = level1[cols].copy()

In [99]:
# Replace +inf in every column with that column's largest non-inf value.
# Work on an independent copy and assign each column back explicitly: the
# chained form `X[c].replace(..., inplace=True)` only changes a temporary
# Series under pandas copy-on-write and would leave X untouched.
X = X.copy()
for c in X.columns:
    finite_max = X.loc[X[c] != np.inf, c].max()
    X[c] = X[c].replace(np.inf, finite_max)

In [100]:
# Standardization: centre every feature on its mean and scale it by its
# sample standard deviation (pandas default, ddof=1).
mu = X.mean()
std = X.std()
# A constant feature has std == 0; dividing by it would turn the whole column
# into NaN. Scale such columns by 1 instead so they become all zeros.
X = (X - mu) / std.replace(0, 1.0)

# Deep learning model