In [1]:
# Name of the experiment; used below as the sub-directory under `experiment_dir`.
experiment = "O.sativa"
# File name of the input genome inside the experiment directory.
input_genome_name = "GCF_001433935.1.fna"


# Root directory that contains one sub-directory per experiment.
experiment_dir = "Experiment"

In [2]:
# Location of the input genome file for this experiment.
input_genome_path = f'{experiment_dir}/{experiment}/{input_genome_name}'

# Working directories for intermediate files and for final results.
temp_path = f"{experiment_dir}/{experiment}/Temp"
result_path = f"{experiment_dir}/{experiment}/Result"

# Variants with every space backslash-escaped (presumably for use inside shell
# commands -- confirm against the cells that consume them).
# The backslash is written as "\\ " because a bare "\ " is an invalid escape
# sequence that newer Python versions warn about; the resulting string is the same.
temp_path_f = temp_path.replace(" ", "\\ ")
result_path_f = result_path.replace(" ", "\\ ")

# Common

In [3]:
#!pip install tqdm

In [4]:
import json
import time
from subprocess import Popen, PIPE, STDOUT
import math
import numpy as np
import pandas as pd
import hashlib
import requests
import os, sys, subprocess
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import urllib.parse
import glob
import os
import sys
import networkx
from networkx.algorithms.clique import find_cliques as maximal_cliques
from ast import literal_eval
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
sys.path.append("./src/")
from ct_analizer import get_row
from filter1 import filter1_run
from filter2 import filter2_run

In [5]:
# Create the working directories. makedirs also creates any missing parent
# directory, and exist_ok=True makes the call a no-op when the directory is
# already there (no separate exists() check, hence no check-then-create race).
os.makedirs(temp_path, exist_ok=True)
os.makedirs(result_path, exist_ok=True)

In [6]:
# Remember the working directory that is in effect when this cell runs.
current_path = os.getcwd()

In [7]:
def bracket_row(row):
    """Split ``row['data']`` into a sequence part and a structure part.

    The value of ``row['data']`` is cut at the first occurrence of '.' or '('
    (whichever comes first). Everything before the cut stays in
    ``row['data']``; everything from the cut onwards is stored in
    ``row['bracket']``.

    Parameters
    ----------
    row : mapping with a string under the key 'data' (a dict or a pandas
        Series row both work). It is modified in place.

    Returns
    -------
    The same ``row`` object. When the string contains neither '.' nor '(',
    'data' is left whole and 'bracket' is the empty string.
    """
    data = row['data']
    # str.find reports "not found" as -1. Drop those results so a missing
    # character cannot win the min() and chop the last base off the sequence.
    positions = [pos for pos in (data.find('.'), data.find('(')) if pos != -1]
    index = min(positions) if positions else len(data)
    row['data'] = data[:index]
    row['bracket'] = data[index:]
    return row

In [8]:
def adjust(text, n=7):
    """Return ``str(text)`` right-aligned in a field of ``n`` characters.

    The value is padded on the left with spaces. A value whose text is
    already ``n`` characters or longer is returned unpadded and untruncated.
    """
    return str(text).rjust(n)

In [9]:
def bracket_to_ct(tag, data, bracket, deltaG, negative_deltaG=True):
    """Render a dot-bracket structure as connectivity-table (CT) text.

    Parameters
    ----------
    tag : label appended to the header line.
    data : sequence string; ``data[i]`` is written for position ``i``.
    bracket : structure string made of '.', '(' and ')'.
    deltaG : energy as text, optionally wrapped in parentheses
        (e.g. ``"(-12.30)"``); the parentheses are stripped before parsing.
    negative_deltaG : when True, a positive energy value is negated.

    Returns
    -------
    str
        A header line ``<len(data)> dG =<energy> <tag>`` followed by one line
        per position of ``bracket`` with the columns: 1-based index, base,
        index - 1, next index (0 after the last base), 1-based pairing
        partner (0 when unpaired), 1-based index.

    Notes
    -----
    Malformed structures are reported by printing ``structure error!`` and
    processing continues. An unmatched ')' is left unpaired instead of
    raising IndexError on the empty stack.
    """
    deltaG = float(deltaG.replace('(', '').replace(')', ''))
    if deltaG > 0 and negative_deltaG:
        # Callers may pass the magnitude only; force the negative sign.
        deltaG = -1 * deltaG

    length = len(bracket)
    partners = [0] * length   # 1-based partner per position, 0 = unpaired
    open_positions = []       # 0-based positions of '(' still waiting for ')'
    for i, symbol in enumerate(bracket):
        if symbol == '.':
            continue
        if symbol == '(':
            open_positions.append(i)
        elif symbol == ')':
            if not open_positions:
                # Nothing to close: report it like the other malformed cases
                # and carry on rather than indexing an empty stack.
                print('structure error!')
                continue
            opener = open_positions.pop()
            partners[opener] = i + 1
            partners[i] = opener + 1
        else:
            print('structure error!')
    if open_positions:
        print('structure error!')

    # Collect the lines and join once instead of growing a string in the loop.
    rows = [f"{adjust(len(data),6)} dG ={adjust(deltaG,10)} {tag}\n"]
    for i in range(length):
        rows.append(
            f"{adjust(i + 1,6)} {data[i]} {adjust(i,6)} {adjust((i+2)%(len(data)+1),6)} {adjust(partners[i],6)} {adjust(i + 1,7)}\n"
        )
    return ''.join(rows)

In [10]:
def fasta_to_df(path):
    """Parse a FASTA file into a DataFrame with the columns 'tag' and 'data'.

    Every non-empty line starting with '>' opens a new record; its text
    (without the leading '>') becomes the record's 'tag'. All other non-empty
    lines up to the next header are concatenated into the record's 'data'.
    Text that appears before the first header is discarded.
    """
    with open(path, 'r') as handle:
        content = handle.read()

    headers = []
    sequences = []
    chunks = []  # sequence pieces collected since the most recent header
    for record_line in content.split('\n'):
        if len(record_line) == 0:
            continue
        if record_line[0] == '>':
            headers.append(record_line)
            # Close the previous record (or the pre-header text) first.
            sequences.append(''.join(chunks))
            chunks = []
        else:
            chunks.append(record_line)
    sequences.append(''.join(chunks))

    # sequences[0] is whatever preceded the first header, so it is skipped.
    frame = pd.DataFrame({'tag': headers, 'data': sequences[1:]})
    frame['tag'] = frame['tag'].apply(lambda header: header[1:])
    return frame

In [11]:
def df_to_fasta(df, path):
    """Write ``df`` to ``path`` as FASTA text.

    Each row produces two lines: ``>`` followed by the row's 'tag', then the
    row's 'data'. The file at ``path`` is overwritten.
    """
    records = []
    for _, row in df.iterrows():
        records.append(f">{row['tag']}\n{row['data']}\n")
    with open(path, 'w') as handle:
        handle.write(''.join(records))

In [12]:
def reformat(path):
    """Return ``path`` with '(', ')' and ':' turned into '_' and every '.' removed."""
    substitutions = str.maketrans({'(': '_', ')': '_', ':': '_', '.': None})
    return path.translate(substitutions)

In [13]:
def reformatCT(path):
    """Read the file at ``path`` and return its text with whitespace normalised.

    For every line: tabs count as spaces, runs of spaces collapse to a single
    space, and leading/trailing spaces are removed. Empty lines are dropped,
    and so are lines that contain only spaces or tabs (these used to raise
    IndexError). An empty file yields the empty string.

    Returns
    -------
    str
        The cleaned lines joined with '\\n' (no trailing newline).
    """
    with open(path, 'r') as file:
        raw_lines = file.read().split('\n')

    cleaned = []
    for raw in raw_lines:
        # Splitting on ' ' and discarding the empty pieces collapses every
        # run of spaces/tabs and trims both ends in a single step.
        fields = [field for field in raw.replace("\t", " ").split(" ") if field]
        if fields:
            cleaned.append(" ".join(fields))
    return "\n".join(cleaned)

In [14]:
def get_ct_data(ct):
    """Extract three columns from space-separated CT text.

    The first line of ``ct`` (the header) is skipped; the remaining lines are
    parsed as a table without column names, using a single space as separator.

    Returns
    -------
    list
        ``[nucleotide, index, values]`` taken from the table's columns
        1, 5 and 4 (0-based) respectively, each as a pandas Series.
    """
    # Everything after the first newline, i.e. the text minus its header line.
    body = ct.partition('\n')[2]
    table = pd.read_csv(StringIO(body), sep=" ", header=None)
    return [table.iloc[:, column] for column in (1, 5, 4)]

In [15]:
def ct2dot_bracket(path):
    """Convert the CT file at ``path`` to sequence + dot-bracket text.

    The file is cleaned with ``reformatCT`` and parsed with ``get_ct_data``.

    Returns
    -------
    str
        The nucleotides joined into one line, a newline, then one structure
        symbol per table row: '.' when the partner value is 0, '(' when the
        partner has not been opened yet, ')' when it has.
    """
    nucleotide, index, values = get_ct_data(reformatCT(path))
    symbols = []
    opened = set()  # positions that already produced '('; a set keeps lookups O(1)
    for i, v in zip(index, values):
        if v == 0:
            symbols.append('.')
            continue
        if v not in opened:
            symbols.append('(')
            opened.add(i)
        # Checked separately (not as an else) so the emitted symbols match the
        # previous implementation for every input.
        if v in opened:
            symbols.append(')')
    return ''.join(nucleotide) + "\n" + ''.join(symbols)

In [16]:
def is_nested(index, values):
    """Scan (position, partner) pairs and report whether any partner exceeds the current limit.

    A running ``limit`` starts at a sentinel larger than every position. For
    each pair, in order: a non-zero partner smaller than the limit becomes the
    new limit; reaching or passing the limit resets it to the sentinel; a
    partner larger than the limit ends the scan with ``False``.

    NOTE(review): presumably this detects crossing base pairs (pseudoknots)
    in a CT table -- confirm against the callers.

    Returns
    -------
    bool
        ``False`` as soon as a partner exceeds the limit, otherwise ``True``.
    """
    sentinel = max(index) + 10  # acts as "infinity": above any position
    limit = sentinel
    for position, partner in zip(index, values):
        if partner != 0 and partner < limit:
            limit = partner
        if position >= limit:
            limit = sentinel
        if partner > limit:
            return False
    return True

# Load data downloaded from miRBase

In [17]:
# Local directory holding the miRBase files read by the following cells.
directory = './miRBase_driven_data'

In [18]:
# Parse both mature-sequence FASTA files: every entry, and the high-confidence subset.
mature_high_conf = fasta_to_df(f'{directory}/mature_high_conf.fa')
mature = fasta_to_df(f'{directory}/mature.fa')

# 'trim tag' keeps only the first two space-separated fields of each header.
mature['trim tag'] = [' '.join(header.split(' ')[:2]) for header in mature['tag']]
# An entry counts as high confidence when its trimmed header equals a header
# of the high-confidence file.
mature['confidence'] = mature['trim tag'].isin(mature_high_conf['tag'])

In [19]:
# The organism code is the part of the tag before its first '-'
# (e.g. "cel" in "cel-let-7-5p"). Splitting instead of slicing three
# characters keeps codes of other lengths intact, so they still match the
# 'organism' column of organisms.txt.
mature['organism'] = mature['tag'].apply(lambda x: x.split('-', 1)[0])
print(mature.shape)
mature.head(2)

(48885, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
0,cel-let-7-5p MIMAT0000001 Caenorhabditis elega...,UGAGGUAGUAGGUUGUAUAGUU,cel-let-7-5p MIMAT0000001,True,cel
1,cel-let-7-3p MIMAT0015091 Caenorhabditis elega...,CUAUGCAAUUUUCUACCUUACC,cel-let-7-3p MIMAT0015091,True,cel


In [20]:
# `directory` already carries its own "./" prefix; build the path the same way
# as the FASTA cells so it keeps working if `directory` is changed (e.g. to an
# absolute path).
organism = pd.read_csv(f'{directory}/organisms.txt', sep='\t')
organism.columns = [c.replace('#','') for c in organism.columns] # remove the '#' from column names
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [21]:
# Distinct taxonomy strings, shortest first (the sort is stable, so strings of
# equal length keep their order of first appearance).
items = sorted(organism['tree'].unique(), key=len)
items

['Viruses;',
 'Mycetozoa;',
 'Alveolata;',
 'Metazoa;Porifera;',
 'Metazoa;Cnidaria;',
 'Viridiplantae;Chlorophyta;',
 'Viridiplantae;Embryophyta;',
 'Viridiplantae;Coniferophyta;',
 'Viridiplantae;Magnoliophyta;',
 'Metazoa;Bilateria;Deuterostoma;',
 'Chromalveolata;Heterokontophyta;',
 'Metazoa;Bilateria;Ecdysozoa;Nematoda;',
 'Metazoa;Bilateria;Lophotrochozoa;Annelida;',
 'Metazoa;Bilateria;Lophotrochozoa;Nemertea;',
 'Metazoa;Bilateria;Lophotrochozoa;Mollusca;',
 'Viridiplantae;Magnoliophyta;monocotyledons;',
 'Metazoa;Bilateria;Deuterostoma;Hemichordata;',
 'Metazoa;Bilateria;Deuterostoma;Echinodermata;',
 'Metazoa;Bilateria;Lophotrochozoa;Brachiopoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Hexapoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Crustacea;',
 'Metazoa;Bilateria;Lophotrochozoa;Platyhelminthes;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Chelicerata;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Mandibulata;',
 'Viridiplantae;Magnoliophyta;eudicotyledons;Poaceae;',
 'M

In [22]:
# Keep the organisms whose taxonomy string contains "Viridiplantae;".
selectedTree = organism.loc[organism['tree'].map(lambda lineage: "Viridiplantae;" in lineage)]
print(selectedTree.shape)
selectedTree.head(5)

(86, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
66,cre,CRE,Chlamydomonas reinhardtii,Viridiplantae;Chlorophyta;,3055
67,pta,PTA,Pinus taeda,Viridiplantae;Coniferophyta;,3352
68,ppt,PPT,Physcomitrella patens,Viridiplantae;Embryophyta;,3218
69,smo,SMO,Selaginella moellendorffii,Viridiplantae;Embryophyta;,88036
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702


In [23]:
# Narrow the selection down to the single species named "Oryza sativa".
selectedTree = selectedTree.loc[selectedTree['name'].eq("Oryza sativa")]

In [24]:
# Mature entries whose organism code belongs to the selected organisms.
selected = mature.loc[mature['organism'].isin(selectedTree['organism'])]
print(selected.shape)
selected.head(1)

(738, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
1068,osa-miR156a MIMAT0000618 Oryza sativa miR156a,UGACAGAAGAGAGUGAGCAC,osa-miR156a MIMAT0000618,False,osa


In [25]:
# remove redundant
selected = selected.drop_duplicates(subset=['data'], keep='first')
selected.shape

(577, 5)

In [26]:
# Display the de-duplicated selection.
selected

Unnamed: 0,tag,data,trim tag,confidence,organism
1068,osa-miR156a MIMAT0000618 Oryza sativa miR156a,UGACAGAAGAGAGUGAGCAC,osa-miR156a MIMAT0000618,False,osa
1070,osa-miR156b-3p MIMAT0022845 Oryza sativa miR15...,GCUCACUCUCUAUCUGUCAGC,osa-miR156b-3p MIMAT0022845,True,osa
1072,osa-miR156c-3p MIMAT0022846 Oryza sativa miR15...,GCUCACUUCUCUCUCUGUCAGC,osa-miR156c-3p MIMAT0022846,True,osa
1076,osa-miR156f-3p MIMAT0022847 Oryza sativa miR15...,GCUCACUUCUCUUUCUGUCAGC,osa-miR156f-3p MIMAT0022847,True,osa
1083,osa-miR156j-3p MIMAT0022850 Oryza sativa miR15...,GCUCGCUCCUCUUUCUGUCAGC,osa-miR156j-3p MIMAT0022850,True,osa
...,...,...,...,...,...
43057,osa-miR2120b-3p MIMAT0044645 Oryza sativa miR2...,UUUAGUCGCGGUUGGUGUUA,osa-miR2120b-3p MIMAT0044645,False,osa
43058,osa-miR5801c-5p MIMAT0044646 Oryza sativa miR5...,AUCGUUUCCGAUCGUUGGAUC,osa-miR5801c-5p MIMAT0044646,False,osa
43059,osa-miR5801c-3p MIMAT0044647 Oryza sativa miR5...,UAGAUCCAACAAUCGAAAACG,osa-miR5801c-3p MIMAT0044647,False,osa
43060,osa-miR6245b-5p MIMAT0044648 Oryza sativa miR6...,UUUAAUAGAACCGACACCUAU,osa-miR6245b-5p MIMAT0044648,False,osa


In [29]:
# Load result_level1_filter.csv from the result directory.
# NOTE(review): no visible cell writes this file -- it must already exist before this cell runs.
result = pd.read_csv(f"{result_path}/result_level1_filter.csv")
result.shape

(82318, 134)

In [30]:
# Distinct values of the 'hit seq' column.
hits = result['hit seq'].unique()
hits.shape

(11172,)

In [31]:
# Lower-case every hit sequence so it can be compared with the lower-cased miRNA sequences.
hits = pd.Series(hits).map(str.lower)

In [32]:
# Convert the sequences from the RNA to the DNA alphabet (U -> T) and
# lower-case them. assign() builds a new frame rather than writing a column
# into a filtered slice, which avoids pandas' SettingWithCopyWarning.
selected = selected.assign(
    data=selected['data'].apply(lambda x: x.replace("U", "T").lower())
)

In [33]:
# Display the converted sequences.
selected['data'] 

1068       tgacagaagagagtgagcac
1070      gctcactctctatctgtcagc
1072     gctcacttctctctctgtcagc
1076     gctcacttctctttctgtcagc
1083     gctcgctcctctttctgtcagc
                  ...          
43057      tttagtcgcggttggtgtta
43058     atcgtttccgatcgttggatc
43059     tagatccaacaatcgaaaacg
43060     tttaatagaaccgacacctat
43061     gtataggtgtcggttctatta
Name: data, Length: 577, dtype: object

In [34]:
# Selected miRNA entries whose sequence also occurs among the hit sequences.
both = selected.loc[selected['data'].isin(hits)]

In [35]:
# Number of entries present in both sets (rows, columns).
both.shape

(90, 5)

In [38]:
# Count the matched entries flagged as high confidence (True counts as 1).
int(both['confidence'].sum())

27