# Common

In [1]:
import json
import time
from subprocess import Popen, PIPE, STDOUT
import math
import numpy as np
import pandas as pd
import hashlib
import requests
import os, sys, subprocess
from tqdm.contrib.concurrent import process_map
from tqdm.notebook import tqdm
tqdm.pandas()
import multiprocessing as mp
import shutil
import urllib.parse
import glob
import os
import sys
import networkx
from networkx.algorithms.clique import find_cliques as maximal_cliques
from ast import literal_eval
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
sys.path.append("./src/")
from ct_analizer import get_row
from filter1 import filter1_run
from filter2 import filter2_run

In [2]:
def fasta_to_df(path):
    """Parse a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    path : str
        Location of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``tag`` (the header line without its leading ``>``)
        and ``data`` (the record's sequence lines concatenated into one
        string). A file without any header line yields an empty frame.

    Notes
    -----
    Surrounding whitespace is stripped from every line and blank or
    whitespace-only lines are skipped, so stray spaces never end up inside a
    sequence or a tag. Text that appears before the first header is ignored.
    """
    tags = []
    data = []
    # Pieces of the record currently being read; None until the first header is seen.
    chunks = None
    with open(path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the previous record (if there was one).
                if chunks is not None:
                    data.append(''.join(chunks))
                tags.append(line[1:])
                chunks = []
            elif chunks is not None:
                chunks.append(line)
    # Flush the last record, which has no following header to close it.
    if chunks is not None:
        data.append(''.join(chunks))
    return pd.DataFrame({'tag': tags, 'data': data})

In [3]:
def df_to_fasta(df, path):
    """Write a DataFrame of sequence records to a FASTA file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``tag`` column (header text, without ``>``) and a
        ``data`` column (the sequence).
    path : str
        Destination file; it is overwritten if it already exists.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``tag`` or ``data`` column. The check happens
        before the file is opened, so nothing is written in that case.

    Notes
    -----
    Every record is written as ``>tag``, newline, sequence, newline. An empty
    frame produces an empty file.
    """
    # Iterate over the two columns directly instead of appending from inside
    # DataFrame.apply: for an empty frame pandas may still invoke the applied
    # function once on a placeholder row, which would emit a bogus record.
    records = [f">{tag}\n{seq}\n" for tag, seq in zip(df['tag'], df['data'])]
    with open(path, 'w') as file:
        file.write(''.join(records))

# Download data from miRBase

In [33]:
# Local folder holding the miRBase files read by the following cells
# (mature.fa, mature_high_conf.fa, organisms.txt).
directory = "./miRBase_driven_data"
# Remote miRBase location. NOTE(review): not referenced by any cell visible in
# this section; presumably where the files in `directory` come from - confirm.
base = "https://www.mirbase.org/ftp/CURRENT"

In [34]:
# Load every mature sequence plus the subset flagged as high confidence.
mature = fasta_to_df(f'{directory}/mature.fa')
mature_high_conf = fasta_to_df(f'{directory}/mature_high_conf.fa')

# 'trim tag' keeps only the first two space-separated fields of the header.
mature['trim tag'] = mature['tag'].str.split(' ').str[:2].str.join(' ')

# A record is marked confident when its trimmed tag occurs among the
# high-confidence tags.
mature['confidence'] = mature['trim tag'].isin(mature_high_conf['tag'])

In [35]:
# The organism code is taken as the first three characters of the tag (e.g. "cel").
mature['organism'] = mature['tag'].str[:3]
print(mature.shape)
mature.head(2)

(48885, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
0,cel-let-7-5p MIMAT0000001 Caenorhabditis elega...,UGAGGUAGUAGGUUGUAUAGUU,cel-let-7-5p MIMAT0000001,True,cel
1,cel-let-7-3p MIMAT0015091 Caenorhabditis elega...,CUAUGCAAUUUUCUACCUUACC,cel-let-7-3p MIMAT0015091,True,cel


In [36]:
# Load the tab-separated organism table. `directory` already carries its own
# "./" prefix, so the path is built exactly like the FASTA paths above; prepending
# another "./" would also break the lookup if `directory` were an absolute path.
organism = pd.read_csv(f'{directory}/organisms.txt', sep='\t')
# Strip the '#' that the file's header line puts in the column names.
organism = organism.rename(columns=lambda col: col.replace('#', ''))
print(organism.shape)
organism.head(2)

(285, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
0,aqu,AQU,Amphimedon queenslandica,Metazoa;Porifera;,400682
1,nve,NVE,Nematostella vectensis,Metazoa;Cnidaria;,45351


In [39]:
# Distinct lineage strings, ordered from shortest to longest.
items = sorted(organism['tree'].unique(), key=len)
items

['Viruses;',
 'Mycetozoa;',
 'Alveolata;',
 'Metazoa;Porifera;',
 'Metazoa;Cnidaria;',
 'Viridiplantae;Chlorophyta;',
 'Viridiplantae;Embryophyta;',
 'Viridiplantae;Coniferophyta;',
 'Viridiplantae;Magnoliophyta;',
 'Metazoa;Bilateria;Deuterostoma;',
 'Chromalveolata;Heterokontophyta;',
 'Metazoa;Bilateria;Ecdysozoa;Nematoda;',
 'Metazoa;Bilateria;Lophotrochozoa;Annelida;',
 'Metazoa;Bilateria;Lophotrochozoa;Nemertea;',
 'Metazoa;Bilateria;Lophotrochozoa;Mollusca;',
 'Viridiplantae;Magnoliophyta;monocotyledons;',
 'Metazoa;Bilateria;Deuterostoma;Hemichordata;',
 'Metazoa;Bilateria;Deuterostoma;Echinodermata;',
 'Metazoa;Bilateria;Lophotrochozoa;Brachiopoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Hexapoda;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Crustacea;',
 'Metazoa;Bilateria;Lophotrochozoa;Platyhelminthes;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Chelicerata;',
 'Metazoa;Bilateria;Ecdysozoa;Arthropoda;Mandibulata;',
 'Viridiplantae;Magnoliophyta;eudicotyledons;Poaceae;',
 'M

In [48]:
# Keep the organisms whose lineage string contains the monocotyledon clade.
# NOTE(review): the output saved under this cell lists eudicotyledon rows, so it
# appears to come from an earlier filter string - re-run the cell to refresh it.
in_clade = organism['tree'].map(
    lambda lineage: "Viridiplantae;Magnoliophyta;monocotyledons;" in lineage
)
selectedTree = organism.loc[in_clade]
print(selectedTree.shape)
selectedTree.head(5)

(78, 5)


Unnamed: 0,organism,division,name,tree,NCBI-taxid
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702
71,bna,BNA,Brassica napus,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3708
72,bol,BOL,Brassica oleracea,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3712
73,bra,BRA,Brassica rapa,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3711
74,cpa,CPA,Carica papaya,Viridiplantae;Magnoliophyta;eudicotyledons;Car...,3649


In [49]:
# Exclude the organism named "Oryza sativa" from the selection.
selectedTree = selectedTree.loc[selectedTree['name'].ne("Oryza sativa")]
selectedTree

Unnamed: 0,organism,division,name,tree,NCBI-taxid
70,ath,ATH,Arabidopsis thaliana,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3702
71,bna,BNA,Brassica napus,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3708
72,bol,BOL,Brassica oleracea,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3712
73,bra,BRA,Brassica rapa,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,3711
74,cpa,CPA,Carica papaya,Viridiplantae;Magnoliophyta;eudicotyledons;Car...,3649
...,...,...,...,...,...
258,cst,CST,Cucumis sativus,Viridiplantae;Magnoliophyta;eudicotyledons;Cuc...,3659
262,cas,CAS,Camelina sativa,Viridiplantae;Magnoliophyta;eudicotyledons;Bra...,90675
264,pla,PLA,Paeonia lactiflora,Viridiplantae;Magnoliophyta;eudicotyledons;Pae...,35924
274,smi,SMI,Salvia miltiorrhiza,Viridiplantae;Magnoliophyta;eudicotyledons;Lam...,226208


In [50]:
# Mature records whose organism code belongs to one of the selected organisms.
from_selected_organism = mature['organism'].isin(selectedTree['organism'])
selected = mature.loc[from_selected_organism]
print(selected.shape)
selected.head(1)

(8559, 5)


Unnamed: 0,tag,data,trim tag,confidence,organism
316,ath-miR156a-5p MIMAT0000166 Arabidopsis thalia...,UGACAGAAGAGAGUGAGCAC,ath-miR156a-5p MIMAT0000166,False,ath


In [51]:
# Number of distinct sequences among the selected records (missing values count too).
selected['data'].nunique(dropna=False)

4780

In [22]:
# Attach the organism metadata from `selectedTree` to every selected record.
# Only columns that `selected` does not carry yet are merged in, so running this
# cell a second time no longer piles up suffixed duplicates (`name_x`, `name_y`, ...).
missing_cols = [col for col in selectedTree.columns if col not in selected.columns]
selected = selected.merge(
    selectedTree[['organism'] + missing_cols], how="inner", on="organism"
)
selected.shape

(738, 9)

In [38]:
# For every selected organism, report how many distinct sequences are flagged
# as confident and how many are not.
for name in selectedTree['name']:
    tree = selectedTree.loc[selectedTree['name'] == name]
    # The inner join keeps only the records that belong to this organism.
    _selected = selected.merge(tree, how="inner", on="organism")
    is_confident = _selected['confidence']
    conf = _selected[is_confident].drop_duplicates(subset=['data'], keep='first')
    not_conf = _selected[~is_confident].drop_duplicates(subset=['data'], keep='first')
    name_field = str(name).ljust(30, " ")
    not_conf_field = str(not_conf.shape[0]).ljust(5, " ")
    print(f'{name_field}  mir not confidence: {not_conf_field}mir conf: {conf.shape[0]}')

Chlamydomonas reinhardtii       mir not confidence: 85   mir conf: 0
Pinus taeda                     mir not confidence: 32   mir conf: 0
Physcomitrella patens           mir not confidence: 215  mir conf: 0
Selaginella moellendorffii      mir not confidence: 60   mir conf: 0
Arabidopsis thaliana            mir not confidence: 350  mir conf: 0
Brassica napus                  mir not confidence: 53   mir conf: 0
Brassica oleracea               mir not confidence: 10   mir conf: 0
Brassica rapa                   mir not confidence: 127  mir conf: 0
Carica papaya                   mir not confidence: 52   mir conf: 0
Glycine max                     mir not confidence: 569  mir conf: 0
Lotus japonicus                 mir not confidence: 291  mir conf: 0
Medicago truncatula             mir not confidence: 426  mir conf: 0
Vigna unguiculata               mir not confidence: 17   mir conf: 0
Gossypium herbaceum             mir not confidence: 1    mir conf: 0
Gossypium hirsutum              mi