# GSRV & Real Datasets

## Environment preparation

In [19]:
import qiime2.plugins.taxa.actions as taxa_actions

import os
from os import system
from os.path import join, basename, dirname, exists, splitext

%load_ext autoreload
# %load_ext memory_profiler
%matplotlib inline

import multiprocessing as mp
import logging
# from memory_profiler import memory_usage

import matplotlib.pyplot as plt

# Custom functions
from utils import check_dir
import utils
from db_comparison import get_taxonomy, get_classifier
import mock_community

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Send INFO-and-above records to a timestamped log file; filemode 'w'
# overwrites the file each time logging is configured.
logging.basicConfig(
    level=logging.INFO,
    filename='benchmarking_copy1.log',
    filemode='w',
    format='%(asctime)s %(message)s',
)

In [21]:
# Data directory of each sampled body site.
dirs = {
    'gut': "real_datasets/data/gut",
    'vagina': "real_datasets/data/vagina",
}

# (representative sequences, feature table) artifact file names per site.
_site_artifacts = {
    'gut': ("rep_seqs_dada2_FFQ_DR_final.qza", "table_FFQ_DR_final.qza"),
    'vagina': ("rep-seqs_run29_40b.qza", "table_run29_40b.qza"),
}

# site -> {'rep-seqs': path, 'table': path}
in_files = {
    site_name: {
        'rep-seqs': join(dirs[site_name], seqs_name),
        'table': join(dirs[site_name], table_name),
    }
    for site_name, (seqs_name, table_name) in _site_artifacts.items()
}

In [22]:
# Directory holding the reference database artifacts
created_dir = "/new_approach/created_db"

# Database label -> file-name prefix of its "<prefix>_seqs.qza" /
# "<prefix>_taxa.qza" artifact pair inside created_dir.
_db_prefixes = {
    "gsrv_V4": "gsrv_V4_cluster-1",
    "itgdb_V4": "itgdb_V4",
    "silva_V4": "silva_V4",
    "rdp_V4": "rdp_V4",
    "gg_V4": "gg_V4",
    # "metasquare_V4": "metasquare_V4",  # currently disabled
}

# label -> {'seq': sequences artifact path, 'taxa': taxonomy artifact path}
databases = {
    db_label: {
        'seq': join(created_dir, db_prefix + '_seqs.qza'),
        'taxa': join(created_dir, db_prefix + '_taxa.qza'),
    }
    for db_label, db_prefix in _db_prefixes.items()
}

## Build classifiers

In [23]:
# Directory where the classifier artifacts are stored; check_dir is the
# project helper that is handed this path before anything is written to it.
classifier_dir = 'classifiers'
utils.check_dir(classifier_dir)

In [24]:
# Ask get_classifier for the commands and output paths of one classifier per
# reference database (non-bespoke, existing outputs are not forced).
cmds_nb, clf_paths_nb = get_classifier(
    databases,
    classifier_dir,
    bespoke=False,
    p_alpha=0.001,
    p_feat='[7,7]',
    force=False,
)

# Commands to execute, and the classifier path each one is paired with later.
commands = list(cmds_nb)
classifiers_paths = clf_paths_nb

In [25]:
# First line: number of classifier paths; second line: number of queued commands.
print(len(classifiers_paths), len(commands), sep="\n")

5
0


In [None]:
%autoreload
mem_stamps = {}
for cmd, clf_path in zip(commands, classifiers_paths):
    # Initialize timer class
    t = utils.Timer(
        name = f"{clf_path}",
        logger = logging.info)
    # start record usage time
    t.start()
    # Run function + record memory usage
    res = memory_usage((os.system, [cmd]), 
        interval = 1, max_iterations = 1, include_children = True,
        retval=True)
    # Stop recording usage time
    t.stop()
    
    # Add memory usage information to logging
    mem = res[0]
    mem_stamps[clf_path] = mem
    logging.info(f"Memory Usage peak: {clf_path} {max(mem)* 1.048576} MB")
    logging.info(f"Memory Usage mean: {clf_path} {(sum(mem)/len(mem))* 1.048576} MB")
    
print(utils.Timer.timers)

In [None]:
# Plot the memory samples recorded for each classifier build.
# When no command was profiled mem_stamps is empty, and calling legend() on an
# empty axes only produces a warning and a blank figure, so skip the plot then.
if mem_stamps:
    fig, ax = plt.subplots()
    for clf_path, mem in mem_stamps.items():
        # x is the sample index, y the sampled memory value
        ax.plot(range(len(mem)), mem, label = clf_path)

    ax.set_xlabel("Sample index")
    ax.set_ylabel("Memory usage (MiB)")
    ax.set_title("Memory usage while building classifiers")
    ax.legend()
    plt.show()
else:
    print("mem_stamps is empty (no build command was profiled); nothing to plot.")

## Taxonomy assignment

In [None]:
# memory_usage is called below, but its import in the first cell is commented
# out; import it here so this cell does not raise NameError on a fresh kernel.
from memory_profiler import memory_usage

# The output directory does not change between iterations: check it once.
utils.check_dir('real_datasets')

# site -> merged get_taxonomy() results for every classifier. Keyed by site
# because the barplot cell looks up in_files[site] for each key of all_tax.
all_tax = {}

fig, ax = plt.subplots()

# Classify each site's representative sequences with each classifier,
# timing every run and sampling its memory usage.
for site in in_files.keys():
    # Rep-seqs
    rep_seqs_file = in_files[site]['rep-seqs']
    rep_seqs = utils.import_qiime_artifact(rep_seqs_file)

    all_tax[site] = {}

    for clf in classifiers_paths:

        # Positional arguments of get_taxonomy
        args = [
            # rep_seqs_artifact, classifier_paths, out_dir, label
            rep_seqs, [clf], 'real_datasets', site,
            # method, p_confidence, threads, mock_community, force
             "naive-bayes", 'disable', 35, False, False
               ]

        # Wall-clock timer that reports through the notebook's log file
        t = utils.Timer(
            name = f"{site}-{clf}",
            logger = logging.info)
        t.start()
        # Run get_taxonomy while sampling memory (children included);
        # retval=True makes the result a (samples, return value) pair.
        res = memory_usage((get_taxonomy, args),
            interval = 1, max_iterations = 1, include_children = True,
            retval=True)
        # Stop recording usage time
        t.stop()

        # Function output: merged into the per-site mapping.
        # Assumes get_taxonomy returns a dict whose keys differ between
        # classifiers, as the barplot cell's d.items() loop implies — confirm.
        tax_nb = res[1]
        all_tax[site].update(tax_nb)

        # Add memory usage information to logging.
        # Samples are reported in MiB; 1 MiB = 1.048576 MB.
        mem = res[0]
        logging.info(f"Memory Usage peak: {site}-{clf} {max(mem)* 1.048576} MB")
        logging.info(f"Memory Usage mean: {site}-{clf} {(sum(mem)/len(mem))* 1.048576} MB")

        # Plot memory usage (x: sample index, y: sampled memory value)
        ax.plot(range(len(mem)), mem, label = f"{site}-{clf}")

print(utils.Timer.timers)
ax.set_xlabel("Sample index")
ax.set_ylabel("Memory usage (MiB)")
ax.set_title("Memory usage during taxonomy assignment")
ax.legend()
plt.show()

In [29]:
# site -> get_taxonomy() result covering every classifier in classifiers_paths
all_tax = {}
for site, site_files in in_files.items():
    # Load this site's representative sequences
    rep_seqs_file = site_files['rep-seqs']
    rep_seqs = utils.import_qiime_artifact(rep_seqs_file)

    utils.check_dir('real_datasets')

    # Naive-Bayes classification against all classifiers in a single call
    tax_nb = get_taxonomy(
        rep_seqs_artifact=rep_seqs,
        classifier_paths=classifiers_paths,
        out_dir='real_datasets',
        label=site,
        method="naive-bayes",
        p_confidence='disable',
        threads=4,
        mock_community=False,
        force=False,
    )
    all_tax[site] = tax_nb

## Taxa-Barplots

In [31]:
# Level passed to generate_taxa_barplots and embedded in the output file name
lvl = 7
for site, site_taxonomies in all_tax.items():
    # Feature table of the current site
    tab = utils.import_qiime_artifact(in_files[site]['table'])

    for tax_key, tax_entry in site_taxonomies.items():
        # Each entry holds the taxonomy artifact first and its path second
        tax_artifact = tax_entry[0]
        tax_path = tax_entry[1]

        # The TSV is written next to the taxonomy file it is derived from
        out_dir = dirname(tax_path)
        tb_path = join(out_dir, f"taxa-barplot-L{lvl}_{tax_key}_{site}")
        tsv_tab = f"{tb_path}.tsv"
        print(site)

        mock_community.generate_taxa_barplots(
            tax_artifact=tax_artifact,
            tab_artifact=tab,
            level=lvl,
            output=tsv_tab,
            force=False,
        )

gut
gut
gut
gut
gut
vagina
vagina
vagina
vagina
vagina
