# Taxonomy Assignment

For each mock community, we will perform the taxonomy assignment.

## Environment preparation

In [1]:
import qiime2.plugins.taxa.actions as taxa_actions

import os
from os import system
from os.path import join, basename, dirname, exists, splitext
from glob import glob
from itertools import product

import logging

%load_ext autoreload

# Custom functions
from utils import check_dir
import utils
from db_comparison import get_taxonomy, get_classifier
import mock_community

Results directory

In [3]:
# 16S hypervariable regions processed in this notebook.
v_regions = ['V4', 'V3-V4']

# Per-region flag; True marks the region whose QIIME2 outputs are read from
# the "paired" folder, False the one read from the "single" folder.
paired = {'V4': False, 'V3-V4': True}

# Mock communities to process.
# (To run a single community, e.g. only mockrobiota: mock_names = ['mockrobiota'])
mock_names = ['gutmock', 'vagimock', 'mockrobiota']

# Folder-name prefix used by each mock community's results directory.
_db_prefix = {
    'gutmock': 'gsrv',
    'vagimock': 'gsrv',
    'mockrobiota': 'mockrobiota',
}

# db_spec[mock][region] -> relative sub-directory of that mock/region's results.
db_spec = {
    mock: {
        region: f"{prefix}_{region}/{prefix}_{region}/NS-otus"
        for region in v_regions
    }
    for mock, prefix in _db_prefix.items()
}

Database paths

In [7]:
# Database files
created_dir = "new_approach/created_db"
or_dir = "new_approach/original_db"

# Registry of reference databases: databases[name] -> {'seq': path, 'taxa': path}.
# It starts with the original databases, whose file names are listed explicitly.
databases = {
    'silva_full-16S': {'seq': join(or_dir, 'silva_138_NR99_seqs.qza'),
                       'taxa': join(or_dir, 'silva_138_NR99_taxa.qza')
                       },
    'gg_full-16S': {'seq': join(or_dir, "gg_13_8_NR99_seqs.qza"),
                    'taxa': join(or_dir, "gg_13_8_NR99_taxa_spformat.qza")
                    },
    'rdp_full-16S': {'seq': join(or_dir, 'rdp_16S_seqs.qza'),
                     'taxa': join(or_dir, 'rdp_16S_taxa_spformat.qza')
                     },
    "itgdb_full-16S": {'seq': join(or_dir, "taxa_itgdb_seq.qza"),
                       'taxa': join(or_dir, "taxa_itgdb_taxa_spformat.qza")
                       },
    "gtdb_full-16S": {'seq': join(or_dir, "gtdb_full-16S_seqs.qza"),
                      'taxa': join(or_dir, "gtdb_full-16S_taxa.qza")
                      }
}

# add created databases: every "<db>_seqs.qza" / "<db>_taxa.qza" pair found in
# created_dir is registered under the key "<db>".
all_created_filenames = glob(join(created_dir, "*.qza"))

for filename in all_created_filenames:
    name = basename(filename)

    # skip gs databases
    if name.startswith(('gs99', 'gsv99')):
        continue
    # skip anything whose path mentions metasquare
    if 'metasquare' in filename:
        continue

    # sort seq files and taxa files
    if filename.endswith('seqs.qza'):
        kind, suffix = 'seq', '_seqs.qza'
    elif filename.endswith('taxa.qza'):
        kind, suffix = 'taxa', '_taxa.qza'
    else:
        # Fail loudly on files that follow neither naming convention, so a
        # stray artifact is not silently ignored.
        raise ValueError(
            f"Unexpected file in {created_dir!r}: {filename!r} "
            "(expected a name ending in 'seqs.qza' or 'taxa.qza')"
        )

    db = name.replace(suffix, '')
    databases.setdefault(db, {})[kind] = filename

In [8]:
# Sanity check: show the name of every registered database (original + created).
databases.keys()

dict_keys(['silva_full-16S', 'gg_full-16S', 'rdp_full-16S', 'itgdb_full-16S', 'gtdb_full-16S', 'gsrv_full-16S', 'gsrv_V4', 'gsrv_V3-V4', 'gsrv_V1-V3', 'gsrv_V3-V5', 'rdp_V4', 'rdp_V3-V4', 'rdp_V1-V3', 'rdp_V3-V5', 'silva_V4', 'silva_V3-V4', 'silva_V1-V3', 'silva_V3-V5', 'gg_V4', 'gg_V3-V4', 'gg_V1-V3', 'gg_V3-V5', 'itgdb_V4', 'itgdb_V3-V4', 'itgdb_V1-V3', 'itgdb_V3-V5', 'gsrv_V4_cluster-1', 'gsrv_V3-V4_cluster-0.99', 'gsrv_V3-V4_cluster-1', 'gsrv_V1-V3_cluster-0.99', 'gsrv_V1-V3_cluster-1', 'gsrv_V4_cluster-0.99', 'gsrv_V3-V5_cluster-0.99', 'gsrv_V3-V5_cluster-1', 'gsrv_full-16S_filt', 'gtdb_V4', 'gtdb_V3-V4', 'gtdb_V1-V3', 'gtdb_V3-V5'])

Rep-seqs

In [9]:
mock_dir = "mock-community"

# rep_seqs_files[mock][region] -> path of the "rep_seqs_<mock>.qza" artifact.
# Only the paths are collected here; nothing is loaded in this cell.
rep_seqs_files = {}
for mockname in mock_names:
    region_to_path = rep_seqs_files[mockname] = {}
    for v_region in v_regions:
        # The `paired` flag selects the "paired" or the "single" output folder.
        layout_dir = "source/qiime2/paired/" if paired[v_region] else "source/qiime2/single/"
        qiime_out = join(mock_dir, mockname, db_spec[mockname][v_region], layout_dir)

        rep_seqs_path = join(qiime_out, f"rep_seqs_{mockname}.qza")
        region_to_path[v_region] = rep_seqs_path

Qiime Tables

In [10]:
# tabs_files[mock][region] -> path of the "table_<mock>.qza" artifact.
# Only the paths are collected here; nothing is loaded in this cell.
tabs_files = {}
for mockname in mock_names:
    tabs_files[mockname] = {}
    for v_region in v_regions:
        # The `paired` flag selects the "paired" or the "single" output folder.
        if paired[v_region]:
            run_dir = "source/qiime2/paired/"
        else:
            run_dir = "source/qiime2/single/"
        qiime_out = join(mock_dir, mockname, db_spec[mockname][v_region], run_dir)

        tab_path = join(qiime_out, f"table_{mockname}.qza")
        tabs_files[mockname][v_region] = tab_path

In [None]:
# Display the collected table paths (mock -> region -> path) for inspection.
tabs_files

## QIIME2: NB classifiers

In [12]:
# Paths in the "classifiers" folder whose names end in "qza".
# glob() already returns a list, so no extra conversion is needed.
classifiers_paths = glob(join('classifiers', '*qza'))

## QIIME2: Taxonomy assignment

In [14]:
# dataset_reference_combinations[region] -> list of (mock_name, database_name)
# pairs to classify for that region.
dataset_reference_combinations = {}

for v_region in v_regions:
    # Keep only the databases whose name contains "_<region>"; the full-length
    # ("..._full-16S") databases are therefore not selected.
    region_tag = f"_{v_region}"
    reference_dbs = [db_name for db_name in databases if region_tag in db_name]
    dataset_reference_combinations[v_region] = list(product(mock_names, reference_dbs))

dataset_reference_combinations

{'V4': [('gutmock', 'gsrv_V4'),
  ('gutmock', 'rdp_V4'),
  ('gutmock', 'silva_V4'),
  ('gutmock', 'gg_V4'),
  ('gutmock', 'itgdb_V4'),
  ('gutmock', 'gsrv_V4_cluster-1'),
  ('gutmock', 'gsrv_V4_cluster-0.99'),
  ('gutmock', 'gtdb_V4'),
  ('vagimock', 'gsrv_V4'),
  ('vagimock', 'rdp_V4'),
  ('vagimock', 'silva_V4'),
  ('vagimock', 'gg_V4'),
  ('vagimock', 'itgdb_V4'),
  ('vagimock', 'gsrv_V4_cluster-1'),
  ('vagimock', 'gsrv_V4_cluster-0.99'),
  ('vagimock', 'gtdb_V4'),
  ('mockrobiota', 'gsrv_V4'),
  ('mockrobiota', 'rdp_V4'),
  ('mockrobiota', 'silva_V4'),
  ('mockrobiota', 'gg_V4'),
  ('mockrobiota', 'itgdb_V4'),
  ('mockrobiota', 'gsrv_V4_cluster-1'),
  ('mockrobiota', 'gsrv_V4_cluster-0.99'),
  ('mockrobiota', 'gtdb_V4')],
 'V3-V4': [('gutmock', 'gsrv_V3-V4'),
  ('gutmock', 'rdp_V3-V4'),
  ('gutmock', 'silva_V3-V4'),
  ('gutmock', 'gg_V3-V4'),
  ('gutmock', 'itgdb_V3-V4'),
  ('gutmock', 'gsrv_V3-V4_cluster-0.99'),
  ('gutmock', 'gsrv_V3-V4_cluster-1'),
  ('gutmock', 'gtdb_V3-V4'),


In [17]:
%%time
%autoreload

confidences = ['disable', 0.5, 0.7, 0.9, 0.98]
threads = 35
method = "naive-bayes"

all_tax = {}

for v_region in v_regions:
    logging.basicConfig(filename=f'tax_assignment_V4-V3-V4.log', format='%(asctime)s %(message)s', 
                        filemode = 'w', level=logging.INFO)

    for element in dataset_reference_combinations[v_region]:
        taxonomies = {}

        # Combination
        mock_name = element[0]
        db_key = element[1]

        all_tax.setdefault(mock_name, {})
        all_tax[mock_name].setdefault(v_region, {})
        all_tax[mock_name][v_region].setdefault(db_key, {})

        # Rep-seqs
        rep_seqs_file = rep_seqs_files[mock_name][v_region]
        rep_seqs = utils.import_qiime_artifact(rep_seqs_file)


        # Classifier
        clf =[path for path in classifiers_paths if db_key in path]

        # Confidence value
        for conf in confidences:
            tax_nb = get_taxonomy(
                rep_seqs_artifact = rep_seqs, classifier_paths = clf, 
                out_dir = join(mock_name, v_region), method = method, threads=threads,
                mock_community=True, p_confidence = conf, force=False,
                label=mock_name)

            all_tax[mock_name][v_region][db_key].update({key: item[1] for key, item in tax_nb.items()})

CPU times: user 35min 22s, sys: 5min 55s, total: 41min 17s
Wall time: 58min 46s


In [18]:
all_tax

{'gutmock': {'V4': {'gsrv_V4': {'gsrv_V4_cluster-1_0.001::[6,6]:disable': 'gutmock/V4/gsrv_V4_cluster-1/naive-bayes/0.001::[6,6]:disable/taxonomy_gsrv_V4_cluster-1_0.001::[6,6]:disable_gutmock.qza',
    'gsrv_V4_0.001::[6,6]:disable': 'gutmock/V4/gsrv_V4/naive-bayes/0.001::[6,6]:disable/taxonomy_gsrv_V4_0.001::[6,6]:disable_gutmock.qza',
    'gsrv_V4_cluster-0.99_0.001::[6,6]:disable': 'gutmock/V4/gsrv_V4_cluster-0.99/naive-bayes/0.001::[6,6]:disable/taxonomy_gsrv_V4_cluster-0.99_0.001::[6,6]:disable_gutmock.qza',
    'gsrv_V4_cluster-1_0.001::[7,7]:disable': 'gutmock/V4/gsrv_V4_cluster-1/naive-bayes/0.001::[7,7]:disable/taxonomy_gsrv_V4_cluster-1_0.001::[7,7]:disable_gutmock.qza',
    'gsrv_V4_0.001::[7,7]:disable': 'gutmock/V4/gsrv_V4/naive-bayes/0.001::[7,7]:disable/taxonomy_gsrv_V4_0.001::[7,7]:disable_gutmock.qza',
    'gsrv_V4_cluster-0.99_0.001::[7,7]:disable': 'gutmock/V4/gsrv_V4_cluster-0.99/naive-bayes/0.001::[7,7]:disable/taxonomy_gsrv_V4_cluster-0.99_0.001::[7,7]:disable_gu

## Taxa Barplots

In [None]:
%autoreload
levels = [1,2,3,4,5,6,7]
force=False

for lvl in levels:
    print(f"{lvl=}")
    for mockname, items in all_tax.items():
        print(f"{mockname=}")
        for v_region, val in items.items():
            print(f"{v_region=}")
            tab_file = tabs_files[mockname][v_region]
            tab = utils.import_qiime_artifact(tab_file)
            for db, elements in val.items():
                for key, tax in elements.items():           
                    tax_path = tax
                    tax_artifact = utils.import_qiime_artifact(tax)

#                     print(tax_path)
                    tb_path = join(dirname(tax_path),f"taxa-barplot-L{lvl}_{key}_{mockname}")
                    tsv_tab = f"{tb_path}.tsv"
#                     print(tsv_tab)

                    mock_community.generate_taxa_barplots(
                        tax_artifact = tax_artifact, tab_artifact = tab,
                        level = lvl, output = tsv_tab , force = force
                    )
                    