In [8]:
import os
import pandas as pd
from qiime2 import Visualization
import matplotlib.pyplot as plt
import numpy as np

import qiime2 as q2

# Directory layout used throughout the notebook (paths are relative to the notebook).
data_dir = "data"            # outputs generated by this notebook
data_or = "../data"          # original inputs: sequences and metadata
database_dir = "database"    # reference-database downloads and curation steps
ASVdata = "../ASV/data"      # artifacts produced by the clustering notebook

# Later cells write into data_dir and database_dir; create them up front so a
# fresh checkout survives "Restart Kernel -> Run All" (no-op if they already exist).
for _out_dir in (data_dir, database_dir):
    os.makedirs(_out_dir, exist_ok=True)

**Explanation**\
The data paths need a short explanation: `data_dir` holds the data generated in this notebook, `data_or` points to the input sequences and metadata (in the `data` folder one level up), `database_dir` holds the files from the database curation steps, and `ASVdata` points to the files generated in the clustering notebook.

**Silva database curation**

In [25]:
! qiime rescript get-silva-data \
    --p-version '138' \
    --p-target 'SSURef_NR99' \
    --p-include-species-labels \
    --o-silva-sequences $database_dir/silva-138-ssu-nr99-seqs.qza \
    --o-silva-taxonomy $database_dir/silva-138-ssu-nr99-tax.qza

[32mSaved FeatureData[RNASequence] to: database/silva-138-ssu-nr99-seqs.qza[0m
[32mSaved FeatureData[Taxonomy] to: database/silva-138-ssu-nr99-tax.qza[0m
[0m

In [26]:
! qiime rescript cull-seqs \
     --i-sequences $database_dir/silva-138-ssu-nr99-seqs.qza \
     --p-num-degenerates 5 \
     --p-homopolymer-length 8 \
     --p-n-jobs 3 \
     --o-clean-sequences $database_dir/silva-138-ssu-nr99-seqs-cleaned.qza

[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-cleaned.qza[0m
[0m

In [27]:
! qiime rescript filter-seqs-length-by-taxon \
    --i-sequences $database_dir/silva-138-ssu-nr99-seqs-cleaned.qza \
    --i-taxonomy $database_dir/silva-138-ssu-nr99-tax.qza \
    --p-labels Archaea Bacteria Eukaryota \
    --p-min-lens 900 1200 1400 \
    --o-filtered-seqs $database_dir/silva-138-ssu-nr99-seqs-filt.qza \
    --o-discarded-seqs $database_dir/silva-138-ssu-nr99-seqs-discard.qza

[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-filt.qza[0m
[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-discard.qza[0m
[0m

In [28]:
! qiime rescript dereplicate \
    --i-sequences $database_dir/silva-138-ssu-nr99-seqs-filt.qza  \
    --i-taxa $database_dir/silva-138-ssu-nr99-tax.qza \
    --p-rank-handles 'silva' \
    --p-mode 'uniq' \
    --p-threads 3 \
    --o-dereplicated-sequences $database_dir/silva-138-ssu-nr99-seqs-derep-uniq.qza \
    --o-dereplicated-taxa $database_dir/silva-138-ssu-nr99-tax-derep-uniq.qza

[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-derep-uniq.qza[0m
[32mSaved FeatureData[Taxonomy] to: database/silva-138-ssu-nr99-tax-derep-uniq.qza[0m
[0m

In [3]:
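# NOTE: installing from the bare repository URL fails (pip downloads the HTML
# page and cannot unpack it, see the error output below); a VCS install needs
# the "git+" prefix:
#! pip install git+https://github.com/bokulich-lab/RESCRIPt.git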

Collecting https://github.com/bokulich-lab/RESCRIPt.git
  Downloading https://github.com/bokulich-lab/RESCRIPt.git
[2K     [32m-[0m [32m222.7 kB[0m [31m7.9 MB/s[0m [33m0:00:00[0m
[?25h[31m  ERROR: Cannot unpack file /tmp/pip-unpack-awapyz8a/RESCRIPt.git (downloaded from /tmp/pip-req-build-9l72e62g, content-type: text/html; charset=utf-8); cannot detect archive format[0m[31m
[0m[31mERROR: Cannot determine archive format of /tmp/pip-req-build-9l72e62g[0m[31m
[0m

In [29]:
# PCR region extraction -- disabled because the primers are unknown.
# NOTE(review): the two inputs look mismatched (demultiplexed reads as input
# sequences, a taxonomy artifact as reference segments) -- verify before enabling.
#! qiime rescript extract-seq-segments \
#    --i-input-sequences $data_or/sequences_demux_paired.qza \
#    --i-reference-segment-sequences $database_dir/silva-138-ssu-nr99-tax-derep-uniq.qza \
#    --p-perc-identity 0.7 \
#    --p-min-seq-len 10 \
#    --p-threads 3 \
#    --o-extracted-sequence-segments $data_dir/silva-138-ssu-nr99-tax-derep-uniq-extracted.qza \
#    --o-unmatched-sequences $data_dir/silva-138-ssu-nr99-tax-derep-uniq-unmatched.qza \
#    --verbose

In [30]:
#! qiime rescript --version

In [None]:
# Training a taxonomic classifier (disabled)
#! qiime feature-classifier fit-classifier-naive-bayes \
#   --i-reference-reads $data_dir/silva-138-ssu-nr99-seqs-515f-806r-uniq.qza \
#   --i-reference-taxonomy $data_dir/silva-138-ssu-nr99-tax-515f-806r-derep-uniq.qza \
#   --o-classifier $data_dir/PJNB-515f-806r-classifier.qza

In [10]:
! wget -nv -O $database_dir/515f-806r-classifier.qza https://data.qiime2.org/2021.4/common/gg-13-8-99-515-806-nb-classifier.qza

2022-10-18 11:23:42 URL:https://s3-us-west-2.amazonaws.com/qiime2-data/2021.4/common/gg-13-8-99-515-806-nb-classifier.qza [28289645/28289645] -> "database/515f-806r-classifier.qza" [1]


In [7]:
! qiime tools peek $ASVdata/PJNB_dada2_rep_set.qza

[32mUUID[0m:        8f33c8bc-40ef-4dab-aafd-f4d3d817a474
[32mType[0m:        FeatureData[Sequence]
[32mData format[0m: DNASequencesDirectoryFormat


In [11]:
! qiime tools peek $database_dir/515f-806r-classifier.qza

[32mUUID[0m:        4b2a57b7-1e5a-4a4d-8201-99551ab50858
[32mType[0m:        TaxonomicClassifier
[32mData format[0m: TaxonomicClassiferTemporaryPickleDirFmt


In [12]:
#taxonomic classification
! qiime feature-classifier classify-sklearn \
    --i-classifier $database_dir/515f-806r-classifier.qza \
    --i-reads $ASVdata/PJNB_dada2_rep_set.qza \
    --o-classification $data_dir/taxonomy_classification.qza

[32mSaved FeatureData[Taxonomy] to: data/taxonomy_classification.qza[0m
[0m

In [13]:
#visualization
! qiime metadata tabulate \
    --m-input-file $data_dir/taxonomy_classification.qza \
    --o-visualization $data_dir/taxonomy_classification.qzv

[32mSaved Visualization to: data/taxonomy_classification.qzv[0m
[0m

In [14]:
# Display the tabulated taxonomy inline (the last expression is rendered by the notebook).
taxonomy_table_qzv = f'{data_dir}/taxonomy_classification.qzv'
Visualization.load(taxonomy_table_qzv)

In [18]:
! qiime taxa barplot \
    --i-table $ASVdata/PJNB_dada2_table_.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --m-metadata-file $data_or/metadata.tsv \
    --o-visualization $data_dir/taxa-prefiltered-bar-plots.qzv

[32mSaved Visualization to: data/taxa-prefiltered-bar-plots.qzv[0m
[0m

In [20]:
# Display the pre-filtering bar plot inline.
prefiltered_barplot_qzv = f'{data_dir}/taxa-prefiltered-bar-plots.qzv'
Visualization.load(prefiltered_barplot_qzv)

In [21]:
#filter sequences and table for mitochondria
! qiime taxa filter-table \
    --i-table $ASVdata/PJNB_dada2_table_.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --p-exclude mitochondria,chloroplast \
    --o-filtered-table $data_dir/PJNB_dada2_table-filtered.qza

! qiime taxa filter-seqs \
    --i-sequences $ASVdata/PJNB_dada2_rep_set.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --p-exclude mitochondria \
    --o-filtered-sequences $data_dir/PJNB_dada2_rep_set-filtered.qza

[32mSaved FeatureTable[Frequency] to: data/PJNB_dada2_table-filtered.qza[0m
[0m[32mSaved FeatureData[Sequence] to: data//PJNB_dada2_rep_set-filtered.qza[0m
[0m

In [23]:
! qiime taxa barplot \
    --i-table $data_dir/PJNB_dada2_table-filtered.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --m-metadata-file $data_or/metadata.tsv \
    --o-visualization $data_dir/taxa-bar-plots-filtered.qzv

[32mSaved Visualization to: data/taxa-bar-plots-filtered.qzv[0m
[0m

In [24]:
# Display the post-filtering bar plot inline.
filtered_barplot_qzv = f'{data_dir}/taxa-bar-plots-filtered.qzv'
Visualization.load(filtered_barplot_qzv)

**GTDB database**

In [14]:
# Bacteria
! wget -nv -O $database_dir/bac120_taxonomy_r202.tsv.gz https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_taxonomy_r202.tsv.gz
! gunzip $database_dir/bac120_taxonomy_r202.tsv.gz
! wget -nv -O $database_dir/bac120_ssu_reps_r202.tar.gz https://data.gtdb.ecogenomic.org/releases/release202/202.0/genomic_files_reps/bac120_ssu_reps_r202.tar.gz
! tar -xvf $database_dir/bac120_ssu_reps_r202.tar.gz -C $database_dir/

2022-10-19 09:53:20 URL:https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_taxonomy_r202.tsv.gz [2469383/2469383] -> "database/bac120_taxonomy_r202.tsv.gz" [1]
2022-10-19 09:53:42 URL:https://data.gtdb.ecogenomic.org/releases/release202/202.0/genomic_files_reps/bac120_ssu_reps_r202.tar.gz [10949720/10949720] -> "database/bac120_ssu_reps_r202.tar.gz" [1]
bac120_ssu_reps_r202.fna


In [15]:
#Archaea
! wget -nv -O $database_dir/ar122_taxonomy_r202.tsv.gz https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_taxonomy_r202.tsv.gz
! gunzip $database_dir/ar122_taxonomy_r202.tsv.gz

! wget -nv -O $database_dir/ar122_ssu_reps_r202.tar.gz https://data.gtdb.ecogenomic.org/releases/release202/202.0/genomic_files_reps/ar122_ssu_reps_r202.tar.gz
! tar -xvf  $database_dir/ar122_ssu_reps_r202.tar.gz -C $database_dir/

2022-10-19 09:54:04 URL:https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_taxonomy_r202.tsv.gz [63238/63238] -> "database/ar122_taxonomy_r202.tsv.gz" [1]
2022-10-19 09:54:09 URL:https://data.gtdb.ecogenomic.org/releases/release202/202.0/genomic_files_reps/ar122_ssu_reps_r202.tar.gz [541133/541133] -> "database/ar122_ssu_reps_r202.tar.gz" [1]
ar122_ssu_reps_r202.fna


In [16]:
# Bacteria
! qiime tools import \
    --input-path $database_dir/bac120_ssu_reps_r202.fna \
    --type 'FeatureData[Sequence]' \
    --output-path $database_dir/bact_seqs.qza

! qiime tools import \
    --input-path $database_dir/bac120_taxonomy_r202.tsv \
    --type 'FeatureData[Taxonomy]' \
    --input-format 'HeaderlessTSVTaxonomyFormat' \
    --output-path $database_dir/bact_tax.qza


# Archaea
! qiime tools import \
    --input-path $database_dir/ar122_ssu_reps_r202.fna \
    --type 'FeatureData[Sequence]' \
    --output-path $database_dir/arch_seqs.qza

! qiime tools import \
      --input-path $database_dir/ar122_taxonomy_r202.tsv \
      --type 'FeatureData[Taxonomy]' \
      --input-format 'HeaderlessTSVTaxonomyFormat' \
      --output-path $database_dir/arch_tax.qza

[32mImported database/bac120_ssu_reps_r202.fna as DNASequencesDirectoryFormat to database/bact_seqs.qza[0m
[0m[32mImported database/bac120_taxonomy_r202.tsv as HeaderlessTSVTaxonomyFormat to database/bact_tax.qza[0m
[0m[32mImported database/ar122_ssu_reps_r202.fna as DNASequencesDirectoryFormat to database/arch_seqs.qza[0m
[0m[32mImported database/ar122_taxonomy_r202.tsv as HeaderlessTSVTaxonomyFormat to database/arch_tax.qza[0m
[0m

In [None]:
! qiime feature-table merge-taxa \
      --i-data $database_dir/bact_tax.qza arch_tax.qza \
      --o-merged-data $database_dir/gtdb_tax.qza

! qiime feature-table merge-seqs \
      --i-data $database_dir/bact_seqs.qza arch_seqs.qza \
      --o-merged-data $database_dir/gtdb_seqs.qza

Usage: [94mqiime feature-table merge-taxa[0m [OPTIONS]

  Combines a pair of feature data objects which may or may not contain data
  for the same features. If different feature data is present for the same
  feature id in the inputs, the data from the first will be propagated to
  the result.

[1mInputs[0m:
  [94m[4m--i-data[0m ARTIFACTS... [32mList[FeatureData[Taxonomy]][0m
                         The collection of feature taxonomies to be merged.
                                                                    [35m[required][0m
[1mOutputs[0m:
  [94m[4m--o-merged-data[0m ARTIFACT [32mFeatureData[Taxonomy][0m
                         The resulting collection of feature taxonomies
                         containing all feature taxonomies provided.
                                                                    [35m[required][0m
[1mMiscellaneous[0m:
  [94m--output-dir[0m PATH      Output unspecified results to a directory
  [94m--verbose[0m / [94m--qu