In [2]:
import os
import pandas as pd
from qiime2 import Visualization
import matplotlib.pyplot as plt
import numpy as np

import qiime2 as q2

# Directory layout used by this notebook.
data_dir = "data"            # files generated in this notebook
data_or = "../data"          # input sequences and metadata
database_dir = "database"    # intermediate files of the database curation
ASVdata = "../ASV/data"      # files generated in the clustering notebook

# Switch between curating the reference database locally (True) and
# fetching the previously generated files from polybox (False).
curatedb = False  # already ran it before so can just get it from polybox.

# The output folders are only created up front when curating locally.
if curatedb:
    for out_dir in (data_dir, database_dir):
        os.makedirs(out_dir, exist_ok=True)

**Explanation**\
The data paths need a short explanation: `data_dir` holds the data generated in this notebook, `data_or` holds the input sequences and metadata (the original `data` folder), `database_dir` holds the files produced during the database curation steps, and `ASVdata` holds the files generated in the clustering notebook.

**Silva database curation**

In [7]:
! qiime rescript get-silva-data \
    --p-version '138' \
    --p-target 'SSURef_NR99' \
    --p-include-species-labels \
    --o-silva-sequences $database_dir/silva-138-ssu-nr99-seqs.qza \
    --o-silva-taxonomy $database_dir/silva-138-ssu-nr99-tax.qza

[32mSaved FeatureData[RNASequence] to: database/silva-138-ssu-nr99-seqs.qza[0m
[32mSaved FeatureData[Taxonomy] to: database/silva-138-ssu-nr99-tax.qza[0m
[0m

In [8]:
! qiime rescript cull-seqs \
     --i-sequences $database_dir/silva-138-ssu-nr99-seqs.qza \
     --p-num-degenerates 5 \
     --p-homopolymer-length 8 \
     --p-n-jobs 3 \
     --o-clean-sequences $database_dir/silva-138-ssu-nr99-seqs-cleaned.qza

[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-cleaned.qza[0m
[0m

In [27]:
! qiime rescript filter-seqs-length-by-taxon \
    --i-sequences $database_dir/silva-138-ssu-nr99-seqs-cleaned.qza \
    --i-taxonomy $database_dir/silva-138-ssu-nr99-tax.qza \
    --p-labels Archaea Bacteria Eukaryota \
    --p-min-lens 900 1200 1400 \
    --o-filtered-seqs $database_dir/silva-138-ssu-nr99-seqs-filt.qza \
    --o-discarded-seqs $database_dir/silva-138-ssu-nr99-seqs-discard.qza

[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-filt.qza[0m
[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-discard.qza[0m
[0m

In [28]:
! qiime rescript dereplicate \
    --i-sequences $database_dir/silva-138-ssu-nr99-seqs-filt.qza  \
    --i-taxa $database_dir/silva-138-ssu-nr99-tax.qza \
    --p-rank-handles 'silva' \
    --p-mode 'uniq' \
    --p-threads 3 \
    --o-dereplicated-sequences $database_dir/silva-138-ssu-nr99-seqs-derep-uniq.qza \
    --o-dereplicated-taxa $database_dir/silva-138-ssu-nr99-tax-derep-uniq.qza

[32mSaved FeatureData[Sequence] to: database/silva-138-ssu-nr99-seqs-derep-uniq.qza[0m
[32mSaved FeatureData[Taxonomy] to: database/silva-138-ssu-nr99-tax-derep-uniq.qza[0m
[0m

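**Only run the next cell if you did not run the curation cells above (i.e. `curatedb = False`). In that case the directory-creation code at the top must not be run either, because the download provides folders with the same names (`data` and `database`).**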

In [10]:
if (curatedb == False):
    ! wget -nv -O data.zip 'https://polybox.ethz.ch/index.php/s/pNA39R0rl2xMMj9/download'
    ! unzip -q data.zip #-d $data_dir
    ! mv data data2
    ! mv data2/taxonomy/data .
    ! mv data2/taxonomy/database .
    ! rm -r data2
    ! rm data.zip

#! wget -nv -O $data_dir/data.zip 'https://polybox.ethz.ch/index.php/s/pNA39R0rl2xMMj9/download'
#! unzip -q $data_dir/data.zip -d $data_dir
#! rm $data_dir/data.zip

2022-10-20 09:30:45 URL:https://polybox.ethz.ch/index.php/s/pNA39R0rl2xMMj9/download [894620633] -> "data.zip" [1]
replace data/data/ASV/ASV_PJNB.ipynb? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
rm: cannot remove 'data.zip': No such file or directory


In [29]:
# PCR region extraction -- disabled because the primers used for this dataset are unknown.
# NOTE(review): before re-enabling, fix the command below:
#   - the two output paths start with a literal `data_dir/` (the `$` is missing);
#   - `--i-reference-segment-sequences` points at the dereplicated *taxonomy* artifact,
#     not at a sequence artifact -- confirm which file was intended.
#! qiime rescript extract-seq-segments \
#    --i-input-sequences $data_or/sequences_demux_paired.qza \
#    --i-reference-segment-sequences $database_dir/silva-138-ssu-nr99-tax-derep-uniq.qza \
#    --p-perc-identity 0.7 \
#    --p-min-seq-len 10 \
#    --p-threads 3 \
#    --o-extracted-sequence-segments data_dir/silva-138-ssu-nr99-tax-derep-uniq-extracted.qza \
#    --o-unmatched-sequences data_dir/silva-138-ssu-nr99-tax-derep-uniq-unmached.qza \
#    --verbose

In [30]:
# Debugging aid (disabled): query the version of the RESCRIPt plugin.
#! qiime rescript --version

In [10]:
# Training a taxonomic classifier on the curated SILVA reference (disabled).
# Observed problem: the command simply stops -- no output and no error message.
# NOTE(review): cause not established; possibly the process is killed for lack of
# memory -- check available RAM before re-enabling.
# ! qiime feature-classifier fit-classifier-naive-bayes \
#   --i-reference-reads $database_dir/silva-138-ssu-nr99-seqs-derep-uniq.qza \
#   --i-reference-taxonomy $database_dir/silva-138-ssu-nr99-tax-derep-uniq.qza \
#   --o-classifier $database_dir/ML-515f-806r-classifier.qza

In [11]:
! wget -nv -O $database_dir/515f-806r-classifier.qza https://data.qiime2.org/2021.4/common/gg-13-8-99-515-806-nb-classifier.qza

2022-10-20 10:17:42 URL:https://s3-us-west-2.amazonaws.com/qiime2-data/2021.4/common/gg-13-8-99-515-806-nb-classifier.qza [28289645/28289645] -> "database/515f-806r-classifier.qza" [1]


In [17]:
! qiime tools peek $ASVdata/PJNB_dada2_rep_set.qza

[32mUUID[0m:        8f33c8bc-40ef-4dab-aafd-f4d3d817a474
[32mType[0m:        FeatureData[Sequence]
[32mData format[0m: DNASequencesDirectoryFormat


In [18]:
! qiime tools peek $database_dir/515f-806r-classifier.qza

[32mUUID[0m:        4b2a57b7-1e5a-4a4d-8201-99551ab50858
[32mType[0m:        TaxonomicClassifier
[32mData format[0m: TaxonomicClassiferTemporaryPickleDirFmt


In [12]:
#taxonomic classification
! qiime feature-classifier classify-sklearn \
    --i-classifier $database_dir/515f-806r-classifier.qza \
    --i-reads $ASVdata/PJNB_dada2_rep_set.qza \
    --o-classification $data_dir/taxonomy_classification.qza

[32mSaved FeatureData[Taxonomy] to: data/taxonomy_classification.qza[0m
[0m

In [13]:
#visualization
! qiime metadata tabulate \
    --m-input-file $data_dir/taxonomy_classification.qza \
    --o-visualization $data_dir/taxonomy_classification.qzv

[32mSaved Visualization to: data/taxonomy_classification.qzv[0m
[0m

In [3]:
# Display the tabulated taxonomy assignments inline.
taxonomy_table_qzv = f'{data_dir}/taxonomy_classification.qzv'
Visualization.load(taxonomy_table_qzv)

In [18]:
! qiime taxa barplot \
    --i-table $ASVdata/PJNB_dada2_table_.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --m-metadata-file $data_or/metadata.tsv \
    --o-visualization $data_dir/taxa-prefiltered-bar-plots.qzv

[32mSaved Visualization to: data/taxa-prefiltered-bar-plots.qzv[0m
[0m

In [8]:
# Display the bar plots of the unfiltered table inline.
prefiltered_barplot_qzv = f'{data_dir}/taxa-prefiltered-bar-plots.qzv'
Visualization.load(prefiltered_barplot_qzv)

In [21]:
#filter sequences and table for mitochondria
! qiime taxa filter-table \
    --i-table $ASVdata/PJNB_dada2_table_.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --p-exclude mitochondria,chloroplast \
    --o-filtered-table $data_dir/PJNB_dada2_table-filtered.qza

! qiime taxa filter-seqs \
    --i-sequences $ASVdata/PJNB_dada2_rep_set.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --p-exclude mitochondria \
    --o-filtered-sequences $data_dir/PJNB_dada2_rep_set-filtered.qza

[32mSaved FeatureTable[Frequency] to: data/PJNB_dada2_table-filtered.qza[0m
[0m[32mSaved FeatureData[Sequence] to: data//PJNB_dada2_rep_set-filtered.qza[0m
[0m

In [23]:
! qiime taxa barplot \
    --i-table $data_dir/PJNB_dada2_table-filtered.qza \
    --i-taxonomy $data_dir/taxonomy_classification.qza \
    --m-metadata-file $data_or/metadata.tsv \
    --o-visualization $data_dir/taxa-bar-plots-filtered.qzv

[32mSaved Visualization to: data/taxa-bar-plots-filtered.qzv[0m
[0m

In [9]:
# Display the bar plots of the filtered table inline.
filtered_barplot_qzv = f'{data_dir}/taxa-bar-plots-filtered.qzv'
Visualization.load(filtered_barplot_qzv)

**GTDB database**