# 3. Taxonomic Classification
## Import data & packages

In [1]:
# 1 - Import all packages
import IPython
import pandas as pd
import matplotlib.pyplot as plt
import os
import qiime2 as q2
from qiime2 import Visualization

%matplotlib inline

In [2]:
# 2 - Set working directory
os.chdir("/home/jovyan/MicrobiomeAnalysis_TummyTribe/")

# Verify that your wroking directory is the overall project folder (.../MicrobiomeAnalysis_TummyTribe)
print("Current working directory:", os.getcwd())

Current working directory: /home/jovyan/MicrobiomeAnalysis_TummyTribe


In [3]:
# 3 - Data directory for the raw data
data_dir = "data/raw"
processed_data_dir = "data/processed"

##  Set-up SILVA (ran, but not looked into it in detail yet if the generic approach fits for us)

Assign taxonomy using a SILVA v4 pre-trained classifier (as said in project description)
- https://forum.qiime2.org/t/processing-filtering-and-evaluating-the-silva-database-and-other-reference-sequence-data-with-rescript/15494## Import data & packages

In [4]:
# also takes a long time to run all these cells! Maybe 

! qiime rescript get-silva-data \
    --p-version '138.2' \
    --p-target 'SSURef_NR99' \
    --o-silva-sequences  $processed_data_dir/silva-138.2-ssu-nr99-rna-seqs.qza \
    --o-silva-taxonomy $processed_data_dir/silva-138.2-ssu-nr99-tax.qza

  import pkg_resources
[32mSaved FeatureData[RNASequence] to: data/processed/silva-138.2-ssu-nr99-rna-seqs.qza[0m
[32mSaved FeatureData[Taxonomy] to: data/processed/silva-138.2-ssu-nr99-tax.qza[0m
[0m[?25h

In [7]:
! qiime rescript reverse-transcribe \
    --i-rna-sequences $processed_data_dir/silva-138.2-ssu-nr99-rna-seqs.qza \
    --o-dna-sequences $processed_data_dir/silva-138.2-ssu-nr99-seqs.qza

  import pkg_resources
[32mSaved FeatureData[Sequence] to: data/processed/silva-138.2-ssu-nr99-seqs.qza[0m
[0m[?25h

In [14]:
! qiime rescript cull-seqs \
    --i-sequences $processed_data_dir/silva-138.2-ssu-nr99-seqs.qza \
    --o-clean-sequences $processed_data_dir/silva-138.2-ssu-nr99-seqs-cleaned.qza

  import pkg_resources
[32mSaved FeatureData[Sequence] to: data/processed/silva-138.2-ssu-nr99-seqs-cleaned.qza[0m
[0m[?25h

In [15]:
! qiime rescript filter-seqs-length-by-taxon \
    --i-sequences $processed_data_dir/silva-138.2-ssu-nr99-seqs-cleaned.qza \
    --i-taxonomy $processed_data_dir/silva-138.2-ssu-nr99-tax.qza \
    --p-labels Archaea Bacteria Eukaryota \
    --p-min-lens 900 1200 1400 \
    --o-filtered-seqs $processed_data_dir/silva-138.2-ssu-nr99-seqs-filt.qza \
    --o-discarded-seqs $processed_data_dir/silva-138.2-ssu-nr99-seqs-discard.qza 

  import pkg_resources
[32mSaved FeatureData[Sequence] to: data/processed/silva-138.2-ssu-nr99-seqs-filt.qza[0m
[32mSaved FeatureData[Sequence] to: data/processed/silva-138.2-ssu-nr99-seqs-discard.qza[0m
[0m[?25h

In [16]:
! qiime rescript dereplicate \
    --i-sequences $processed_data_dir/silva-138.2-ssu-nr99-seqs-filt.qza  \
    --i-taxa $processed_data_dir/silva-138.2-ssu-nr99-tax.qza \
    --p-mode 'uniq' \
    --o-dereplicated-sequences $processed_data_dir/silva-138.2-ssu-nr99-seqs-derep-uniq.qza \
    --o-dereplicated-taxa $processed_data_dir/silva-138.2-ssu-nr99-tax-derep-uniq.qza

  import pkg_resources
[32mSaved FeatureData[Sequence] to: data/processed/silva-138.2-ssu-nr99-seqs-derep-uniq.qza[0m
[32mSaved FeatureData[Taxonomy] to: data/processed/silva-138.2-ssu-nr99-tax-derep-uniq.qza[0m
[0m[?25h

In [20]:
! qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads  $processed_data_dir/silva-138.2-ssu-nr99-seqs-derep-uniq.qza \
  --i-reference-taxonomy $processed_data_dir/silva-138.2-ssu-nr99-tax-derep-uniq.qza \
  --p-classify--chunk-size 10 \
  --o-classifier $processed_data_dir/silva-138.2-ssu-nr99-classifier.qza \
  if [ ! $? -eq 0 ]; then echo "Command failed - most likely reason is not enough memory."; fi

/usr/bin/sh: -c: line 1: syntax error near unexpected token `then'
/usr/bin/sh: -c: line 1: ` qiime feature-classifier fit-classifier-naive-bayes    --i-reference-reads  data/processed/silva-138.2-ssu-nr99-seqs-derep-uniq.qza    --i-reference-taxonomy data/processed/silva-138.2-ssu-nr99-tax-derep-uniq.qza    --p-classify--chunk-size 10    --o-classifier data/processed/silva-138.2-ssu-nr99-classifier.qza    if [ ! $? -eq 0 ]; then echo "Command failed - most likely reason is not enough memory."; fi'


##  Assign Taxonomy

In [None]:
! qiime feature-classifier classify-sklearn \
    --i-classifier $processed_data_dir/silva-138.2-ssu-nr99-classifier.qza \
    --i-reads $processed_data_dir/dada2_rep_set.qza \
    --o-classification $processed_data_dir/taxonomy.qza

In [None]:
! qiime tools peek $processed_data_dir/taxonomy.qza

In [None]:
! qiime metadata tabulate \
    --m-input-file $processed_data_dir/taxonomy.qza \
    --o-visualization $processed_data_dir/taxonomy.qzv

In [None]:
Visualization.load(f"{processed_data_dir}/taxonomy.qzv")

In [None]:
#Left at this step in the notebook

# ! qiime taxa barplot \
#     --i-table $data_dir/table.qza \
#     --i-taxonomy $data_dir/taxonomy.qza \
#     --m-metadata-file $data_dir/sample-metadata.tsv \
#     --o-visualization $data_dir/taxa-bar-plots.qzv