# 2. Quality control and denoising
## Import Data & Packages

In [2]:
# 1 - Import all packages
import IPython
import pandas as pd
from qiime2 import Visualization
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [3]:
# 2 - Set working directory --> change this so it matches your path to the project folder
project_root = "/home/jovyan/MicrobiomeAnalysis_TummyTribe/"
os.chdir(project_root)

# Verify that your working directory is the overall project folder (.../MicrobiomeAnalysis_TummyTribe)
cwd_now = os.getcwd()
print("Current working directory:", cwd_now)

Current working directory: /home/jovyan/MicrobiomeAnalysis_TummyTribe


In [4]:
# 3 - Data directories (relative to the project folder set as working directory)
data_in = "data/raw"                   # input: demultiplexed sequences and sample metadata
data_out = "data/preprocessing"        # output: QIIME 2 artifacts (.qza) written by this notebook
results_dir = "results/preprocessing"  # output: QIIME 2 visualizations (.qzv)

# Make sure the output folders exist before any qiime command tries to save
# into them (a fresh checkout may not contain these empty directories).
for out_dir in (data_out, results_dir):
    os.makedirs(out_dir, exist_ok=True)

In [4]:
# 4 - Have a look at the sequencing data
! qiime tools peek $data_in/sequences-demux-paired.qza

[32mUUID[0m:        b4782ab7-550b-41f5-b906-ca2cda29ca9b
[32mType[0m:        SampleData[PairedEndSequencesWithQuality]
[32mData format[0m: SingleLanePerSamplePairedEndFastqDirFmt


## Quality Control

In [5]:
! qiime demux summarize \
    --i-data $data_in/sequences-demux-paired.qza \
    --o-visualization $results_dir/raw-QC.qzv

  import pkg_resources
[32mSaved Visualization to: results/preprocessing/raw-QC.qzv[0m
[0m[?25h

In [5]:
# Display the raw-read QC visualization inline
raw_qc_qzv = f"{results_dir}/raw-QC.qzv"
Visualization.load(raw_qc_qzv)

## Denoising - Amplicon Sequence Variants

In [7]:
! qiime dada2 denoise-paired !help

Usage: [94mqiime dada2 denoise-paired[0m [OPTIONS]

  This method denoises paired-end sequences, dereplicates them, and filters
  chimeras.

[1mInputs[0m:
  [94m[4m--i-demultiplexed-seqs[0m ARTIFACT [32mSampleData[PairedEndSequencesWithQuality][0m
                          The paired-end demultiplexed sequences to be
                          denoised.                                 [35m[required][0m
[1mParameters[0m:
  [94m[4m--p-trunc-len-f[0m INTEGER Position at which forward read sequences should be
                          truncated due to decrease in quality. This truncates
                          the 3' end of the of the input sequences, which will
                          be the bases that were sequenced in the last cycles.
                          Reads that are shorter than this value will be
                          discarded. After this parameter is applied there
                          must still be at least a 12 nucleotide overlap
                

In [8]:
! qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_in/sequences-demux-paired.qza \
    --p-trunc-len-f 140 \
    --p-trunc-len-r 140 \
    --p-n-threads 3 \
    --o-table $data_out/dada2_table.qza \
    --o-representative-sequences $data_out/dada2_rep_seq.qza \
    --o-denoising-stats $data_out/dada2_stats.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: data/preprocessing/dada2_table.qza[0m
[32mSaved FeatureData[Sequence] to: data/preprocessing/dada2_rep_seq.qza[0m
[32mSaved SampleData[DADA2Stats] to: data/preprocessing/dada2_stats.qza[0m
[0m[?25h

In [9]:
! qiime metadata tabulate \
    --m-input-file $data_out/dada2_stats.qza \
    --o-visualization $results_dir/dada2_stats.qzv

  import pkg_resources
[32mSaved Visualization to: results/preprocessing/dada2_stats.qzv[0m
[0m[?25h

In [10]:
# Display the denoising statistics table inline
stats_qzv = f"{results_dir}/dada2_stats.qzv"
Visualization.load(stats_qzv)

### Feature Table

In [13]:
! qiime feature-table summarize \
    --i-table $data_out/dada2_table.qza \
    --m-sample-metadata-file $data_in/metadata.tsv \
    --o-visualization $results_dir/dada2_table.qzv

  import pkg_resources
[32mSaved Visualization to: results/preprocessing/dada2_table.qzv[0m
[0m[?25h

In [12]:
# Display the feature-table summary inline
table_qzv = f"{results_dir}/dada2_table.qzv"
Visualization.load(table_qzv)

### Denoising with truncation length 150 (forward and reverse)

In [None]:
! qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_in/sequences-demux-paired.qza \
    --p-trunc-len-f 150 \
    --p-trunc-len-r 150 \
    --p-n-threads 3 \
    --o-table $data_out/dada2_table_150.qza \
    --o-representative-sequences $data_out/dada2_rep_seq_150.qza \
    --o-denoising-stats $data_out/dada2_stats_150.qza

In [None]:
! qiime metadata tabulate \
    --m-input-file $data_out/dada2_stats_150.qza \
    --o-visualization $results_dir/dada2_stats_150.qzv

In [None]:
# Display the trunc-150 denoising statistics inline
stats_150_qzv = f"{results_dir}/dada2_stats_150.qzv"
Visualization.load(stats_150_qzv)

In [None]:
! qiime feature-table summarize \
    --i-table $data_out/dada2_table_150.qza \
    --m-sample-metadata-file $data_in/metadata_150.tsv \
    --o-visualization $results_dir/dada2_table_150.qzv

In [None]:
# Display the trunc-150 feature-table summary inline
table_150_qzv = f"{results_dir}/dada2_table_150.qzv"
Visualization.load(table_150_qzv)

### Denoising with truncation length 160 (forward) / 130 (reverse)

In [None]:
! qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_in/sequences-demux-paired.qza \
    --p-trunc-len-f 160 \
    --p-trunc-len-r 130 \
    --p-n-threads 3 \
    --o-table $data_out/dada2_table_160_130.qza \
    --o-representative-sequences $data_out/dada2_rep_seq_160_130.qza \
    --o-denoising-stats $data_out/dada2_stats_160_130.qza

In [None]:
! qiime metadata tabulate \
    --m-input-file $data_out/dada2_stats_160_130.qza \
    --o-visualization $results_dir/dada2_stats_160_130.qzv

In [None]:
# Display the 160/130 denoising statistics inline
stats_160_130_qzv = f"{results_dir}/dada2_stats_160_130.qzv"
Visualization.load(stats_160_130_qzv)

In [None]:
! qiime feature-table summarize \
    --i-table $data_out/dada2_table_160_130.qza \
    --m-sample-metadata-file $data_in/metadata_160_130.tsv \
    --o-visualization $results_dir/dada2_table_160_130.qzv

In [None]:
# Display the 160/130 feature-table summary inline
table_160_130_qzv = f"{results_dir}/dada2_table_160_130.qzv"
Visualization.load(table_160_130_qzv)

### Denoising with truncation length 135 (forward and reverse)

In [None]:
! qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_in/sequences-demux-paired.qza \
    --p-trunc-len-f 135 \
    --p-trunc-len-r 135 \
    --p-n-threads 3 \
    --o-table $data_out/dada2_table_135.qza \
    --o-representative-sequences $data_out/dada2_rep_seq_135.qza \
    --o-denoising-stats $data_out/dada2_stats_135.qza

In [None]:
! qiime metadata tabulate \
    --m-input-file $data_out/dada2_stats_135.qza \
    --o-visualization $results_dir/dada2_stats_135.qzv

In [None]:
# Display the trunc-135 denoising statistics inline
stats_135_qzv = f"{results_dir}/dada2_stats_135.qzv"
Visualization.load(stats_135_qzv)

In [None]:
! qiime feature-table summarize \
    --i-table $data_out/dada2_table_135.qza \
    --m-sample-metadata-file $data_in/metadata_135.tsv \
    --o-visualization $results_dir/dada2_table_135.qzv

In [None]:
# Display the trunc-135 feature-table summary inline
table_135_qzv = f"{results_dir}/dada2_table_135.qzv"
Visualization.load(table_135_qzv)