# 2. Quality Control and Denoising
## Import data & packages

In [1]:
# 1 - Import all packages
import IPython
import pandas as pd
import matplotlib.pyplot as plt
import os
import qiime2 as q2
from qiime2 import Visualization

%matplotlib inline

In [12]:
# 2 - Set working directory
# The project root defaults to the path this notebook was written against.
# Set the TUMMYTRIBE_PROJECT_DIR environment variable to run the notebook
# from a checkout located somewhere else, without editing this cell.
project_dir = os.environ.get(
    "TUMMYTRIBE_PROJECT_DIR",
    "/home/jovyan/Project/MicrobiomeAnalysis_TummyTribe",
)
os.chdir(project_dir)

# Verify that your working directory is the overall project folder (.../MicrobiomeAnalysis_TummyTribe)
print("Current working directory:", os.getcwd())

Current working directory: /home/jovyan/Project/MicrobiomeAnalysis_TummyTribe


In [13]:
# 3 - Data directories (paths are relative to the working directory set above)
# data_dir holds the raw data, processed_data_dir the processed data.
data_dir, processed_data_dir = "data/raw", "data/processed"

## Quality Control

In [14]:
! qiime tools peek $data_dir/sequences-demux-paired.qza

Usage: [94mqiime tools peek[0m [OPTIONS] ARTIFACT/VISUALIZATION

  Display basic information about a QIIME 2 Artifact or Visualization,
  including its UUID and type.

[1mOptions[0m:
  [94m--tsv[0m / [94m--no-tsv[0m  Print as machine-readable tab-separated values.
  [94m--help[0m            Show this message and exit.

[33m                    There was a problem with the command:                     [0m
[31m[1m (1/1) Invalid value for 'ARTIFACT/VISUALIZATION': File 'data/raw/sequences-
  demux-paired.qza' does not exist.[0m


In [21]:
! qiime demux summarize \
    --i-data $data_dir/sequences-demux-paired.qza \
    --o-visualization $data_dir/sequences-demux-paired.qzv

  import pkg_resources
[32mSaved Visualization to: data/raw/sequences-demux-paired.qzv[0m
[0m[?25h

In [8]:
# Load the demux summary visualization so it is displayed as the cell output
Visualization.load(data_dir + "/sequences-demux-paired.qzv")

## Denoising and merging

Parameters
- `p-trunc-len-f` / `p-trunc-len-r` - we will truncate the forward reads to 175 bp and the reverse reads to 155 bp (sequences shorter than this will be removed automatically)
- `p-n-threads` - if we have more than 1 CPU available, we can specify the number here to make the processing faster
- `o-table` - this will be our ASVs feature table
- `o-representative-sequences` - this will be a list of all the denoised features (DNA sequences)
- `o-denoising-stats` - this will be some stats from the denoising process

Information on parameters and function: https://docs.qiime2.org/2024.10/plugins/available/dada2/denoise-paired/
Example tutorial of paired read analysis: https://docs.qiime2.org/2024.10/tutorials/atacama-soils/

In [15]:
# this cell takes a loooong time to run. Time for coffee? 
# Or a cool video? https://www.youtube.com/watch?v=-z4gNr7mN3U
# Or a pull-up!
# Nvm that last one, too difficult

! qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_dir/sequences-demux-paired.qza \
    --p-trunc-len-f 175 \
    --p-trunc-len-r 155 \
    --p-n-threads 3 \
    --o-table $processed_data_dir/dada2_table.qza \
    --o-representative-sequences $processed_data_dir/dada2_rep_set.qza \
    --o-denoising-stats $processed_data_dir/dada2_stats.qza
    --verbose

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: data/processed/dada2_table.qza[0m
[32mSaved FeatureData[Sequence] to: data/processed/dada2_rep_set.qza[0m
[32mSaved SampleData[DADA2Stats] to: data/processed/dada2_stats.qza[0m
[0m[?25h

In [16]:
! qiime metadata tabulate \
    --m-input-file $processed_data_dir/dada2_stats.qza \
    --o-visualization $processed_data_dir/dada2_stats.qzv

  import pkg_resources
[32mSaved Visualization to: data/processed/dada2_stats.qzv[0m
[0m[?25h

In [17]:
# Load the denoising stats visualization so it is displayed as the cell output
Visualization.load(processed_data_dir + "/dada2_stats.qzv")

In [18]:
! qiime feature-table summarize \
    --i-table $processed_data_dir/dada2_table.qza \
    --m-sample-metadata-file $data_dir/metadata.tsv \
    --o-visualization $processed_data_dir/dada2_table.qzv

  import pkg_resources
[32mSaved Visualization to: data/processed/dada2_table.qzv[0m
[0m[?25h

In [19]:
# Load the feature table summary visualization so it is displayed as the cell output
Visualization.load(processed_data_dir + "/dada2_table.qzv")