# 2. Quality Control and Denoising
## Import data & packages

In [23]:
# 1 - Import all packages
import IPython
import pandas as pd
import matplotlib.pyplot as plt
import os
import qiime2 as q2
from qiime2 import Visualization

%matplotlib inline

In [11]:
# 2 - Move into the project root so the relative paths used below resolve
project_root = "/home/jovyan/MicrobiomeAnalysis_TummyTribe/"
os.chdir(project_root)

# Sanity check: the printed working directory should be the overall project folder (.../MicrobiomeAnalysis_TummyTribe)
print("Current working directory:", os.getcwd())

Current working directory: /home/jovyan/MicrobiomeAnalysis_TummyTribe


In [12]:
# 3 - Project-relative folders: raw inputs and processed outputs
data_dir, processed_data_dir = "data/raw", "data/processed"

In [13]:
# Load the tab-separated sample metadata; the first column becomes the index
metadata_df = pd.read_csv(
    f"{data_dir}/metadata.tsv",
    sep="\t",
    index_col=0,
)

In [14]:
# Preview the first five metadata rows
metadata_df.head(n=5)

Unnamed: 0_level_0,host_id,age_months,geo_location_name,delivery_mode,sex,diet_weaning,diet_milk,treatment_exposure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SRR8118533,E000823,4.0,Finland,vaginal,male,no,bd,False
SRR8118537,E000823,7.0,Finland,vaginal,male,yes,mixed,False
SRR8118564,E001958,4.0,Finland,vaginal,female,yes,bd,False
SRR8118650,E001958,7.0,Finland,vaginal,female,yes,mixed,False
SRR8118652,E001958,10.0,Finland,vaginal,female,yes,mixed,False


## Quality Control

In [20]:
# Print the UUID, semantic type and data format of the demultiplexed artifact
!qiime tools peek {data_dir}/sequences-demux-paired.qza

[32mUUID[0m:        b4782ab7-550b-41f5-b906-ca2cda29ca9b
[32mType[0m:        SampleData[PairedEndSequencesWithQuality]
[32mData format[0m: SingleLanePerSamplePairedEndFastqDirFmt


In [21]:
# Summarize the demultiplexed paired-end reads and save the result as a .qzv visualization
!qiime demux summarize \
    --i-data {data_dir}/sequences-demux-paired.qza \
    --o-visualization {data_dir}/sequences-demux-paired.qzv

  import pkg_resources
[32mSaved Visualization to: data/raw/sequences-demux-paired.qzv[0m
[0m[?25h

In [24]:
# Display the demux summary inline: the loaded Visualization is the cell's last expression
demux_summary_qzv = f"{data_dir}/sequences-demux-paired.qzv"
Visualization.load(demux_summary_qzv)

## Denoising

Parameters
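- `p-trunc-len-f` / `p-trunc-len-r` - `denoise-paired` takes separate truncation lengths for the forward and reverse reads (the single `p-trunc-len` option belongs to `denoise-single`); we will truncate both to 130 bp (reads shorter than this will be removed automatically)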
- `p-n-threads` - if we have more than 1 CPU available, we can specify the number here to make the processing faster
- `o-table` - this will be our ASV feature table
- `o-representative-sequences` - this will be a list of all the denoised features (DNA sequences)
- `o-denoising-stats` - this will be some stats from the denoising process

Information on function: https://docs.qiime2.org/2024.10/plugins/available/dada2/denoise-paired/

In [None]:
# Denoise the paired-end reads with DADA2, writing the feature table,
# representative sequences and denoising stats to the processed-data folder.
#
# `denoise-paired` has no `--p-trunc-len` option (that one belongs to
# `denoise-single`); it requires separate forward and reverse truncation
# lengths, so the intended 130 bp cut-off is applied to both reads.
# NOTE(review): confirm against the demux quality plots that 130 bp on each read
# still leaves enough overlap for the read pairs to merge - TODO confirm.
! qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_dir/sequences-demux-paired.qza \
    --p-trunc-len-f 130 \
    --p-trunc-len-r 130 \
    --p-n-threads 3 \
    --o-table $processed_data_dir/dada2_table.qza \
    --o-representative-sequences $processed_data_dir/dada2_rep_set.qza \
    --o-denoising-stats $processed_data_dir/dada2_stats.qza