# 1. Import Packages

In [1]:
# Importing all required packages at the start of the notebook
import os
import matplotlib.pyplot as plt
import pandas as pd
import qiime2 as q2
from qiime2 import Visualization
from qiime2 import Artifact
import seaborn as sns
from scipy.stats import shapiro, kruskal, f_oneway

# 2. Data Directory

In [2]:
# Output directory for every artifact written by this notebook
data_dir = "Project_data/Differential_Abundance"
# Create it (including missing parents) from Python instead of shelling out to
# `mkdir -p`, so the cell does not depend on a POSIX shell being available;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(data_dir, exist_ok=True)

In [3]:
# Project inputs read by the cells below:
# feature table (.qza), taxonomy (.qza) and sample metadata (.tsv)
input_table = "Project_data/Taxonomy/table_filtered.qza"
input_taxonomy = "Project_data/Taxonomy/taxonomy_pretrained.qza"
input_metadata = "Project_data/Metadata/updated_fungut_metadata.tsv"

# 3. Testing the normality of our data

In [19]:
# Load the feature table artifact, then view it as a pandas DataFrame
# (presumably samples as rows and ASVs as columns — TODO confirm)
feature_table = q2.Artifact.load(input_table)
data = feature_table.view(pd.DataFrame)

In [20]:
alpha = 0.05  # significance level applied to every Shapiro-Wilk test

# `DataFrame.items()` yields (column label, column Series) pairs, so each
# iteration tests one COLUMN of the table (one ASV, going by the loop names),
# not one row/sample.
# NOTE(review): assumes the DataFrame view has ASVs as columns — confirm.
results = {
    asv_name: shapiro(asv_values)[1]  # element 1 of the result is the p-value
    for asv_name, asv_values in data.items()
}

# convert test results into a DataFrame: one row per ASV, p-value in column 'p'
results_df = pd.Series(results, name='p', dtype=float).to_frame()

# add a descriptive test result: True where p > alpha,
# i.e. where the test does not reject normality at this level
results_df['is_normal'] = results_df['p'] > alpha

In [21]:
# Summing the boolean column counts the ASVs flagged as normal (True counts as 1)
n_normal_asvs = results_df['is_normal'].sum()
print('Number of ASVs with normal distribution:', n_normal_asvs)

Number of ASVs with normal distribution: 0


None of our ASVs is normally distributed (which was expected), so we will use ANCOM-BC for the differential abundance analysis.

# 3. Differential Abundance - IBD Status

## 3.1 Testing how filtering impacts the number of features we get

In [79]:
# First trying what we did in the course ("Test 1" in the comparison table below):
# filter the input table with --p-min-frequency 25 and --p-min-samples 4.
# The result, table_abund.qza, is the table used by the downstream analysis.
! qiime feature-table filter-features \
  --i-table $input_table \
  --p-min-frequency 25 \
  --p-min-samples 4 \
  --o-filtered-table $data_dir/table_abund.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Project_data/Differential_Abundance/table_abund.qza[0m
[0m[?25h

In [53]:
# "Test 2": only --p-min-samples 4; no minimum-frequency option is passed
! qiime feature-table filter-features \
  --i-table $input_table \
  --p-min-samples 4 \
  --o-filtered-table $data_dir/table_abund_test2.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Project_data/Differential_Abundance/table_abund_test2.qza[0m
[0m[?25h

In [54]:
# "Test 3": only --p-min-frequency 25; no minimum-samples option is passed
! qiime feature-table filter-features \
  --i-table $input_table \
  --p-min-frequency 25 \
  --o-filtered-table $data_dir/table_abund_test3.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Project_data/Differential_Abundance/table_abund_test3.qza[0m
[0m[?25h

In [55]:
# "Test 4": --p-min-frequency 25 with --p-min-samples lowered to 3
! qiime feature-table filter-features \
  --i-table $input_table \
  --p-min-frequency 25 \
  --p-min-samples 3 \
  --o-filtered-table $data_dir/table_abund_test4.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Project_data/Differential_Abundance/table_abund_test4.qza[0m
[0m[?25h

In [56]:
# "Test 5": --p-min-frequency 25 with --p-min-samples lowered to 2
! qiime feature-table filter-features \
  --i-table $input_table \
  --p-min-frequency 25 \
  --p-min-samples 2 \
  --o-filtered-table $data_dir/table_abund_test5.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Project_data/Differential_Abundance/table_abund_test5.qza[0m
[0m[?25h

In [80]:
# Count the features that survive each filtering variant produced above
filtered_table_paths = [
    f"{data_dir}/table_abund.qza",
    f"{data_dir}/table_abund_test2.qza",
    f"{data_dir}/table_abund_test3.qza",
    f"{data_dir}/table_abund_test4.qza",
    f"{data_dir}/table_abund_test5.qza",
]
dfs = [Artifact.load(path).view(pd.DataFrame) for path in filtered_table_paths]
(
    table_abund_test1,
    table_abund_test2,
    table_abund_test3,
    table_abund_test4,
    table_abund_test5,
) = dfs

# Row labels and the thresholds used for each table, in the same order as `dfs`
# (0 marks a variant where that option was not passed to filter-features)
tests = [f"Test {i}" for i in range(1, 6)]
min_freq = [25, 0, 25, 25, 25]
min_sample = [4, 4, 0, 3, 2]

# the number of columns of each table is taken as its number of features
rem_features = [df.shape[1] for df in dfs]

comparison_df = pd.DataFrame(
    {
        "Minimum frequency": min_freq,
        "Minimum sample": min_sample,
        "Number of features remaining": rem_features,
    },
    index=tests,
)

display(comparison_df)

Unnamed: 0,Minimum frequency,Minimum sample,Number of features remaining
Test 1,25,4,56
Test 2,0,4,59
Test 3,25,0,538
Test 4,25,3,74
Test 5,25,2,109


We have to use strict parameters for our differential abundance analysis to make sense, so we will still use a minimum frequency of 25 and a minimum of 4 samples, even though this makes us lose a substantial number of features.

# 4. Transforming ASVs to taxonomic units

In [81]:
# Collapse to species level (L7)
# Input is table_abund.qza (filtered with min frequency 25 / min samples 4);
# features are collapsed with the taxonomy in `input_taxonomy`, and the result
# (table_abund_L7.qza) is the table used by every ANCOM-BC run below.
! qiime taxa collapse \
  --i-table $data_dir/table_abund.qza \
  --i-taxonomy $input_taxonomy \
  --p-level 7 \
  --o-collapsed-table $data_dir/table_abund_L7.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Project_data/Differential_Abundance/table_abund_L7.qza[0m
[0m[?25h

# 5. Differential abundance analysis

## 5.1 IBD

In [82]:
# ANCOM-BC: effect of IBD
# Tests the species-level (L7) table against the `ibd_sample` metadata column;
# the differentials are saved for the bar plot and table cells that follow.
! qiime composition ancombc \
  --i-table $data_dir/table_abund_L7.qza \
  --m-metadata-file $input_metadata \
  --p-formula "ibd_sample" \
  --o-differentials $data_dir/ancombc_ibd_L7_diffs.qza

  import pkg_resources
[32mSaved FeatureData[DifferentialAbundance] to: Project_data/Differential_Abundance/ancombc_ibd_L7_diffs.qza[0m
[0m[?25h

In [83]:
# Barplot results: render the IBD differentials as a bar plot visualization (.qzv)
! qiime composition da-barplot \
  --i-data $data_dir/ancombc_ibd_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_ibd_L7_barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_ibd_L7_barplot.qzv[0m
[0m[?25h

In [84]:
# Tabulate the IBD differentials into a table visualization (.qzv)
! qiime composition tabulate \
  --i-data $data_dir/ancombc_ibd_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_ibd_L7_results.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_ibd_L7_results.qzv[0m
[0m[?25h

In [85]:
# Show the IBD bar plot; the path is built from data_dir (as when loading the
# filtered tables) instead of repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_ibd_L7_barplot.qzv")

In [86]:
# Show the IBD results table; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_ibd_L7_results.qzv")

## 5.2 Gluten Status

In [87]:
# The gluten column has "/" in its values, so write a copy of the metadata
# with an extra column (`gluten_clean`) in which "/" is replaced by "_".
# That column is the formula term of the gluten ANCOM-BC run.
meta = pd.read_csv(input_metadata, sep="\t")

# literal (non-regex) replacement; the original `gluten_sample` column is kept
meta["gluten_clean"] = meta["gluten_sample"].str.replace("/", "_", regex=False)

# write into the notebook's output directory rather than repeating its literal
meta_clean_path = f"{data_dir}/metadata_gluten_clean.tsv"
meta.to_csv(meta_clean_path, sep="\t", index=False)

meta_clean_path

'Project_data/Differential_Abundance/metadata_gluten_clean.tsv'

In [88]:
# ANCOM-BC: effect of Gluten
! qiime composition ancombc \
  --i-table $data_dir/table_abund_L7.qza \
  --m-metadata-file Project_data/Differential_Abundance/metadata_gluten_clean.tsv \
  --p-formula "gluten_clean" \
  --o-differentials $data_dir/ancombc_gluten_L7_diffs.qza

  import pkg_resources
[32mSaved FeatureData[DifferentialAbundance] to: Project_data/Differential_Abundance/ancombc_gluten_L7_diffs.qza[0m
[0m[?25h

In [89]:
# Barplot results: render the gluten differentials as a bar plot visualization (.qzv)
! qiime composition da-barplot \
  --i-data $data_dir/ancombc_gluten_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_gluten_L7_barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_gluten_L7_barplot.qzv[0m
[0m[?25h

In [90]:
# Tabulate the gluten differentials into a table visualization (.qzv)
! qiime composition tabulate \
  --i-data $data_dir/ancombc_gluten_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_gluten_L7_results.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_gluten_L7_results.qzv[0m
[0m[?25h

In [91]:
# Show the gluten bar plot; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_gluten_L7_barplot.qzv")

In [92]:
# Show the gluten results table; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_gluten_L7_results.qzv")

## 5.3 Diet

In [93]:
# Write a copy of the metadata with an extra column (`diet_clean`) in which
# "/" and spaces in the diet labels are replaced by "_".
# That column is the formula term of the diet ANCOM-BC run.
meta = pd.read_csv(input_metadata, sep="\t")

# literal (non-regex) replacements; the original `diet_type_sample` column is kept
meta["diet_clean"] = (
    meta["diet_type_sample"]
    .str.replace("/", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)

# write into the notebook's output directory rather than repeating its literal
clean_meta_path = f"{data_dir}/metadata_diet_clean.tsv"
meta.to_csv(clean_meta_path, sep="\t", index=False)

clean_meta_path

'Project_data/Differential_Abundance/metadata_diet_clean.tsv'

In [94]:
# ANCOM-BC: effect of Diet
! qiime composition ancombc \
  --i-table $data_dir/table_abund_L7.qza \
  --m-metadata-file Project_data/Differential_Abundance/metadata_diet_clean.tsv \
  --p-formula "diet_clean" \
  --o-differentials $data_dir/ancombc_diet_L7_diffs.qza

  import pkg_resources
[32mSaved FeatureData[DifferentialAbundance] to: Project_data/Differential_Abundance/ancombc_diet_L7_diffs.qza[0m
[0m[?25h

In [95]:
# Barplot results: render the diet differentials as a bar plot visualization (.qzv)
! qiime composition da-barplot \
  --i-data $data_dir/ancombc_diet_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_diet_L7_barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_diet_L7_barplot.qzv[0m
[0m[?25h

In [96]:
# Tabulate the diet differentials into a table visualization (.qzv)
! qiime composition tabulate \
  --i-data $data_dir/ancombc_diet_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_diet_L7_results.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_diet_L7_results.qzv[0m
[0m[?25h

In [97]:
# Show the diet bar plot; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_diet_L7_barplot.qzv")

In [98]:
# Show the diet results table; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_diet_L7_results.qzv")

## 5.4 Sex

In [99]:
# ANCOM-BC: effect of Sex
# Tests the species-level (L7) table against the `sex_sample` metadata column;
# the differentials are saved for the bar plot and table cells that follow.
! qiime composition ancombc \
  --i-table $data_dir/table_abund_L7.qza \
  --m-metadata-file $input_metadata \
  --p-formula "sex_sample" \
  --o-differentials $data_dir/ancombc_sex_L7_diffs.qza

  import pkg_resources
[32mSaved FeatureData[DifferentialAbundance] to: Project_data/Differential_Abundance/ancombc_sex_L7_diffs.qza[0m
[0m[?25h

In [100]:
# Barplot results: render the sex differentials as a bar plot visualization (.qzv)
! qiime composition da-barplot \
  --i-data $data_dir/ancombc_sex_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_sex_L7_barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_sex_L7_barplot.qzv[0m
[0m[?25h

In [101]:
# Tabulate the sex differentials into a table visualization (.qzv)
! qiime composition tabulate \
  --i-data $data_dir/ancombc_sex_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_sex_L7_results.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_sex_L7_results.qzv[0m
[0m[?25h

In [102]:
# Show the sex bar plot; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_sex_L7_barplot.qzv")

In [103]:
# Show the sex results table; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_sex_L7_results.qzv")

## 5.5 BMI

In [104]:
# ANCOM-BC: effect of BMI
# Tests the species-level (L7) table against the `bmi_category` metadata column;
# the differentials are saved for the bar plot and table cells that follow.
! qiime composition ancombc \
  --i-table $data_dir/table_abund_L7.qza \
  --m-metadata-file $input_metadata \
  --p-formula "bmi_category" \
  --o-differentials $data_dir/ancombc_bmi_L7_diffs.qza

  import pkg_resources
[32mSaved FeatureData[DifferentialAbundance] to: Project_data/Differential_Abundance/ancombc_bmi_L7_diffs.qza[0m
[0m[?25h

In [105]:
# Barplot results: render the BMI differentials as a bar plot visualization (.qzv)
! qiime composition da-barplot \
  --i-data $data_dir/ancombc_bmi_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_bmi_L7_barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_bmi_L7_barplot.qzv[0m
[0m[?25h

In [106]:
# Tabulate the BMI differentials into a table visualization (.qzv)
! qiime composition tabulate \
  --i-data $data_dir/ancombc_bmi_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_bmi_L7_results.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_bmi_L7_results.qzv[0m
[0m[?25h

In [107]:
# Show the BMI bar plot; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_bmi_L7_barplot.qzv")

In [108]:
# Show the BMI results table; the path is built from data_dir instead of
# repeating the directory literal
Visualization.load(f"{data_dir}/ancombc_bmi_L7_results.qzv")

## 5.6 Continent

In [109]:
# ANCOM-BC: effect of the continent
# Tests the species-level (L7) table against the `continent` metadata column;
# the differentials are saved for the bar plot and table cells that follow.
! qiime composition ancombc \
  --i-table $data_dir/table_abund_L7.qza \
  --m-metadata-file $input_metadata \
  --p-formula "continent" \
  --o-differentials $data_dir/ancombc_continent_L7_diffs.qza

  import pkg_resources
[32mSaved FeatureData[DifferentialAbundance] to: Project_data/Differential_Abundance/ancombc_continent_L7_diffs.qza[0m
[0m[?25h

In [110]:
# Barplot results: render the continent differentials as a bar plot visualization (.qzv)
! qiime composition da-barplot \
  --i-data $data_dir/ancombc_continent_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_continent_L7_barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_continent_L7_barplot.qzv[0m
[0m[?25h

In [111]:
# Tabulate the continent differentials into a table visualization (.qzv)
! qiime composition tabulate \
  --i-data $data_dir/ancombc_continent_L7_diffs.qza \
  --o-visualization $data_dir/ancombc_continent_L7_results.qzv

  import pkg_resources
[32mSaved Visualization to: Project_data/Differential_Abundance/ancombc_continent_L7_results.qzv[0m
[0m[?25h

In [6]:
# Show the continent bar plot; the path is built from data_dir instead of
# repeating the directory literal (requires the data_dir cell to have run)
Visualization.load(f"{data_dir}/ancombc_continent_L7_barplot.qzv")

In [7]:
# Show the continent results table; the path is built from data_dir instead of
# repeating the directory literal (requires the data_dir cell to have run)
Visualization.load(f"{data_dir}/ancombc_continent_L7_results.qzv")