# Diversity Analysis

In this notebook we'll use the merged table constructed in merged_table.ipynb to investigate alpha and beta diversity of samples in the meta-analysis, using QIIME2 and SkBio. 
_______


In [None]:
from utils import *
import pandas as pd
import qiime2 as q2
from plotnine import *
import skbio
import scipy.stats
import numpy as np
import os

%matplotlib inline

## Merged Table
First, collect the merged table with all reads

In [None]:
# Load the merged read table produced upstream; the first CSV column is used as the row index.
merged_table = pd.read_csv('../data/merged_table.csv', index_col = 0)
# Bare name as last expression so the notebook renders the frame for inspection.
merged_table

## Convert to QIIME2 Artifact
We'll pivot this table, and save it as a QIIME2 artifact, so we can use the built-in diversity functions. 

In [None]:
# Pivot the long-format table into a samples x taxa matrix of read counts.
# aggfunc='sum' replaces pivot_table's default (mean): if a sample/taxon pair occurs on
# more than one row, the reads must add up rather than be averaged. Taxa that are
# absent from a sample become 0.
abundance_df = pd.pivot_table(merged_table, index = 'sample_id',
                              columns = 'full_taxonomy', values = 'reads',
                              aggfunc = 'sum').fillna(0)

# Import the matrix as a QIIME2 FeatureTable[Frequency] artifact and save it for the
# command-line steps. The DataFrame keeps its own name (abundance_df) so that
# `abundance_table` only ever refers to the QIIME2 artifact.
abundance_table = q2.Artifact.import_data("FeatureTable[Frequency]", abundance_df)
abundance_table.save('../data/abundance_table.qza')

## Metadata
We'll also need a merged metadata table, constructed from study specific metadata. We can get this from the merged table loaded above.

In [None]:
# Columns of the merged table that describe a sample (as opposed to a taxon/read count).
metadata_columns = ['sample_id','condition','region', 'country','hypervariable_region',
                    'sequencing','study','age','sex', 'disease','URT']

# Build the per-sample metadata as a new frame in one chain (no in-place edits on a
# slice of merged_table, so the cell is idempotent and raises no SettingWithCopyWarning):
#   - rename the id column to the 'sample-id' header used for the QIIME2 metadata file
#   - drop duplicates so there is one row per sample
#   - index by sample id
metadata = (merged_table[metadata_columns]
            .rename(columns = {'sample_id':'sample-id'})
            .drop_duplicates()
            .set_index('sample-id'))

# A sample id that survives de-duplication more than once has conflicting metadata
# values; fail here with a clear message instead of writing an ambiguous metadata file.
conflicting_ids = metadata.index[metadata.index.duplicated()].unique()
if len(conflicting_ids) > 0:
    raise ValueError('Conflicting metadata rows for sample(s): '
                     + ', '.join(map(str, conflicting_ids)))

metadata.to_csv('../data/merged_metadata.tsv',sep = '\t')

## Rarefy Data
First, we need to rarefy our data. We can use the QIIME2 alpha-rarefaction function to visualize rarefaction, then rarefy to an appropriate depth.

In [None]:
# Build the QIIME2 alpha-rarefaction visualization (up to a maximum depth of 10000)
# from the saved abundance table and the merged metadata file; the .qzv output is
# inspected to choose the sampling depth used for rarefaction.
!qiime diversity alpha-rarefaction \
  --i-table ../data/abundance_table.qza \
  --p-max-depth 10000 \
  --m-metadata-file ../data/merged_metadata.tsv \
  --o-visualization ../visualizations/alpha-rarefaction.qzv

In [None]:
# Rarefy the abundance table to a sampling depth of 2000 and write the rarefied table
# used by all diversity calculations.
# NOTE(review): samples below the sampling depth are presumably dropped by QIIME2, and
# the subsampling is presumably random (no seed is passed here) — confirm.
!qiime feature-table rarefy \
  --i-table ../data/abundance_table.qza \
  --p-sampling-depth 2000 \
  --o-rarefied-table ../results/rarefied_table.qza

# Alpha Diversity
Now we'll examine alpha diversity using Shannon entropy, and richness using the Chao1 index.

In [None]:
# Compute the 'shannon' alpha-diversity metric on the rarefied table.
!qiime diversity alpha \
  --i-table ../results/rarefied_table.qza \
  --p-metric shannon \
  --o-alpha-diversity ../results/shannon.qza
# Export the artifact to a plain directory; ../results/shannon/alpha-diversity.tsv is
# read back with pandas when the alpha-diversity results are merged.
!qiime tools export \
  --input-path ../results/shannon.qza \
  --output-path ../results/shannon

In [None]:
# Compute the 'chao1' alpha-diversity metric on the rarefied table.
!qiime diversity alpha \
  --i-table ../results/rarefied_table.qza \
  --p-metric chao1 \
  --o-alpha-diversity ../results/chao1.qza
# Export the artifact to a plain directory; ../results/chao1/alpha-diversity.tsv is
# read back with pandas when the alpha-diversity results are merged.
!qiime tools export \
  --input-path ../results/chao1.qza \
  --output-path ../results/chao1

## Add Metadata 
Merge the results, and add metadata to the alpha diversity results.

In [None]:
# Load the exported Shannon and Chao1 tables, each indexed by its first column.
shannon_df = pd.read_csv('../results/shannon/alpha-diversity.tsv', sep = '\t', index_col = 0)
chao1_df = pd.read_csv('../results/chao1/alpha-diversity.tsv', sep = '\t', index_col = 0)

# Put both metrics side by side, then attach the metadata columns.
# join='inner' keeps only samples present in both the diversity results and the
# metadata, which drops samples that were filtered out earlier.
diversity_df = pd.concat([shannon_df, chao1_df], axis = 1)
alpha = pd.concat([diversity_df, metadata], axis = 1, join = 'inner')

# Split by upper-respiratory-tract site: nasopharynx (NP) and oropharynx (OP).
alpha_NP = alpha.loc[alpha['URT'] == 'NP']
alpha_OP = alpha.loc[alpha['URT'] == 'OP']
alpha

## Color Dict
Define a color dictionary for disease types

In [None]:
# Disease label -> hex colour, built from ordered (label, colour) pairs.
# Control samples are plotted under the 'Healthy' label.
color_dict = dict([
    ('Asthma', '#a6cee3'),
    ('COVID-19', '#1f78b4'),
    ('Influenza', '#b2df8a'),
    ('Pneumonia', '#33a02c'),
    ('RSV', '#fb9a99'),
    ('RTI', '#e31a1c'),
    ('Resp. Allergies', '#fdbf6f'),
    ('Rhinosinusitis', '#ff7f00'),
    ('COPD', '#cab2d6'),
    ('Tonsillitis', '#6a3d9a'),
    ('Healthy', '#a2acbb'),
])

## Visualize Results
Make box plots of Shannon entropy and the Chao1 index for both nasopharynx and oropharynx samples.

In [None]:
# Box plot of Shannon entropy per study for the nasopharynx (NP) samples,
# with boxes filled by case/control condition.
shannonNP_plt = (ggplot(
    alpha_NP, aes(x = 'study', y = 'shannon_entropy'))
    # keep studies on the x axis in the order they first appear in alpha_NP
    +scale_x_discrete(limits = alpha_NP['study'].unique()) 
    # Shaded background bands over hard-coded x ranges (positions on the discrete axis).
    # The fills reuse the color_dict hex codes, in order: Asthma, COVID-19, Influenza,
    # Pneumonia, RSV, RTI, Resp. Allergies, Rhinosinusitis.
    # NOTE(review): the band edges presumably group studies by disease and assume the
    # current study order and count — confirm whenever the study list changes.
    +annotate(geom_rect, xmin=0, xmax=3.5, ymin=0, ymax=float('inf'),
              fill = '#a6cee3', alpha=0.3)
    +annotate(geom_rect, xmin=3.5, xmax=6.5, ymin=0, ymax=float('inf'),
              fill = '#1f78b4', alpha=0.3)
    +annotate(geom_rect, xmin=6.5, xmax=9.5, ymin=0, ymax=float('inf'),
              fill = '#b2df8a', alpha=0.3)
    +annotate(geom_rect, xmin=9.5, xmax=10.5, ymin=0, ymax=float('inf'),
              fill = '#33a02c', alpha=0.3)
    +annotate(geom_rect, xmin=10.5, xmax=13.5, ymin=0, ymax=float('inf'),
              fill = '#fb9a99', alpha=0.3)
    +annotate(geom_rect, xmin=13.5, xmax=16.5, ymin=0, ymax=float('inf'),
              fill = '#e31a1c', alpha=0.3)
    +annotate(geom_rect, xmin=16.5, xmax=17.5, ymin=0, ymax=float('inf'),
              fill = '#fdbf6f', alpha=0.3)
    +annotate(geom_rect, xmin=17.5, xmax=20.5, ymin=0, ymax=float('inf'),
              fill = '#ff7f00', alpha=0.3)
    # fixed y range of 0-7 for Shannon entropy
    +ylim(0,7)
    # boxes are drawn after the bands so they sit on top of the shading
    +geom_boxplot(aes(fill = 'condition'),width = 1, alpha = 0.8)
    +theme(axis_text_x=element_text(rotation=40, hjust=1))
    # NOTE(review): assumes 'condition' has exactly two levels that map, in order,
    # to the 'Case' and 'Control' labels — confirm.
    +scale_fill_manual(values = ['#c3553b','#3e8093'], labels = ['Case','Control'])
    +labs(x = 'Study', y = 'Shannon Entropy', fill = 'Status')
    # large text, white panel, light grid, no legend title, 16x12 figure
    +theme(text = element_text(size=30),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_title=element_blank(),
                                legend_position='right',
                                figure_size=(16, 12)))
shannonNP_plt

In [None]:
# Box plot of the Chao1 index per study for the nasopharynx (NP) samples,
# with boxes filled by case/control condition. No y limit is set for this plot.
chaoNP_plt = (ggplot(
    alpha_NP, aes(x = 'study', y = 'chao1'))
    # keep studies on the x axis in the order they first appear in alpha_NP
    +scale_x_discrete(limits = alpha_NP['study'].unique()) 
    # Shaded background bands over hard-coded x ranges (positions on the discrete axis).
    # The fills reuse the color_dict hex codes, in order: Asthma, COVID-19, Influenza,
    # Pneumonia, RSV, RTI, Resp. Allergies, Rhinosinusitis.
    # NOTE(review): the band edges presumably group studies by disease and assume the
    # current study order and count — confirm whenever the study list changes.
    +annotate(geom_rect, xmin=0, xmax=3.5, ymin=0, ymax=float('inf'),
              fill = '#a6cee3', alpha=0.3)
    +annotate(geom_rect, xmin=3.5, xmax=6.5, ymin=0, ymax=float('inf'),
              fill = '#1f78b4', alpha=0.3)
    +annotate(geom_rect, xmin=6.5, xmax=9.5, ymin=0, ymax=float('inf'),
              fill = '#b2df8a', alpha=0.3)
    +annotate(geom_rect, xmin=9.5, xmax=10.5, ymin=0, ymax=float('inf'),
              fill = '#33a02c', alpha=0.3)
    +annotate(geom_rect, xmin=10.5, xmax=13.5, ymin=0, ymax=float('inf'),
              fill = '#fb9a99', alpha=0.3)
    +annotate(geom_rect, xmin=13.5, xmax=16.5, ymin=0, ymax=float('inf'),
              fill = '#e31a1c', alpha=0.3)
    +annotate(geom_rect, xmin=16.5, xmax=17.5, ymin=0, ymax=float('inf'),
              fill = '#fdbf6f', alpha=0.3)
    +annotate(geom_rect, xmin=17.5, xmax=20.5, ymin=0, ymax=float('inf'),
              fill = '#ff7f00', alpha=0.3)

    # boxes are drawn after the bands so they sit on top of the shading
    +geom_boxplot(aes(fill = 'condition'),width = 1, alpha = 0.8)
    +theme(axis_text_x=element_text(rotation=40, hjust=1))
    # NOTE(review): assumes 'condition' has exactly two levels that map, in order,
    # to the 'Case' and 'Control' labels — confirm.
    +scale_fill_manual(values = ['#c3553b','#3e8093'], labels = ['Case','Control'])
    +labs(x = 'Study', y = 'Chao1 Index', fill = 'Status')
    # large text, white panel, light grid, no legend title, 16x12 figure
    +theme(text = element_text(size=30),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_title=element_blank(),
                                legend_position='right',
                                figure_size=(16, 12)))
chaoNP_plt

In [None]:
# Box plot of Shannon entropy per study for the oropharynx (OP) samples,
# with boxes filled by case/control condition.
shannonOP_plt = (ggplot(
    alpha_OP, aes(x = 'study', y = 'shannon_entropy'))
    # keep studies on the x axis in the order they first appear in alpha_OP
    +scale_x_discrete(limits = alpha_OP['study'].unique()) 
    # Shaded background bands over hard-coded x ranges (positions on the discrete axis).
    # The fills reuse the color_dict hex codes, in order: Asthma, COPD, Influenza,
    # Pneumonia, RTI, Resp. Allergies, Tonsillitis.
    # NOTE(review): the band edges presumably group studies by disease and assume the
    # current study order and count — confirm whenever the study list changes.
    +annotate(geom_rect, xmin=0, xmax=1.5, ymin=0, ymax=float('inf'),
              fill = '#a6cee3', alpha=0.3)
    +annotate(geom_rect, xmin=1.5, xmax=2.5, ymin=0, ymax=float('inf'),
              fill = '#cab2d6', alpha=0.3)
    +annotate(geom_rect, xmin=2.5, xmax=3.5, ymin=0, ymax=float('inf'),
              fill = '#b2df8a', alpha=0.3)
    +annotate(geom_rect, xmin=3.5, xmax=6.5, ymin=0, ymax=float('inf'),
              fill = '#33a02c', alpha=0.3)
    +annotate(geom_rect, xmin=6.5, xmax=7.5, ymin=0, ymax=float('inf'),
              fill = '#e31a1c', alpha=0.3)
    +annotate(geom_rect, xmin=7.5, xmax=8.5, ymin=0, ymax=float('inf'),
              fill = '#fdbf6f', alpha=0.3)
    +annotate(geom_rect, xmin=8.5, xmax=10.5, ymin=0, ymax=float('inf'),
              fill = '#6a3d9a', alpha=0.3)
    # boxes are drawn after the bands so they sit on top of the shading
    +geom_boxplot(aes(fill = 'condition'), width = 0.66, alpha = 0.8)
    # fixed y range of 0-7 for Shannon entropy
    +ylim(0,7)
    +theme(axis_text_x=element_text(rotation=40, hjust=1))
    # NOTE(review): assumes 'condition' has exactly two levels that map, in order,
    # to the 'Case' and 'Control' labels — confirm.
    +scale_fill_manual(values = ['#c3553b','#3e8093'], labels = ['Case','Control'])
    +labs(x = 'Study', y = 'Shannon Entropy', fill = 'Status')
    # large text, white panel, light grid, no legend title, 16x12 figure
    +theme(text = element_text(size=30),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_title=element_blank(),
                                legend_position='right',
                                figure_size=(16, 12)))
shannonOP_plt

In [None]:
# Box plot of the Chao1 index per study for the oropharynx (OP) samples,
# with boxes filled by case/control condition. No y limit is set for this plot.
chaoOP_plt = (ggplot(
    alpha_OP, aes(x = 'study', y = 'chao1'))
    # keep studies on the x axis in the order they first appear in alpha_OP
    +scale_x_discrete(limits = alpha_OP['study'].unique()) 
    # Shaded background bands over hard-coded x ranges (positions on the discrete axis).
    # The fills reuse the color_dict hex codes, in order: Asthma, COPD, Influenza,
    # Pneumonia, RTI, Resp. Allergies, Tonsillitis.
    # NOTE(review): the band edges presumably group studies by disease and assume the
    # current study order and count — confirm whenever the study list changes.
    +annotate(geom_rect, xmin=0, xmax=1.5, ymin=0, ymax=float('inf'),
              fill = '#a6cee3', alpha=0.3)
    +annotate(geom_rect, xmin=1.5, xmax=2.5, ymin=0, ymax=float('inf'),
              fill = '#cab2d6', alpha=0.3)
    +annotate(geom_rect, xmin=2.5, xmax=3.5, ymin=0, ymax=float('inf'),
              fill = '#b2df8a', alpha=0.3)
    +annotate(geom_rect, xmin=3.5, xmax=6.5, ymin=0, ymax=float('inf'),
              fill = '#33a02c', alpha=0.3)
    +annotate(geom_rect, xmin=6.5, xmax=7.5, ymin=0, ymax=float('inf'),
              fill = '#e31a1c', alpha=0.3)
    +annotate(geom_rect, xmin=7.5, xmax=8.5, ymin=0, ymax=float('inf'),
              fill = '#fdbf6f', alpha=0.3)
    +annotate(geom_rect, xmin=8.5, xmax=10.5, ymin=0, ymax=float('inf'),
              fill = '#6a3d9a', alpha=0.3)
    # boxes are drawn after the bands so they sit on top of the shading
    +geom_boxplot(aes(fill = 'condition'),width = 0.66, alpha = 0.8)
    +theme(axis_text_x=element_text(rotation=40, hjust=1))
    # NOTE(review): assumes 'condition' has exactly two levels that map, in order,
    # to the 'Case' and 'Control' labels — confirm.
    +scale_fill_manual(values = ['#c3553b','#3e8093'], labels = ['Case','Control'])
    +labs(x = 'Study', y = 'Chao1 Index', fill = 'Status')
    # large text, white panel, light grid, no legend title, 16x12 figure
    +theme(text = element_text(size=30),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_title=element_blank(),
                                legend_position='right',
                                figure_size=(16, 12)))
chaoOP_plt

In [None]:
# Write each alpha-diversity figure to an SVG file under ../visualizations.
alpha_figures = [
    (shannonNP_plt, '../visualizations/shannonNP.svg'),
    (chaoNP_plt, '../visualizations/chaoNP.svg'),
    (shannonOP_plt, '../visualizations/shannonOP.svg'),
    (chaoOP_plt, '../visualizations/chaoOP.svg'),
]
for alpha_figure, svg_path in alpha_figures:
    ggsave(alpha_figure, svg_path, dpi = 300, format = 'svg')

## Calculate Significance
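Using a one-way ANOVA (`scipy.stats.f_oneway`; with two groups this is equivalent to an equal-variance two-sample t-test), test the association between diversity and case/control condition within each study.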

In [None]:
# For every study, test whether Shannon entropy differs between cases and controls.
for study in alpha['study'].unique():
    # Select rows by exact equality. str.contains() treats the study name as a regular
    # expression and also matches substrings, so a study whose name is contained in
    # another study's name would silently pool samples from both.
    in_study = alpha['study'] == study
    case = alpha.loc[in_study & (alpha['condition'] == 'case'), 'shannon_entropy']
    control = alpha.loc[in_study & (alpha['condition'] == 'control'), 'shannon_entropy']
    # One-way ANOVA across the two groups; element [1] of the result is the p-value.
    pval = scipy.stats.f_oneway(case, control)[1]
    print('{}: {}'.format(study, pval))

In [None]:
# For every study, test whether the Chao1 index differs between cases and controls.
for study in alpha['study'].unique():
    # Select rows by exact equality. str.contains() treats the study name as a regular
    # expression and also matches substrings, so a study whose name is contained in
    # another study's name would silently pool samples from both.
    in_study = alpha['study'] == study
    case = alpha.loc[in_study & (alpha['condition'] == 'case'), 'chao1']
    control = alpha.loc[in_study & (alpha['condition'] == 'control'), 'chao1']
    # One-way ANOVA across the two groups; element [1] of the result is the p-value.
    pval = scipy.stats.f_oneway(case, control)[1]
    print('{}: {}'.format(study, pval))

# Beta Diversity
Now we'll look at beta diversity among samples, by calculating Bray-Curtis dissimilarity in QIIME2

In [None]:
# Compute the 'braycurtis' beta-diversity distance matrix on the rarefied table.
!qiime diversity beta \
    --i-table ../results/rarefied_table.qza \
    --p-metric braycurtis \
    --o-distance-matrix ../results/braycurtis.qza
# Export the artifact to a plain directory; ../results/braycurtis/distance-matrix.tsv
# is read back with pandas for the PCoA and PERMANOVA steps.
!qiime tools export \
  --input-path ../results/braycurtis.qza \
  --output-path ../results/braycurtis

## Calculate PCoA
We'll read in our dissimilarity matrix, and conduct a PCoA in SkBio 

In [None]:
# read the dissimilarity matrix exported by QIIME2; the first column is used as the
# index (presumably the sample ids — the next cell copies this index onto the PCoA samples)
dm = pd.read_csv('../results/braycurtis/distance-matrix.tsv', sep = '\t',index_col = 0)

# conduct PCoA on the dissimilarity matrix with scikit-bio
# NOTE(review): a pandas DataFrame is passed directly; presumably scikit-bio converts
# it to a DistanceMatrix internally — confirm for the installed version.
od_res = skbio.stats.ordination.pcoa(dm)

# examine proportion explained by each principal coordinate
od_res.proportion_explained

## Add Metadata
Now we can merge our PCoA results with corresponding metadata

In [None]:
# Label the PCoA sample coordinates with the sample ids from the distance matrix.
od_res.samples.index = dm.index

# Attach the metadata columns; join='inner' keeps only samples present in both frames.
pcoa_results = pd.concat([od_res.samples, metadata], axis = 1, join = 'inner')

# Control samples are relabelled as 'Healthy' in the disease column.
is_control = pcoa_results['condition'] == 'control'
pcoa_results.loc[is_control, 'disease'] = 'Healthy'


## Plot Beta Diversity
Create plots for beta diversity, encoding metadata values as colors. 

In [None]:
# Axis labels carry the share of variation explained by each axis. They are read from
# the PCoA result instead of being typed in by hand, so they stay correct when the
# rarefied table (and therefore the ordination) changes between runs.
pc1_label = 'PC1 ({:.2%})'.format(od_res.proportion_explained['PC1'])
pc2_label = 'PC2 ({:.2%})'.format(od_res.proportion_explained['PC2'])

# PCoA scatter plot (PC1 vs PC2) coloured by case/control condition.
plt_condition = (ggplot(
    pcoa_results, aes(x = 'PC1', y = 'PC2', color = 'condition'))
    +geom_point(size = 3)
    +labs(x = pc1_label, y = pc2_label, color = 'Condition')
    # white panel, light grid, 8x6 figure
    +theme(text = element_text(size=20),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_position='right',
                                figure_size=(8, 6)))
plt_condition

In [None]:
# Axis labels carry the share of variation explained by each axis. They are read from
# the PCoA result instead of being typed in by hand, so they stay correct when the
# rarefied table (and therefore the ordination) changes between runs.
pc1_label = 'PC1 ({:.2%})'.format(od_res.proportion_explained['PC1'])
pc2_label = 'PC2 ({:.2%})'.format(od_res.proportion_explained['PC2'])

# PCoA scatter plot (PC1 vs PC2) coloured by the 'URT' column.
plt_URT = (ggplot(
    pcoa_results, aes(x = 'PC1', y = 'PC2', color = 'URT'))
    +geom_point(size = 3)
    +labs(x = pc1_label, y = pc2_label, color = 'URT Area')
    # white panel, light grid, 8x6 figure
    +theme(text = element_text(size=20),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_position='right',
                                figure_size=(8, 6)))
plt_URT

In [None]:
# Axis labels carry the share of variation explained by each axis. They are read from
# the PCoA result instead of being typed in by hand, so they stay correct when the
# rarefied table (and therefore the ordination) changes between runs.
pc1_label = 'PC1 ({:.2%})'.format(od_res.proportion_explained['PC1'])
pc2_label = 'PC2 ({:.2%})'.format(od_res.proportion_explained['PC2'])

# PCoA scatter plot (PC1 vs PC2) coloured by the 'region' column.
plt_region = (ggplot(
    pcoa_results, aes(x = 'PC1', y = 'PC2', color = 'region'))
    +geom_point(size = 3)
    +labs(x = pc1_label, y = pc2_label, color = 'Geographic Region')
    # white panel, light grid, 8x6 figure
    +theme(text = element_text(size=20),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_position='right',
                                figure_size=(8, 6)))
plt_region

In [None]:
# Axis labels carry the share of variation explained by each axis. They are read from
# the PCoA result instead of being typed in by hand, so they stay correct when the
# rarefied table (and therefore the ordination) changes between runs.
pc1_label = 'PC1 ({:.2%})'.format(od_res.proportion_explained['PC1'])
pc2_label = 'PC2 ({:.2%})'.format(od_res.proportion_explained['PC2'])

# PCoA scatter plot (PC1 vs PC2) coloured by disease, using the fixed
# disease -> colour mapping in color_dict (controls appear as 'Healthy').
plt_disease = (ggplot(
    pcoa_results, aes(x = 'PC1', y = 'PC2', color = 'disease'))
    +geom_point(size = 3)
    +labs(x = pc1_label, y = pc2_label, color = 'Disease')
    +scale_color_manual(values = color_dict)
    # white panel, light grid, 8x6 figure
    +theme(text = element_text(size=20),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_position='right',
                                figure_size=(8, 6)))
plt_disease

In [None]:
# Axis labels carry the share of variation explained by each axis. They are read from
# the PCoA result instead of being typed in by hand, so they stay correct when the
# rarefied table (and therefore the ordination) changes between runs.
pc1_label = 'PC1 ({:.2%})'.format(od_res.proportion_explained['PC1'])
pc2_label = 'PC2 ({:.2%})'.format(od_res.proportion_explained['PC2'])

# PCoA scatter plot (PC1 vs PC2) coloured by the 'sequencing' column.
plt_sequence = (ggplot(
    pcoa_results, aes(x = 'PC1', y = 'PC2', color = 'sequencing'))
    +geom_point(size = 3)
    +labs(x = pc1_label, y = pc2_label, color = 'Sequencing Method')
    # white panel, light grid, 8x6 figure
    +theme(text = element_text(size=20),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_position='right',
                                figure_size=(8, 6)))
plt_sequence

In [None]:
# Axis labels carry the share of variation explained by each axis. They are read from
# the PCoA result instead of being typed in by hand, so they stay correct when the
# rarefied table (and therefore the ordination) changes between runs.
pc1_label = 'PC1 ({:.2%})'.format(od_res.proportion_explained['PC1'])
pc2_label = 'PC2 ({:.2%})'.format(od_res.proportion_explained['PC2'])

# PCoA scatter plot (PC1 vs PC2) coloured by 16S hypervariable region.
# The metadata column is named 'hypervariable_region'; pcoa_results has no '16S'
# column, so mapping colour to '16S' cannot be resolved.
plt_16S = (ggplot(
    pcoa_results, aes(x = 'PC1', y = 'PC2', color = 'hypervariable_region'))
    +geom_point(size = 3)
    +labs(x = pc1_label, y = pc2_label, color = 'Hypervariable Region')
    # white panel, light grid, 8x6 figure
    +theme(text = element_text(size=20),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_position='right',
                                figure_size=(8, 6)))
plt_16S

In [None]:
# Write each beta-diversity figure to an SVG file under ../visualizations.
beta_figures = [
    (plt_condition, '../visualizations/beta_condition.svg'),
    (plt_URT, '../visualizations/beta_urt.svg'),
    (plt_region, '../visualizations/beta_region.svg'),
    (plt_disease, '../visualizations/beta_disease.svg'),
    (plt_sequence, '../visualizations/beta_sequence.svg'),
    (plt_16S, '../visualizations/beta_16S.svg'),
]
for beta_figure, svg_path in beta_figures:
    ggsave(beta_figure, svg_path, dpi = 300, format = 'svg')

## PERMANOVA
Use PERMANOVA to calculate significance

In [None]:
# Re-read the Bray-Curtis matrix as a DataFrame, then wrap it for scikit-bio.
# After this cell `dm` is a scikit-bio DistanceMatrix (the cells below rely on that).
dm_df = pd.read_csv('../results/braycurtis/distance-matrix.tsv', sep = '\t',index_col = 0)
dm = skbio.stats.distance.DistanceMatrix(dm_df)

# permanova() pairs an array grouping with the matrix rows by position, so pull the
# labels in the matrix's own row order. .loc also raises a KeyError if any sample in
# the matrix has no metadata, instead of silently testing misaligned labels.
condition_labels = pcoa_results.loc[dm_df.index, 'condition']
metadata_array= np.ascontiguousarray(condition_labels.to_list())

# PERMANOVA of the distances against case/control condition (999 permutations).
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA of the distance matrix against the 'URT' column (999 permutations).
urt_labels = pcoa_results['URT'].to_list()
metadata_array = np.ascontiguousarray(urt_labels)
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA of the distance matrix against the 'region' column (999 permutations).
region_labels = pcoa_results['region'].to_list()
metadata_array = np.ascontiguousarray(region_labels)
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA of the distance matrix against the 'disease' column (999 permutations).
disease_labels = pcoa_results['disease'].to_list()
metadata_array = np.ascontiguousarray(disease_labels)
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA of the distance matrix against the 'sequencing' column (999 permutations).
sequencing_labels = pcoa_results['sequencing'].to_list()
metadata_array = np.ascontiguousarray(sequencing_labels)
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA of the distance matrix against the 16S hypervariable region (999 permutations).
# The metadata column is named 'hypervariable_region'; pcoa_results has no '16S'
# column, so indexing with '16S' raises a KeyError.
metadata_array= np.ascontiguousarray(pcoa_results['hypervariable_region'].to_list())
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)