# Diversity Analysis

In this notebook we'll use the output of our QIIME2 workflow to investigate alpha and beta diversity of samples in the meta-analysis. 

In [None]:
from utils import *
import pandas as pd
import qiime2 as q2
from plotnine import *
import skbio
import scipy.stats
import numpy as np
import os

%matplotlib inline

## Merged Table
First, we'll need to build a QIIME2 table that combines all our samples. We could use the QIIME2 merge tool, but as we have duplicated study IDs across URT sampling sites, we'll instead build a table ourselves. 

In [None]:
# Taxonomic ranks that qiime_to_dataframe collapses features on.
collapse_on = ["kingdom", "phylum", "class", "order", "family", "genus"]


def load_study_abundances(file_name, urt_site):
    """Load one study's feature table joined to its metadata.

    Parameters
    ----------
    file_name : str
        Study prefix taken from the manifest index; used to locate
        ``qiime/<file_name>_table.qza``, ``qiime/<file_name>_taxonomy.qza``
        and ``metadata/<file_name>_metadata.tsv``.
    urt_site : str
        Sampling-site label written to the ``URT`` column ('NP' or 'OP').

    Returns
    -------
    pandas.DataFrame
        Abundance rows merged with metadata on ``sample_id``, tagged with
        ``URT`` and ``study`` columns.
    """
    # pull out feature tables with total reads
    ab = qiime_to_dataframe(feature_table="qiime/" + file_name + "_table.qza",
                            taxonomy="qiime/" + file_name + "_taxonomy.qza",
                            collapse_on=collapse_on)
    # merge with metadata; the first metadata column holds the sample identifier
    meta = pd.read_csv('metadata/' + file_name + '_metadata.tsv', sep="\t")
    meta = meta.rename(columns={meta.columns[0]: "sample_id"})
    ab = pd.merge(ab, meta, on="sample_id")
    ab['URT'] = urt_site
    ab['study'] = file_name
    return ab


manifest_NP = pd.read_csv('/proj/gibbons/nbohmann/metabug/manifest/NP/NP_manifest.csv',
                          index_col=0, header=None)
manifest_OP = pd.read_csv('/proj/gibbons/nbohmann/metabug/manifest/OP/OP_manifest.csv',
                          index_col=0, header=None)

# Collect every per-study frame first and concatenate once, instead of
# re-copying the growing table on each loop iteration.
study_frames = [
    load_study_abundances(file_name, urt_site)
    for urt_site, manifest in (('NP', manifest_NP), ('OP', manifest_OP))
    for file_name in manifest.index
]
res = pd.concat(study_frames)

# filter data: drop rows without a genus, then unassigned / uncultured /
# chloroplast genera and mitochondrial families
res = res.dropna(subset=['genus'])
unwanted = (
    res.genus.str.contains('None')
    | res.genus.str.contains('uncultured')
    | res.genus.str.contains('Chloroplast')
    # na=False: a missing family is not mitochondrial; without it the NaN in the
    # mask makes the boolean inversion fail.
    | res.family.str.contains('Mitochondria', na=False)
)
res = res[~unwanted]

# center-log-ratio transformation and filtering
res = clr(filter_taxa(res, min_reads=2, min_prevalence=0.05))

# Map each study to its disease using the two-column conditions file.
conditions = pd.read_csv('/proj/gibbons/nbohmann/metabug/conditions.csv',
                         index_col=0, header=None)[1].to_dict()
res['disease'] = res['study'].map(conditions)
res.head()

We'll pivot this table, and save it as a QIIME2 artifact. 

In [None]:
# Reshape the long table into a samples x taxa matrix and store it as a
# QIIME 2 FeatureTable[Frequency] artifact.
# NOTE(review): pivot_table averages duplicate (taxon, sample) rows by default -
# confirm sample IDs are unique across studies.
# NOTE(review): presumably clr() leaves the raw 'reads' column intact - verify in utils.
taxa_by_sample = res.pivot_table(index='full_taxonomy',
                                 columns='sample_id',
                                 values='reads')
# Taxa absent from a sample count as zero; transpose so samples are the rows.
res_pvt = taxa_by_sample.fillna(0).T
abundance_artifact = q2.Artifact.import_data("FeatureTable[Frequency]", res_pvt)
abundance_artifact.save('/proj/gibbons/nbohmann/metabug/manifest/merged_table.qza')

## Metadata
We'll also need a merged metadata table, constructed from the study-specific metadata files.

In [None]:
# Directories holding the per-study metadata files for each URT site.
meta_dir_NP = '/proj/gibbons/nbohmann/metabug/manifest/NP/metadata'
meta_dir_OP = '/proj/gibbons/nbohmann/metabug/manifest/OP/metadata'
meta_list_NP = os.listdir(meta_dir_NP)
meta_list_OP = os.listdir(meta_dir_OP)

# Read every metadata file and tag it with its study; concatenate once at the end.
metadata_frames = []
for meta_dir, meta_files in ((meta_dir_NP, meta_list_NP), (meta_dir_OP, meta_list_OP)):
    for file in meta_files:
        # os.path.join supplies the path separator (the OP path previously
        # glued the file name directly onto the directory name).
        df = pd.read_csv(os.path.join(meta_dir, file), sep='\t')
        # The study name is the part of the file name before the first underscore.
        df['study'] = file.split('_')[0]
        metadata_frames.append(df)
metadata = pd.concat(metadata_frames)

# Keep one row per sample, restricted to samples present in the abundance table.
metadata = metadata.drop_duplicates(subset='id')
metadata = metadata[metadata.id.isin(res.sample_id.unique())]
metadata = metadata.set_index('id')
# NOTE(review): this relabels 'non-control' as 'control' - confirm that is the
# intended harmonisation rather than a case label.
metadata['condition'] = metadata['condition'].str.replace('non-control', 'control')

# Map each study to its disease using the two-column conditions file.
conditions = pd.read_csv('/proj/gibbons/nbohmann/metabug/conditions.csv',
                         index_col=0, header=None)[1].to_dict()
metadata['disease'] = metadata['study'].map(conditions)
# Carry over the URT site recorded for each sample in the abundance table.
metadata['URT'] = metadata.index.map(res.set_index('sample_id')['URT'].to_dict())
metadata.to_csv('/proj/gibbons/nbohmann/metabug/manifest/merged_metadata.tsv', sep='\t')

## Rarefy Data
First, we need to rarefy our data. We can use the QIIME2 alpha-rarefaction function to visualize rarefaction, then rarefy to an appropriate depth.

In [None]:
# Build an alpha-rarefaction visualization of the merged table up to a depth of
# 10000, written to alpha-rarefaction.qzv.
# NOTE(review): these paths are relative to the kernel's working directory, while
# the merged table and metadata were saved under
# /proj/gibbons/nbohmann/metabug/manifest/ - confirm this cell runs from there.
!qiime diversity alpha-rarefaction \
  --i-table merged_table.qza \
  --p-max-depth 10000 \
  --m-metadata-file merged_metadata.tsv \
  --o-visualization alpha-rarefaction.qzv

In [None]:
# Rarefy the merged table to a sampling depth of 2000 and write the result to
# rarefied_table.qza (paths are relative to the kernel's working directory).
!qiime feature-table rarefy \
  --i-table merged_table.qza \
  --p-sampling-depth 2000 \
  --o-rarefied-table rarefied_table.qza

## Alpha Diversity

Now we'll examine alpha diversity using Shannon entropy, then export the result

In [None]:
# Compute the Shannon alpha-diversity metric on the rarefied table and store it
# in shannon.qza.
!qiime diversity alpha \
  --i-table rarefied_table.qza \
  --p-metric shannon \
  --o-alpha-diversity shannon.qza

In [None]:
# Export shannon.qza into the shannon/ directory; the next code cell reads
# shannon/alpha-diversity.tsv from it.
!qiime tools export \
  --input-path shannon.qza \
  --output-path shannon

## Add Metadata 
Add the metadata to the alpha diversity results

In [None]:
# Work from the manifest directory; the relative paths here and in the later
# QIIME cells resolve against it.
os.chdir('/proj/gibbons/nbohmann/metabug/manifest/')

# Load the exported Shannon values, indexed by sample id.
shannon = pd.read_csv('shannon/alpha-diversity.tsv', sep = '\t', index_col = 0)
shannon = shannon.rename_axis('id')

# Attach per-sample metadata by looking each sample id up in the merged metadata.
for field in ('region', 'disease', 'condition'):
    shannon[field] = shannon.index.map(metadata[field].to_dict())

# Combined label used as the x-axis grouping of the boxplot.
shannon['disease-condition'] = shannon['disease'] + '-' + shannon['condition']
shannon['shannon_entropy'] = shannon['shannon_entropy'].astype('float')

# Spell out the URT site codes for display.
urt_codes = shannon.index.map(metadata['URT'].to_dict())
shannon['URT'] = urt_codes.str.replace('NP','Nasopharynx').str.replace('OP','Oropharynx')
shannon

## Visualize Alpha Diversity

In [None]:
# Figure-wide styling: white panel, light gray grid, heavy black axis lines.
shannon_theme = theme(
    text = element_text(size=20),
    panel_background = element_rect(fill = "white", colour = "white",
                                    size = 0.5, linetype = "solid"),
    panel_grid = element_line(size = .2, linetype = "solid", colour = "gray"),
    axis_line = element_line(size = 2, linetype = "solid", colour = "black"),
    legend_title = element_blank(),
    legend_position = 'right',
    figure_size = (16, 12),
)

# Boxplot of Shannon entropy for each disease/condition group, filled by condition.
# NOTE(review): the manual colours and the 'Case'/'Control' labels are matched to
# the condition levels by position - confirm the level order.
shannon_plt = (
    ggplot(shannon, aes(x = 'disease-condition', y = 'shannon_entropy'))
    + geom_boxplot(aes(fill = 'condition'))
    + theme(axis_text_x = element_text(rotation=60, hjust=1))
    + scale_fill_manual(values = ['#c3553b','#3e8093'], labels = ['Case','Control'])
    + labs(x = 'Disease + Condition', y = 'Shannon Entropy', fill = 'Status')
    + shannon_theme
)
shannon_plt

## Beta Diversity
Now we'll look at beta diversity among samples by calculating Bray-Curtis dissimilarity.

In [None]:
# Compute the Bray-Curtis distance matrix from the rarefied table and store it
# in braycurtis.qza (paths are relative to the kernel's working directory).
!qiime diversity beta \
    --i-table rarefied_table.qza \
    --p-metric braycurtis \
    --o-distance-matrix braycurtis.qza

In [None]:
# Export braycurtis.qza into the braycurtis/ directory; the PCoA and PERMANOVA
# cells read distance-matrix.tsv from it.
!qiime tools export \
  --input-path braycurtis.qza \
  --output-path braycurtis

## Calculate PCoA
We'll read in our dissimilarity matrix and run a PCoA with scikit-bio.

In [None]:
# Load the exported Bray-Curtis matrix (tab-separated, sample ids in the first
# column) and ordinate it with PCoA.
dm = pd.read_table(
    '/proj/gibbons/nbohmann/metabug/manifest/braycurtis/distance-matrix.tsv',
    index_col = 0,
)
od_res = skbio.stats.ordination.pcoa(dm)
# Show the share of variation captured by each axis.
od_res.proportion_explained

## Add Metadata
Now we can merge our PCoA results with corresponding metadata, and examine the proportion explained. 

In [None]:
# Label the ordination rows with the sample ids from the distance matrix.
od_res.samples.index = dm.index
# Keep only the metadata rows for samples that are in the ordination.
metadata = metadata[metadata.index.isin(od_res.samples.index)]

# Attach per-sample metadata by looking each sample id up in the merged metadata.
sample_ids = od_res.samples.index
for field in ('region', 'age', 'sex'):
    od_res.samples[field] = sample_ids.map(metadata[field].to_dict())

# Spell out the URT site codes for display.
urt_codes = sample_ids.map(metadata['URT'].to_dict())
od_res.samples['URT'] = urt_codes.str.replace('NP','Nasopharynx').str.replace('OP','Oropharynx')

for field in ('disease', 'country', 'condition'):
    od_res.samples[field] = sample_ids.map(metadata[field].to_dict())

od_res.proportion_explained

## Plot Beta Diversity

In [None]:
# Scatter of the first two PCoA axes, coloured by URT site.
# Axis labels are derived from the ordination itself so the percentages always
# match the data being plotted instead of relying on hand-typed values.
explained = od_res.proportion_explained
pc1_label = f"PC1 ({explained['PC1']:.2%})"
pc2_label = f"PC2 ({explained['PC2']:.2%})"

plt = (ggplot(
    od_res.samples, aes(x = 'PC1', y = 'PC2', color = 'URT'))
    +geom_point(size = 3)
    +labs(x = pc1_label, y = pc2_label, color = 'URT Area')
    # White panel, light gray grid, heavy black axis lines.
    +theme(text = element_text(size=20),panel_background=element_rect(fill = "white",
                                colour = "white",size = 0.5, linetype = "solid"),
                                panel_grid=element_line(size = .2, linetype = "solid",colour = "gray"),
                                axis_line = element_line(size = 2, linetype = "solid",colour = "black"),
                                legend_position='right',
                                figure_size=(8, 6)))
plt

## PERMANOVA
Use PERMANOVA to test whether samples group significantly by URT site, condition, region, and disease.

In [None]:
# Redundant with the import cell at the top of the notebook; kept so the
# `pandas` and `skbio` names stay bound exactly as before.
import pandas
import skbio

# Re-read the exported Bray-Curtis matrix; its row order defines the sample
# order of the DistanceMatrix built below.
dm_df = pd.read_csv('/proj/gibbons/nbohmann/metabug/manifest/braycurtis/distance-matrix.tsv', sep = '\t',index_col = 0)
dm = skbio.stats.distance.DistanceMatrix(dm_df)

# PERMANOVA treats an array grouping as being in the same order as the matrix
# rows, so select the labels in distance-matrix order rather than in the
# metadata table's own row order. A sample missing from metadata raises KeyError.
metadata_array = np.ascontiguousarray(metadata.loc[dm_df.index, 'URT'].to_list())
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA treats an array grouping as being in the same order as the matrix
# rows. od_res.samples.index carries the sample order of the distance-matrix
# file, so select the labels in that order instead of metadata's own row order.
metadata_array = np.ascontiguousarray(metadata.loc[od_res.samples.index, 'condition'].to_list())
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA treats an array grouping as being in the same order as the matrix
# rows. od_res.samples.index carries the sample order of the distance-matrix
# file, so select the labels in that order instead of metadata's own row order.
metadata_array = np.ascontiguousarray(metadata.loc[od_res.samples.index, 'region'].to_list())
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)

In [None]:
# PERMANOVA treats an array grouping as being in the same order as the matrix
# rows. od_res.samples.index carries the sample order of the distance-matrix
# file, so select the labels in that order instead of metadata's own row order.
metadata_array = np.ascontiguousarray(metadata.loc[od_res.samples.index, 'disease'].to_list())
skbio.stats.distance.permanova(dm, grouping = metadata_array, permutations = 999)