# Covariate Analysis 

In this notebook we'll use the output of our QIIME2 workflow to investigate how upper respiratory tract (URT) microbiome composition differs across three covariates: geographic region, age, and sex.

In [None]:
import os

# Star import kept from the original; it is placed first so the explicit
# imports below always win if a name collides.
from plotnine import *
import biom
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import qiime2 as q2
import scipy.stats
import seaborn as sns
import skbio
import statsmodels.api as sm
import statsmodels.formula.api as smf
from matplotlib.font_manager import FontProperties
from statsmodels.stats.multitest import fdrcorrection

# Project helpers used by the cells below (the original line was the
# syntactically invalid `from utils import #`).
from utils import clr, filter_taxa, kwtests, qiime_to_dataframe


%matplotlib inline

## Build Dataframe
First we'll build a dataframe of all reads in the meta-analysis that come from healthy (control) samples.

In [None]:
collapse_on = ["kingdom", "phylum", "class", "order", "family", "genus"]


def _load_studies(study_names, urt_label, collapse_on):
    """Load the per-study abundance tables for one sampling site.

    For every study name this reads the QIIME2 feature table and taxonomy
    (via ``qiime_to_dataframe``), merges the study's metadata on
    ``sample_id`` and tags the rows with the sampling site and study name.
    All paths are relative, so the caller must already have changed into the
    site's manifest directory.

    Parameters
    ----------
    study_names : iterable of str
        File-name stems, one per study (the manifest index).
    urt_label : str
        Value written to the ``URT`` column (``'NP'`` or ``'OP'``).
    collapse_on : list of str
        Taxonomic ranks passed through to ``qiime_to_dataframe``.

    Returns
    -------
    list of pandas.DataFrame
        One merged table per study, in manifest order.
    """
    tables = []
    for file_name in study_names:
        # pull out feature tables with total reads
        ab = qiime_to_dataframe(feature_table="qiime/" + file_name + "_table.qza",
                                taxonomy="qiime/" + file_name + "_taxonomy.qza",
                                collapse_on=collapse_on)
        # merge with metadata; the first metadata column holds the sample ids
        meta = pd.read_csv('metadata/' + file_name + '_metadata.tsv', sep="\t")
        meta = meta.rename(columns={meta.columns[0]: "sample_id"})
        ab = pd.merge(ab, meta, on="sample_id")
        ab['URT'] = urt_label
        ab['Study'] = file_name
        tables.append(ab)
    return tables


# The relative "qiime/..." and "metadata/..." paths are resolved against the
# manifest directory of each site, hence the chdir before each load.
os.chdir('/proj/gibbons/nbohmann/metabug/manifest/NP/')
manifest_NP = pd.read_csv('NP_manifest.csv', index_col=0, header=None)
study_tables = _load_studies(manifest_NP.index, 'NP', collapse_on)

os.chdir('/proj/gibbons/nbohmann/metabug/manifest/OP/')
manifest_OP = pd.read_csv('OP_manifest.csv', index_col=0, header=None)
study_tables += _load_studies(manifest_OP.index, 'OP', collapse_on)

if not study_tables:
    raise ValueError("No studies were listed in the NP or OP manifests")
# Concatenate once instead of growing the frame inside the loops.
res = pd.concat(study_tables)

res = res.dropna(subset=['genus'])  # drop rows without a genus assignment
res = res[res.condition == 'control']  # filter to just controls
# Drop genera whose name contains 'None', 'uncultured' or 'Chloroplast'.
unwanted_genus = (res.genus.str.contains('None')
                  | res.genus.str.contains('uncultured')
                  | res.genus.str.contains('Chloroplast'))
res = res[~unwanted_genus]

# NOTE(review): res_sex / res_age are filtered with min_prevalence=0.05 while
# res_metadata / res use 0.5 -- confirm the difference is intentional.
res_sex = res[~pd.isnull(res.sex)]  # all samples with sex metadata
res_sex = clr(filter_taxa(res_sex, min_reads=2, min_prevalence=0.05))  # filter taxa, then center-log-ratio transform
res_age = res[~pd.isnull(res.age)]  # all samples with age metadata
res_age = clr(filter_taxa(res_age, min_reads=2, min_prevalence=0.05))  # filter taxa, then center-log-ratio transform
res_metadata = res[~(pd.isnull(res.age)) & ~pd.isnull(res.sex)]  # samples with both age and sex
res_metadata = clr(filter_taxa(res_metadata, min_reads=2, min_prevalence=0.5))
res = clr(filter_taxa(res, min_reads=2, min_prevalence=0.5))  # filter taxa, then center-log-ratio transform

## Geographic location
For location, we'll run a Kruskal-Wallis test on each taxon to look for taxa whose abundance differs significantly between geographic regions.

In [None]:
collapse_on = ["kingdom", "phylum", "class", "order", "family", "genus"]
collapsed = res.groupby(collapse_on)
# kwtests comes from utils; per the section text it runs a Kruskal-Wallis test
# per taxon. The code below relies on its 'taxon' and 'q' columns.
results = kwtests(collapsed, 'region')

# Mean of the numeric columns per taxon and region, restricted to taxa with
# q < 0.05. numeric_only=True keeps the old "drop non-numeric columns"
# behaviour; without it pandas >= 2.0 raises on the string metadata columns.
significant_taxa = results.loc[results['q'] < 0.05, 'taxon']
for_plots = (
    res[res['genus'].isin(significant_taxa)]
    .groupby(collapse_on + ["region"])
    .mean(numeric_only=True)
    .reset_index()
)

# Drop genera whose 'relative' value, averaged over regions, is below 0.01.
mean_relative = for_plots.groupby(collapse_on)['relative'].mean()
to_drop = mean_relative[mean_relative < 0.01].reset_index()['genus']
for_plots = for_plots[~for_plots['genus'].isin(to_drop)].copy()

# Shorten genus labels: keep only the part before the first '_' and '-'.
for_plots['genus'] = for_plots['genus'].str.split('_').str[0]
for_plots['genus'] = for_plots['genus'].str.split('-').str[0]

# Genus x region table of mean CLR; missing combinations are filled with 0.
for_plots = pd.pivot_table(for_plots, index='genus', columns='region', values='clr').fillna(0.0)
for_plots

## Run Post-Hocs

In [None]:
import scikit_posthocs

# For every genus, run Dunn's post-hoc test on CLR values between regions and
# record each (row region, column region) pair whose p-value is below 0.05.
# NOTE(review): posthoc_dunn is called without p_adjust, so p-values are not
# corrected for multiple comparisons, and because the p-value matrix is
# scanned in full each pair appears in both orders -- confirm this is intended.
pairwise_hits = []
for genus in res['genus'].unique():
    dunn = scikit_posthocs.posthoc_dunn(res[res['genus'] == genus], val_col='clr', group_col='region')
    row_pos, col_pos = np.where(dunn.lt(0.05))
    pairs_labels = pd.DataFrame([(dunn.index[i], dunn.columns[j]) for i, j in zip(row_pos, col_pos)])
    if pairs_labels.empty:
        # Nothing significant for this genus; skipping keeps the column order
        # of the combined table independent of which genus comes first.
        continue
    pairs_labels['taxon'] = genus
    pairwise_hits.append(pairs_labels)

# Concatenate once instead of growing the frame inside the loop.
if pairwise_hits:
    result = pd.concat(pairwise_hits)
else:
    result = pd.DataFrame(columns=[0, 1, 'taxon'])

## Visualize Results

In [None]:
# Heatmap of the transposed for_plots table (for_plots columns become rows,
# the for_plots index runs along the x axis).
fig, ax = plt.subplots(nrows=1, figsize=(14, 7))

sns.set(font_scale=1)
diverging_cmap = sns.diverging_palette(240, 20, center='light', as_cmap=True)
sns.heatmap(for_plots.T, cmap=diverging_cmap, ax=ax)

# 15 pt tick labels on both axes; the style is switched to italic before the
# x-axis labels are set.
font_props = FontProperties(size=15)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontproperties=font_props)
font_props.set_style("italic")
ax.set_xticklabels(ax.get_xmajorticklabels(), fontproperties=font_props)

plt.yticks(rotation=0)
plt.tight_layout()


## Age 

For age, we will fit a linear regression for each genus with linear and quadratic age terms to find associations, correcting for URT sampling site and geographic region.

In [None]:
# Per-genus OLS of CLR abundance on age and age^2, adjusted for URT sampling
# site and region; p-values for both age terms are FDR-corrected afterwards.
res_age['age2'] = np.square(res_age['age'])

age_records = []
for taxon in res_age['genus'].unique():
    # Exact match: a prefix match would pool other genera whose names merely
    # start with this one into the same model.
    temp = res_age[res_age['genus'] == taxon]
    # Drop every region with fewer than 5 rows for this genus, by exact name.
    region_counts = temp.region.value_counts()
    rare_regions = region_counts[region_counts < 5].index
    if len(rare_regions) > 0:
        temp = temp[~temp.region.isin(rare_regions)]
    if temp.empty:
        # No region has enough rows to fit a model for this genus.
        continue
    mdf = smf.ols("clr ~ age + age2 + URT + region", temp).fit()
    age_records.append({'genus': taxon,
                        'p': mdf.pvalues['age'],
                        'p2': mdf.pvalues['age2']})

# Build the results table once instead of concatenating inside the loop.
results_age = pd.DataFrame(age_records, columns=['genus', 'p', 'p2'])
results_age = results_age.dropna(subset=['p', 'p2'])
results_age['q'] = fdrcorrection(results_age['p'])[1]
results_age['q2'] = fdrcorrection(results_age['p2'])[1]
# Genera significant for either age term. The original evaluated this
# expression mid-cell and discarded it; it is kept under a name instead.
significant_age = results_age[(results_age['q'] < 0.05) | (results_age['q2'] < 0.05)].sort_values(by='q2')
# Shorten genus labels: keep only the part before the first '_'.
results_age['genus'] = results_age['genus'].str.split('_').str[0]
results_age

## Break into quantiles
We will break the data into five equal-sized age quantiles (quintiles) for visualization.

In [None]:
# Assign every row to one of five age quintiles.
# NOTE(review): the labels are hard-coded age ranges; they are only accurate
# if the quintile edges of the current data are 0, 1, 4, 7, 26 and 86.
res_age['quantile'] = pd.qcut(res_age['age'], [0, .2, .4, .6, .8, 1.0],
                              labels=['0-1', '1-4', '4-7', '7-26', '26-86'])

# NOTE(review): if results_age['genus'] has already been truncated at '_' by
# the regression cell, genera with a '_' suffix in res_age will not match here.
significant_genera = results_age.loc[results_age['q'] < 0.05, 'genus']

# Mean of the numeric columns per taxon and quintile. numeric_only=True keeps
# the old "drop non-numeric columns" behaviour (pandas >= 2.0 raises without
# it); observed=False pins the current handling of the categorical key.
for_plots = (
    res_age[res_age['genus'].isin(significant_genera)]
    .groupby(["kingdom", "phylum", "class", "order", "family", "genus", "quantile"],
             observed=False)
    .mean(numeric_only=True)
    .reset_index()
)
for_plots = for_plots.groupby(['genus', 'quantile'], observed=False)['clr'].mean().reset_index()
# Genus x quintile table of mean CLR.
for_plots = pd.pivot(for_plots, index='genus', columns='quantile', values='clr')

## Visualize Results

In [None]:
# Heatmap of the transposed for_plots table (for_plots columns become rows,
# the for_plots index runs along the x axis).
fig, ax = plt.subplots(nrows=1, figsize=(14, 7))

sns.set(font_scale=1)
diverging_cmap = sns.diverging_palette(240, 20, center='light', as_cmap=True)
sns.heatmap(for_plots.T, cmap=diverging_cmap, ax=ax)

# 15 pt tick labels on both axes; the style is switched to italic before the
# x-axis labels are set.
font_props = FontProperties(size=15)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontproperties=font_props)
font_props.set_style("italic")
ax.set_xticklabels(ax.get_xmajorticklabels(), fontproperties=font_props)

plt.yticks(rotation=0)
plt.tight_layout()

## Sex
We will do the same regression analysis with sex as a covariate

In [None]:
# Per-genus OLS of CLR abundance on sex, adjusted for URT sampling site and
# region; the p-value of the 'sex[T.M]' term is FDR-corrected afterwards.
sex_records = []
for taxon in res_sex['genus'].unique():
    # Exact match: a prefix match would pool other genera whose names merely
    # start with this one into the same model.
    temp = res_sex[res_sex['genus'] == taxon]
    # Drop every region with fewer than 5 rows for this genus, by exact name.
    region_counts = temp.region.value_counts()
    rare_regions = region_counts[region_counts < 5].index
    if len(rare_regions) > 0:
        temp = temp[~temp.region.isin(rare_regions)]
    if temp.empty:
        # No region has enough rows to fit a model for this genus.
        continue
    mdf = smf.ols("clr ~ sex + URT + region", temp).fit()
    sex_records.append({'genus': taxon, 'p': mdf.pvalues['sex[T.M]']})

# Build the results table once instead of concatenating inside the loop.
results_sex = pd.DataFrame(sex_records, columns=['genus', 'p'])
results_sex = results_sex.dropna(subset=['p'])
results_sex['q'] = fdrcorrection(results_sex['p'])[1]
# Genera significant for sex. The original evaluated this expression mid-cell
# and discarded it; it is kept under a name instead.
significant_sex = results_sex[results_sex['q'] < 0.05]
# Shorten genus labels: keep only the part before the first '_'.
results_sex['genus'] = results_sex['genus'].str.split('_').str[0]
results_sex.sort_values(by='q')

No significant associations with sex were seen after FDR correction.