This notebook will let us examine alpha diversity for a category in depth. We've already summarized the alpha diversity p-values for all categories in a [previous notebook](). Here, we can do post-hoc testing and look at the alpha diversity values associated with a single category.

Let's start by importing the modules and functions we'll need for the analysis.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import scipy.stats
import skbio

import americangut.ag_dictionary as agdic
import americangut.diversity_analysis as agdiv
import americangut.notebook_environment as agenv
from americangut.ag_data import AgData

In [None]:
from matplotlib import rcParams

% matplotlib inline

# Use seaborn's 'ticks' style and override the axes face color with 'none'
# (i.e. no fill behind the plotted data).
sn.set_style('ticks', {'axes.facecolor': 'none'})

# Set the default plot font to a sans-serif face, preferring Helvetica and
# falling back to Arial, and render all text through LaTeX.
# NOTE(review): 'text.usetex' needs a working LaTeX installation on the
# machine running the notebook.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Helvetica', 'Arial']
rcParams['text.usetex'] = True

Next, let's select the data set and rarefaction depth we wish to use.

In [None]:
# Data set selection. These three values are handed to AgData below as its
# `bodysite`, `trim` and `depth` arguments; `rarefaction_depth` is also used
# later to build the alpha diversity column name ('<metric>_<depth>').
bodysite = 'fecal'
sequence_trim = '100nt'
rarefaction_depth = '10k'

# Passed to AgData as `sub_participants` and `one_sample` respectively.
# NOTE(review): presumably these restrict the data to a participant subset
# and a single sample per participant -- confirm against AgData.
use_subset = True
use_one_sample = True

Next, we'll load the data, and remove outliers.

In [None]:
# Build the data set object from the selections made in the previous cell.
data = AgData(
    bodysite=bodysite,
    trim=sequence_trim,
    depth=rarefaction_depth,
    sub_participants=use_subset,
    one_sample=use_one_sample,
)

# Run the data set's own cleaning steps. Their return values are discarded,
# so they presumably modify `data` in place -- confirm against AgData.
data.drop_alpha_outliers()
data.drop_bmi_outliers()
data.clean_age()

In [None]:
# Look up the data dictionary entries for bowel movement quality and IBS,
# then clean the corresponding groups in the loaded data set.
quality = agdic.ag_dictionary('BOWEL_MOVEMENT_QUALITY')
ibs = agdic.ag_dictionary('IBS')
# `strict` is switched off on the IBS entry before it is used for cleaning.
# NOTE(review): its exact effect is defined in americangut.ag_dictionary and
# is not visible from this notebook.
ibs.strict = False
data.clean_group(quality)
data.clean_group(ibs)

In [None]:
# Per IBS group: count the non-null entries in every mapping column, then
# keep the largest count across columns (axis 1) as the group's size.
data.map_.groupby(ibs.name).count().max(1)

In [None]:
# Per bowel movement quality group: count the non-null entries in every
# mapping column, then keep the largest count across columns (axis 1).
data.map_.groupby(quality.name).count().max(1)

In [None]:
# Cross-tabulate IBS status against bowel movement quality: for every
# (IBS, quality) pair, count the non-null entries per mapping column and keep
# the largest count across columns.
# The grouping keys are given as a list: pandas treats a tuple passed to
# `groupby` as ONE key (older releases only warned), which would look for a
# single column labelled ('IBS', 'BOWEL_MOVEMENT_QUALITY').
ibs_stack = data.map_.groupby(['IBS', 'BOWEL_MOVEMENT_QUALITY']).count().max(1)

In [None]:
# Chi-square test of independence on the IBS x quality contingency table.
# scipy returns, in order: the test statistic, the p-value, the degrees of
# freedom and the table of expected frequencies.
# NOTE(review): `unstack()` leaves NaN for any combination that has no
# samples; confirm the table is complete before trusting the result.
chi, p, df, pred = scipy.stats.chi2_contingency(ibs_stack.unstack().values)

In [None]:
# Show the observed counts as a table: one row per IBS level, one column per
# bowel movement quality level.
ibs_stack.unstack()

In [None]:
# Show the expected frequencies returned by the chi-square test above.
pred

In [None]:
# Print the scipy documentation for the contingency test (return values and
# the `correction` / `lambda_` options).
help(scipy.stats.chi2_contingency)

In [None]:
# Hard-coded set of sample identifiers (unicode strings). The statements that
# follow intersect it with `data.map_.index` and set the 'IBD_DIAGONOSIS'
# column to 'Yes' for the samples that are present.
# NOTE(review): where this list comes from is not recorded in the notebook --
# TODO document its source.
ibd = {u'10317.000012951', u'10317.000017746', u'10317.000001620',
       u'10317.000001748', u'10317.000009382', u'10317.000001544',
       u'10317.000005953', u'10317.000009004', u'10317.000013010',
       u'10317.000014579', u'10317.000001827', u'10317.000013023',
       u'10317.000009157', u'10317.000001362', u'10317.000001135',
       u'10317.000004855', u'10317.000005878', u'10317.000007719',
       u'10317.000009126', u'10317.000007068', u'10317.000010134',
       u'10317.000010112', u'10317.000003300', u'10317.000002830',
       u'10317.000014236', u'10317.000012936', u'10317.000013575',
       u'10317.000001047', u'10317.000005360', u'10317.000001333',
       u'10317.000015581', u'10317.000003189', u'10317.000001128',
       u'10317.000015907', u'10317.000017338', u'10317.000013062',
       u'10317.000013595', u'10317.000009533', u'10317.000010546',
       u'10317.000001685', u'10317.000004163', u'10317.000001751',
       u'10317.000013551', u'10317.000005889', u'10317.000002036',
       u'10317.000002783', u'10317.000004157', u'10317.000001364',
       u'10317.000001622', u'10317.000004783', u'10317.000008999',
       u'10317.000009149', u'10317.000001895', u'10317.000011375',
       u'10317.000014880', u'10317.000001291', u'10317.000010876',
       u'10317.000014607', u'10317.000014987', u'10317.000002271',
       u'10317.000003898', u'10317.000009144', u'10317.000002482',
       u'10317.000014608', u'10317.000013134', u'10317.000001647',
       u'10317.000011959', u'10317.000009626', u'10317.000004025',
       u'10317.000014291', u'10317.000001351', u'10317.000002859',
       u'10317.000014118', u'10317.000004612', u'10317.000018383',
       u'10317.000001575', u'10317.000004161', u'10317.000015873',
       u'10317.000005971', u'10317.000004192', u'10317.000001322',
       u'10317.000004162', u'10317.000002336', u'10317.000010147',
       u'10317.000006673', u'10317.000004752', u'10317.000005851',
       u'10317.000009236', u'10317.000001363', u'10317.000014458',
       u'10317.000011093', u'10317.000009164', u'10317.000005810',
       u'10317.000003047', u'10317.000015849', u'10317.000004790',
       }
# Keep only the listed sample ids that are actually present in the mapping
# file. The result is materialised as a sorted list because `.loc` does not
# accept a set as an indexer (deprecated in pandas 1.5, TypeError from 2.0);
# sorting also makes the selection order deterministic.
actual = sorted(set(ibd).intersection(data.map_.index))
# len(actual)
# Flag those samples. The column name's spelling ('IBD_DIAGONOSIS') is kept
# as-is because a later cell reads the column under the same name.
data.map_.loc[actual, 'IBD_DIAGONOSIS'] = 'Yes'

Next, let's set up a directory where we will save our results.

In [None]:
# Root folder for everything this notebook writes. The project helper below
# is the disabled alternative to the hard-coded path.
# save_dir = agenv.check_save_dir(fecal_data.data_set)
save_dir =  '/Users/jdebelius/Desktop/'

# Figures and summary tables go into separate sub-folders.
fig_dir = os.path.join(save_dir, 'alpha/images/')
tab_dir = os.path.join(save_dir, 'alpha/summary/')

# Create whichever output folder is still missing.
for out_dir in (fig_dir, tab_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

Finally, we can pick the alpha diversity metric, the category to interrogate, and set an order, if one is relevant.

In [None]:
# Alpha diversity metric to analyse. It is combined with `rarefaction_depth`
# as '<metric>_<depth>' to name the mapping column used by the plotting and
# post-hoc cells.
metric = 'PD_whole_tree'

# Category to interrogate (currently disabled).
# group_name = 'COUNTRY'

Now, let's read the files associated with the data and load the data dictionary entry for the group.

In [None]:
# Clean every category known to the data dictionary.
# The mapping is iterated directly instead of through `.iterkeys()`, which
# exists only on Python 2 dictionaries; direct iteration yields the same keys
# on both Python 2 and Python 3.
# NOTE(review): after the loop, `group_name` and `group` stay bound to
# whichever entry was visited last, and later cells read `group` -- set the
# intended category explicitly before relying on it.
for group_name in agdic.dictionary:
    group = agdic.ag_dictionary(group_name)
    data.clean_group(group)

In [None]:
# Copy the manual flag into the 'IBD' column: every sample whose
# 'IBD_DIAGONOSIS' value is 'Yes' gets 'IBD' set to 'Yes' as well.
data.map_.loc[data.map_.IBD_DIAGONOSIS == 'Yes', 'IBD'] = 'Yes'

In [None]:
# Per 'IBD' level: count the non-null entries in every mapping column, then
# keep the largest count across columns (axis 1) as the group's size.
data.map_.groupby('IBD').count().max(1)

In [None]:
# Write the cleaned mapping table as a tab-separated file, labelling the
# index column '#SampleID'.
# NOTE(review): the destination is a hard-coded, user-specific path.
data.map_.to_csv('/Users/jdebelius/Desktop/clean_subset.txt',
                       sep='\t', index_label='#SampleID')

In [None]:
# Clean the selected category and remap its groups on the mapping table.
# The loaded data set is bound to `data`; the `fecal_data` name this cell
# used before is never defined in the notebook and raised a NameError.
# NOTE(review): `group` is whatever an earlier cell left bound -- make sure it
# is the category you intend to analyse.
data.clean_group(group)
group.remap_groups(data.map_)

We're going to start by cleaning up the data. So, let's remove any samples that might be outliers (in rounds 1-21, there is a sample with alpha diversity seven standard deviations above the mean and four standard deviations above the next highest sample).

We'll also clean up the mapping column as needed, to make analysis easier.

Now that we have the data loaded, let's plot it.

In [None]:
# Box plot of the selected alpha diversity metric, split by the levels of
# `group`, saved as '<group name>.pdf' inside `fig_dir`.
# The loaded data set is bound to `data`; the `fecal_data` name this cell
# used before is never defined in the notebook and raised a NameError.
with sn.color_palette(sn.xkcd_palette(['coral', 'ocean blue'])):
    ax = agdiv.pretty_pandas_boxplot(meta=data.map_,
                                     group=group.name,
                                     # Column name is '<metric>_<depth>'.
                                     metric='%s_%s' % (metric, rarefaction_depth),
                                     order=group.order,
                                     # bw=True,
                                     colors=['#2b8cbe', '#bd0026'],
                                     ylim=[5, 55],
                                     xlabel=group.clean_name,
                                     xticklabels=group.order,
                                     ylabel='PD Whole Tree Diversity',
                                     show_p=True,
                                     )
    # Scale the axes width with the number of groups being plotted.
    ax.set_position((0.2, 0.2, 0.15*len(group.order), 0.75))

    ax.figure.savefig(os.path.join(fig_dir, '%s.pdf' % group.name))

We can also examine significant differences using a post-hoc test to see what drives these differences.

In [None]:
# Post-hoc comparison of the alpha diversity column ('<metric>_<depth>')
# between the levels of `group`, requesting Bonferroni correction.
# The loaded data set is bound to `data`; the `fecal_data` name this cell
# used before is never defined in the notebook and raised a NameError.
post_hoc = agdiv.post_hoc_pandas(meta=data.map_,
                                 group=group.name,
                                 cat='%s_%s' % (metric, rarefaction_depth),
                                 order=group.order,
                                 correct='bonferroni',
                                 )
# Display the resulting table.
post_hoc

In [None]:
# Look up the data dictionary entries for bowel movement quality and
# frequency and clean both groups in the loaded data set.
# The loaded data set is bound to `data`; the `fecal_data` name this cell
# used before is never defined in the notebook and raised a NameError.
quality = agdic.ag_dictionary('BOWEL_MOVEMENT_QUALITY')
frequency = agdic.ag_dictionary('BOWEL_MOVEMENT_FREQUENCY')

data.clean_group(quality)
data.clean_group(frequency)