We'll start our microbiome analysis with a high-level overview of the categories of interest. This will let us determine which categories might be interesting to pursue.

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy
import skbio
import seaborn as sn
import matplotlib.pyplot as plt
from skbio.stats.power import subsample_power, confidence_bound

import americangut.ag_dictionary as agdic
from americangut.ag_data import AgData
import americangut.notebook_environment as agenv
import americangut.power_plots as agpp



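We'll start by selecting the dataset we plan to use. We need to pick a bodysite, or location on the body the samples come from, along with the sequence trim length and the rarefaction depth. We'll also choose whether to use a subset of the participants (`use_subset`) and whether to restrict the data to one sample (`use_one_sample`).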

In [2]:
# Data set selection: body site, sequence trim length and rarefaction depth.
bodysite, sequence_trim, rarefaction_depth = 'fecal', '100nt', '10k'

# Flags passed to AgData as `sub_participants` and `one_sample`.
use_subset = use_one_sample = True

Next, we'll load the data, and remove outliers.

In [3]:
# Build the AgData object for the body site, trim length, rarefaction depth
# and subsetting flags chosen above.
data = AgData(
    bodysite=bodysite,
    trim=sequence_trim,
    depth=rarefaction_depth,
    sub_participants=use_subset,
    one_sample=use_one_sample,
)

We'll also pick a metric to examine. This can be an alpha diversity metric or a beta diversity metric. We'll also pick a test function to go with it. If the metric selected is an alpha diversity metric (e.g. `PD_whole_tree_10k`, `shannon_10k`, `chao1_1k`), the test should be an alpha diversity test. A default alpha diversity test can be found in `americangut.power_plots.ag_alpha_test`. If the metric selected is a beta diversity metric (e.g. `unweighted_unifrac`), the test should handle distance matrices. A default test is provided in `americangut.power_plots.ag_beta_test`.

In [4]:
# `metric` names the diversity measure to examine and `test` is the function
# used to compare groups on it. Weighted UniFrac is a beta diversity metric,
# so the beta diversity test is paired with it.
metric, test = 'weighted_unifrac', agpp.ag_beta_test

Next, let's set up a directory where we will save our results.

In [5]:
# Base results directory for this data set, and the summary file path for
# the selected metric.
save_dir = agenv.check_save_dir(data.data_set)
summary_fp = os.path.join(save_dir, 'summary_%s.p' % metric)

# Power results for comparisons across all groups.
# NOTE(review): check_dir presumably creates the directory when it is
# missing - confirm against americangut.notebook_environment.
all_power_dir = os.path.join(save_dir, 'power/all/%s' % metric)
agenv.check_dir(all_power_dir)

# Power results for comparisons between the extreme groups only.
ext_power_dir = os.path.join(save_dir, 'power/extreme/%s' % metric)
agenv.check_dir(ext_power_dir)

Next, let's write a quick function to summarize the data for all groups in the data.

In [6]:
def _load_power(power_fp):
    """Loads the cached power estimates and sample counts from `power_fp`.

    The file must hold a pickled 5-tuple; the last two items are returned as
    ``(power, counts)`` and the first three are ignored.
    """
    # NOTE(review): these pickles are locally generated caches. Unpickling
    # can execute arbitrary code, so never point this at untrusted files.
    # NOTE(review): the file is read in text mode, as before. The (disabled)
    # code that generated these caches wrote them with mode 'w', so the modes
    # are consistent on Python 2; Python 3 would need 'rb' here.
    with open(power_fp, 'r') as f_:
        (_name, _order, _metric, power, counts) = pickle.load(f_)
    return power, counts


def _summarize_effect(counts, power):
    """Returns ``(mean, confidence bound)`` of the effect size, or NaNs.

    NaNs are returned when no power estimate exceeds 0.2.
    """
    effect = agpp.z_effect(counts, power)
    if (power > 0.2).any():
        return effect.mean(), confidence_bound(effect)
    return np.nan, np.nan


def generate_summary(question, data, test, metric):
    """Summarizes group sizes, p values and effect sizes for one question.

    The power results are not computed here: they are read from the files
    ``<question.name>.p`` in the module-level `all_power_dir` and
    `ext_power_dir` directories, which must already exist.

    Parameters
    ----------
    question : object
        The question to summarize. Its `name`, `type`, `order` and
        `extremes` attributes are read. For questions of type 'Clinical',
        `order` and `extremes` are overwritten in place with
        ``['Yes', 'No']``.
    data : object
        The data set (an `AgData` instance in this notebook). It is reloaded
        and then cleaned and filtered in place for `question`.
    test : callable
        Called as ``test(metric, question, data, ids, permutations=999)``;
        its return value is stored as the p value.
    metric : str
        Name of the diversity metric, passed through to `test`.

    Returns
    -------
    dict
        With the keys 'name', 'all_groups', 'all_size', 'all_p_value',
        'all_effect_mean', 'all_effect_ci' and the matching 'ext_*' keys
        for the extreme groups. The effect entries are NaN when no power
        estimate for that comparison exceeds 0.2.

    Raises
    ------
    IOError
        If a cached power file for the question cannot be opened.
    KeyError
        If a group named in `question.order` or `question.extremes` has no
        samples in the filtered map.
    """
    results = {'name': question.name}

    # Gets the map, otu table and distance matrix for the question
    data.reload_files()
    data.drop_alpha_outliers()
    data.drop_bmi_outliers()
    data.clean_age()
    data.clean_group(question)
    if question.type == 'Clinical':
        question.order = ['Yes', 'No']
        question.extremes = ['Yes', 'No']
    data.filter_by_question(question)

    # Identifies the samples within each group
    grouped = data.map_.groupby(question.name).groups
    group_ids = [grouped[o] for o in question.order]
    extreme_ids = [grouped[o] for o in question.extremes]

    # Provides summary information
    results['all_groups'] = question.order
    results['all_size'] = [len(id_) for id_ in group_ids]
    results['ext_groups'] = question.extremes
    results['ext_size'] = [len(id_) for id_ in extreme_ids]

    # Calculates the p value for all the groups and for the extremes
    results['all_p_value'] = test(metric, question, data, group_ids,
                                  permutations=999)
    results['ext_p_value'] = test(metric, question, data, extreme_ids,
                                  permutations=999)

    # Formats only the file name, so a '%' in the directory path is harmless
    power_fn = '%s.p' % question.name

    # Effect size across all groups
    a_power, a_counts = _load_power(os.path.join(all_power_dir, power_fn))
    (results['all_effect_mean'],
     results['all_effect_ci']) = _summarize_effect(a_counts, a_power)

    # Effect size between the extreme groups. The gate uses the extreme
    # power values (it previously reused the all-groups power by mistake).
    e_power, e_counts = _load_power(os.path.join(ext_power_dir, power_fn))
    (results['ext_effect_mean'],
     results['ext_effect_ci']) = _summarize_effect(e_counts, e_power)

    return results

Now, let's apply that function to our data, and save the output.

In [8]:
summary = []
for name in sorted(agdic.dictionary.keys()):
# for name in ['ANTIBIOTIC_HISTORY', 'BMI_CAT', 'IBD']:
    if use_subset and name in {'ANTIBIOTIC_HISTORY', 'DIABETES', 'BMI_CAT', 'IBD'}:
        continue
    question = agdic.ag_dictionary(name)
    if question.type == 'Continous':
        continue
    print name
    try:
        % time results = generate_summary(question, data, test, metric) 
    except:
        pass
    summary.append(results)

AGE_CAT
CPU times: user 47.9 s, sys: 10.3 s, total: 58.2 s
Wall time: 58.6 s
ALCOHOL_FREQUENCY
CPU times: user 45.3 s, sys: 10.3 s, total: 55.6 s
Wall time: 55.9 s
ALCOHOL_TYPES_BEERCIDER
CPU times: user 15.7 s, sys: 626 ms, total: 16.4 s
Wall time: 16.5 s
ALCOHOL_TYPES_RED_WINE
CPU times: user 17.1 s, sys: 1.3 s, total: 18.4 s
Wall time: 18.5 s
ALCOHOL_TYPES_SOUR_BEERS
CPU times: user 20.3 s, sys: 2.34 s, total: 22.6 s
Wall time: 22.7 s
ALCOHOL_TYPES_SPIRITSHARD_ALCOHOL
CPU times: user 15.8 s, sys: 544 ms, total: 16.4 s
Wall time: 16.5 s
ALCOHOL_TYPES_WHITE_WINE
CPU times: user 15.7 s, sys: 396 ms, total: 16.1 s
Wall time: 16.2 s
BOWEL_MOVEMENT_FREQUENCY
CPU times: user 11.3 s, sys: 673 ms, total: 12 s
Wall time: 12 s
BOWEL_MOVEMENT_QUALITY
CPU times: user 19.6 s, sys: 2.1 s, total: 21.7 s
Wall time: 21.8 s
CAT
CPU times: user 1min 51s, sys: 28.1 s, total: 2min 19s
Wall time: 2min 19s
CHICKENPOX
CPU times: user 1min 58s, sys: 33.2 s, total: 2min 31s
Wall time: 2min 32s
COLLECTION_MONT

Finally, let's look for categories that have clear effect sizes.

In [9]:
# One row per summarized question, indexed by the question name.
summary_df = pd.DataFrame(summary).set_index('name')
# Written into the results directory set up earlier, instead of a hard-coded
# path on one user's Desktop, so the notebook runs on other machines.
summary_df.to_csv(os.path.join(save_dir, 'subset_%s.txt' % metric),
                  sep='\t', na_rep='NA')


In [None]:
# Bare expression: the notebook renders the summary table as the cell output.
summary_df