The purpose of this notebook is to evaluate a set of metadata categories to identify factors which may be interesting to pursue further.

In [72]:
import os
import pickle

import numpy as np
import scipy.stats
import skbio

from multiprocessing import Pool

from skbio.stats.power import subsample_power, confidence_bound

from americangut.ag_data_dictionary import *
from americangut.ag_data import AgData

Let's select a dataset to use.

In [2]:
# Dataset selection: body site, sequence trim length and rarefaction depth.
bodysite, sequence_trim, rarefaction_depth = 'fecal', '100nt', '10k'

# Sample-selection switches handed to the data loader.
use_subset, use_one_sample = False, True

Next, let's select a list of groups to interrogate.

In [3]:
# Gather the loader options in one place before building the data set.
load_options = dict(
    bodysite=bodysite,
    trim=sequence_trim,
    depth=rarefaction_depth,
    sub_participants=use_subset,
    one_sample=use_one_sample,
)
fecal_data = AgData(**load_options)

# Outlier filtering happens up front, before any statistics are calculated.
# NOTE(review): presumably these drop samples with outlying alpha diversity
# and BMI values respectively -- confirm against the AgData implementation.
fecal_data.drop_alpha_outliers()
fecal_data.drop_bmi_outliers()

In [None]:
def _remap_abx(x):
    if x == 'I have not taken antibiotics in the past year':
        return 'More than 1 year'
def _remap_bowel_frequency(x):
    if x in {'Four', 'Five or more'}:
        return 'Four or more'
    else:
        return np.nan
def _remap_fecal_quality(x):
    if x == 'I tend to be constipated (have difficulty passing stool)':
        return 'Constipated'
    elif x == 'I tend to have diarrhea (watery stool)':
        return 'Diarrhea'
    elif x == 'I tend to have normal formed stool':
        return 'Normal'
    else:
        return x
def _remap_contraceptive(x):
    if isinstance(x, str):
        return x.split(',')[0]
    else:
        return x

In [22]:
def generate_summary(group, min_counts=10, counts_interval=10, max_counts=50,
                     num_runs=5, num_iter=500):
    """Summarizes alpha and beta diversity tests for one metadata category.

    For every alpha diversity metric and both UniFrac distances, a p-value is
    calculated across all groups in the category and across the extreme
    groups only. Empirical power is then estimated for both comparisons with
    `subsample_power`.

    Parameters
    ----------
    group : object
        A data dictionary entry. The attributes `name`, `type`, `extremes`
        and either `order` or `groups` are read.
    min_counts, counts_interval, max_counts : int, optional
        The range of subsampling depths passed to `subsample_power`. The
        defaults match the interactive power cells in this notebook.
    num_runs, num_iter : int, optional
        The number of runs and the iterations per run passed to
        `subsample_power`.

    Returns
    -------
    dict
        Holds 'name', 'groups' and 'extremes', plus the following entries
        for each alpha metric name and each short distance name
        ('unweighted', 'weighted'):

        * '<metric>_p_all' and '<metric>_p_extreme' : the p-values.
        * '<metric>_power_all' and '<metric>_power_extreme' : the values
          returned by `subsample_power`.

    Raises
    ------
    ValueError
        If `group.type` is not a supported category type.
    """
    fecal_data.clean_up_column(group)

    # 'Ordinal' is the label used by the interactive cell in this notebook;
    # 'Order' is still accepted so the original check keeps working.
    if group.type in {'Order', 'Ordinal', 'Frequency'}:
        order = group.order
    elif group.type in {'Categorical', 'Bool', 'Clinical'}:
        order = list(group.groups)
    else:
        raise ValueError('Cannot summarize a group of type %r' % group.type)

    results = {'name': group.name,
               'groups': order,
               'extremes': group.extremes}

    map_, _, _ = fecal_data.return_dataset(group)

    grouped = map_.groupby(group.name).groups
    group_ids = [grouped[o] for o in order]
    extreme_ids = [grouped[o] for o in group.extremes]

    # Groups are drawn independently. The "matched" mode of subsample_power
    # needs the same number of observations in every group, which metadata
    # categories do not provide.
    power_kwargs = dict(draw_mode='ind',
                        min_counts=min_counts,
                        counts_interval=counts_interval,
                        max_counts=max_counts,
                        num_runs=num_runs,
                        num_iter=num_iter)

    # NOTE(review): alpha_test looks values up in the notebook-level `map_`,
    # not in the `map_` returned above -- confirm both cover the same samples.
    for metric in ['shannon_10k', 'PD_whole_tree_10k', 'observed_otus_10k',
                   'chao1_10k']:
        # The metric is bound as a default so the callable does not depend on
        # the later value of the loop variable.
        def test_alpha(ids, metric=metric):
            return alpha_test(metric, ids)

        results['%s_p_all' % metric] = test_alpha(group_ids)
        results['%s_p_extreme' % metric] = test_alpha(extreme_ids)
        results['%s_power_all' % metric] = subsample_power(
            test=test_alpha, samples=group_ids, **power_kwargs)
        results['%s_power_extreme' % metric] = subsample_power(
            test=test_alpha, samples=extreme_ids, **power_kwargs)

    for metric in ['unweighted_unifrac', 'weighted_unifrac']:
        # The interactive beta cell calls beta_test with the short names
        # ('unweighted', 'weighted'), which are also the keys `beta.keys()`
        # displays, so the short name is used for lookup and result keys.
        # NOTE(review): presumably fecal_data.beta uses the same keys --
        # confirm.
        short = metric.split('_')[0]

        # Fewer permutations are used inside the power simulation because the
        # test is repeated num_runs * num_iter times per depth.
        def test_beta(ids, short=short):
            return beta_test(short, group, ids, permutations=99)

        results['%s_p_all' % short] = \
            beta_test(short, group, group_ids, permutations=999)
        results['%s_p_extreme' % short] = \
            beta_test(short, group, extreme_ids, permutations=999)
        results['%s_power_all' % short] = subsample_power(
            test=test_beta, samples=group_ids, **power_kwargs)
        results['%s_power_extreme' % short] = subsample_power(
            test=test_beta, samples=extreme_ids, **power_kwargs)

    return results

In [26]:
# This cell sets up a single category by hand. The commented-out loop header
# over the whole data dictionary is kept from the original cell.
# for name, group in data_dictionary.iteritems():
#     if group.type == 'AgContinous':
#         continue
name = 'AGE_CAT'
group = data_dictionary[name]

# Unordered category types take their labels from `group.groups`; ordered
# types supply their own ordering.
if group.type in {'Categorical', 'Bool', 'Clinical'}:
    order = list(group.groups)
elif group.type in {'Ordinal', 'Frequency'}:
    order = group.order

results = dict(name=group.name, groups=order, extremes=group.extremes)
map_, otu_, beta = fecal_data.return_dataset(group)

# Row labels of the mapping file, collected per group label.
grouped = map_.groupby(group.name).groups
group_ids = [grouped[label] for label in order]
extreme_ids = [grouped[label] for label in group.extremes]

In [86]:
# Directory that receives the pickled power results. Setting the
# AG_SUMMARY_DIR environment variable overrides the original local path, so
# the notebook can run on another machine without editing this cell.
save_dir = os.environ.get('AG_SUMMARY_DIR',
                          '/Users/jwdebelius/Desktop/ag_summary/')

In [88]:
# Check that the 'all_power/' subdirectory exists under save_dir; the result
# is displayed as the cell output.
os.path.exists(os.path.join(save_dir, 'all_power/'))

True

In [91]:
for metric in ['PD_whole_tree_10k', 'shannon_10k']:
    results['%s_p_all' % metric] = alpha_test(metric, group_ids)
    results['%s_p_extreme' % metric] = alpha_test(metric, extreme_ids)

    # Empirical power across every group in the category.
    a_power, a_counts = subsample_power(
        test=lambda x: alpha_test(metric, x),
        samples=group_ids,
        min_counts=10,
        counts_interval=10,
        max_counts=50,
        num_runs=5,
        num_iter=500,
        )
    # NOTE(review): the file is named after the metric rather than the
    # category (`name`), so another category would overwrite it -- confirm
    # that this is intended.
    # Pickle output is binary data, so the file is opened in binary mode.
    with open(os.path.join(save_dir, 'all_power/%s/%s.p' % (metric, metric)),
              'wb') as f_:
        pickle.dump((name, order, metric, a_power, a_counts), f_)
    a_effect = z_effect(a_counts, a_power)
    results['%s_eff_all' % metric] = (a_effect.mean(),
                                      confidence_bound(a_effect))

    # Empirical power between the extreme groups only.
    e_power, e_counts = subsample_power(
        test=lambda x: alpha_test(metric, x),
        samples=extreme_ids,
        min_counts=10,
        counts_interval=10,
        max_counts=50,
        num_runs=5,
        num_iter=500,
        )
    with open(os.path.join(save_dir,
                           'extreme_power/%s/%s.p' % (metric, metric)),
              'wb') as f_:
        # The extreme-group results are stored here, not the all-group
        # results that were already written above.
        pickle.dump((name, order, metric, e_power, e_counts), f_)
    e_effect = z_effect(e_counts, e_power)
    results['%s_eff_ext' % metric] = (e_effect.mean(),
                                      confidence_bound(e_effect))

In [None]:
for metric in ['unweighted', 'weighted']:
    results['%s_unifrac_p_all' % metric] = beta_test(metric, group, group_ids, 999)
    results['%s_unifrac_p_extreme' % metric] = beta_test(metric, group, extreme_ids, 999)

    # Empirical power across every group in the category. The test must be
    # applied to the subsample `x` that subsample_power draws. Testing the
    # full `group_ids` every time would return the same p-value at every
    # depth.
    a_power, a_counts = subsample_power(
        test=lambda x: beta_test(metric, group, x, 99),
        samples=group_ids,
        min_counts=10,
        counts_interval=10,
        max_counts=50,
        num_runs=5,
        num_iter=500,
        )
    # NOTE(review): the file is named after the metric rather than the
    # category (`name`), so another category would overwrite it -- confirm
    # that this is intended.
    # Pickle output is binary data, so the file is opened in binary mode.
    with open(os.path.join(save_dir,
                           'all_power/%s_unifrac/%s.p' % (metric, metric)),
              'wb') as f_:
        pickle.dump((name, order, metric, a_power, a_counts), f_)
    a_effect = z_effect(a_counts, a_power)
    results['%s_unifrac_eff_all' % metric] = (a_effect.mean(),
                                              confidence_bound(a_effect))

    # Empirical power between the extreme groups only.
    e_power, e_counts = subsample_power(
        test=lambda x: beta_test(metric, group, x, 99),
        samples=extreme_ids,
        min_counts=10,
        counts_interval=10,
        max_counts=50,
        num_runs=5,
        num_iter=500,
        )
    with open(os.path.join(save_dir,
                           'extreme_power/%s_unifrac/%s.p' % (metric, metric)),
              'wb') as f_:
        # The extreme-group results are stored here, not the all-group
        # results that were already written above.
        pickle.dump((name, order, metric, e_power, e_counts), f_)

    e_effect = z_effect(e_counts, e_power)
    results['%s_unifrac_eff_ext' % metric] = (e_effect.mean(),
                                              confidence_bound(e_effect))

In [94]:
# Show the signature of beta_test, which is defined in a later cell of this
# notebook.
# NOTE(review): the recorded output lists `ids=None`, which differs from the
# definition in this notebook -- presumably it came from an earlier version;
# re-run to refresh.
help(beta_test)

Help on function beta_test in module __main__:

beta_test(metric, group, ids=None, permutations=249)



In [92]:
# List the keys of `beta`, the third item returned by
# fecal_data.return_dataset.
beta.keys()

['unweighted', 'weighted']

In [46]:
import matplotlib.pyplot as plt
% matplotlib inline

In [63]:
import numpy as np

import scipy.stats

from statsmodels.stats.power import FTestAnovaPower
from scipy.stats import norm as z
# Shared solver instance; extrapolate_f calls its solve_power method.
ft = FTestAnovaPower()


def extrapolate_f(counts, pwr_, alpha=0.05):
    """Estimates the ANOVA F-test effect size behind each empirical power

    Despite its name, the function currently stops at the effect size
    estimate and does not extrapolate a power curve.

    Parameters
    ----------
    counts : array
        The number of observations drawn to calculate each column of `pwr_`.
    pwr_ : array
        The observed power. Each column corresponds to the number of
        observations given in `counts`. The rows correspond to different
        runs.
    alpha : float, optional
        The critical value for power calculations.

    Returns
    -------
    effs : array
        The estimated effect sizes, with the same shape as `pwr_`. Entries
        for which the effect size could not be solved are NaN.

    """
    # Starts from all-NaN so that unsolved entries are easy to mask later
    effs = np.full(pwr_.shape, np.nan)
    for idx, pwr in enumerate(pwr_):
        for idy, cnt in enumerate(counts):
            try:
                effs[idx, idy] = ft.solve_power(effect_size=None,
                                                nobs=cnt,
                                                alpha=alpha,
                                                power=pwr[idy])
            except Exception:
                # Best effort: an entry the solver cannot handle stays NaN.
                # KeyboardInterrupt and SystemExit are no longer swallowed.
                continue
    # TODO: the extrapolation step (solving for power at the mean effect
    # size over `counts`) was sketched by the author but left disabled.
    return effs

def z_effect(counts, power, alpha=0.05):
    """Back-calculates an effect size from power using the z distribution

    This is based on the equations in
        Liu, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.
    The equation assumes a positive magnitude to the effect size and a
    two-tailed test.

    Parameters
    ----------
    counts : array
        The number of observations for each power depth
    power : array
        The statistical power at the depth specified by `counts`
    alpha : float
        The critical value used to calculate the power

    Returns
    -------
    effect : array
        A standard measure of the difference between the underlying
        populations. Infinite estimates are removed, so the result may hold
        fewer elements than `power`.
    """
    critical = z.ppf(1 - alpha / 2)
    effect = np.sqrt(2 * np.square(z.ppf(power) + critical) / counts)
    # A power of exactly 0 or 1 has an infinite quantile; those entries are
    # dropped.
    return effect[np.logical_not(np.isinf(effect))]


def z_power(counts, eff, alpha=0.05):
    """Estimates power for a z distribution from an effect size

    This is based on the equations in
        Liu, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.
    The equation assumes a positive magnitude to the effect size and a
    two-tailed test.

    Parameters
    ----------
    counts : array
        The number of observations for each power depth
    eff : float
        A standard measure of the difference between the underlying
        populations
    alpha : float
        The critical value used to calculate the power

    Returns
    -------
    power : array
        The statistical power at the depth specified by `counts`

    """
    # Integer counts are converted to floats so that halving them never
    # truncates (Python 2 integer division). This also lets plain lists be
    # passed in.
    counts = np.asarray(counts, dtype=float)
    shift = eff * np.sqrt(counts / 2.0)
    # Sum of both rejection regions of the two-tailed test
    power = (z.cdf(shift - z.ppf(1 - alpha / 2.0)) +
             z.cdf(z.ppf(alpha / 2.0) - shift))
    return power


In [101]:
def beta_test(metric, group, ids, permutations=249):
    """Runs PERMANOVA on one distance matrix and returns the p-value

    Parameters
    ----------
    metric : str
        The key of the distance matrix in `fecal_data.beta`.
    group : object
        The category to test; its `name` attribute selects the grouping
        column.
    ids : list of array-like
        The row labels belonging to each group. They are concatenated before
        the distance matrix and mapping data are filtered.
    permutations : int, optional
        The number of permutations passed to PERMANOVA.

    Returns
    -------
    float
        The 'p-value' entry of the PERMANOVA result.
    """
    sample_ids = np.hstack(ids)

    # Restrict the distances and the metadata to the requested samples
    distances = fecal_data.beta[metric].filter(sample_ids)
    metadata = fecal_data.map_.loc[sample_ids]

    outcome = skbio.stats.distance.permanova(
        distances,
        metadata,
        column=group.name,
        permutations=permutations,
    )
    return outcome['p-value']

In [98]:
# Display the documentation of the PERMANOVA function called by beta_test.
help(skbio.stats.distance.permanova)

Help on function permanova in module skbio.stats.distance._permanova:

permanova(distance_matrix, grouping, column=None, permutations=999)
    Test for significant differences between groups using PERMANOVA.
    
    State: Experimental as of 0.4.0.
    
    Permutational Multivariate Analysis of Variance (PERMANOVA) is a
    non-parametric method that tests whether two or more groups of objects
    (e.g., samples) are significantly different based on a categorical factor.
    It is conceptually similar to ANOVA except that it operates on a distance
    matrix, which allows for multivariate analysis. PERMANOVA computes a
    pseudo-F statistic.
    
    Statistical significance is assessed via a permutation test. The assignment
    of objects to groups (`grouping`) is randomly permuted a number of times
    (controlled via `permutations`). A pseudo-F statistic is computed for each
    permutation and the p-value is the proportion of permuted pseudo-F
    statisics that are equal to or 

In [None]:
# Inspect the row labels (presumably sample IDs) that fall in each AGE_CAT
# group.
map_.groupby('AGE_CAT').groups

In [None]:
results = {}
# One collection of row labels per group of the category under study.
# NOTE(review): the original assignment was left unfinished; the groupby
# expression from the neighbouring inspection cell is presumably what was
# intended -- confirm.
partition_samples = list(map_.groupby(group.name).groups.values())
# NOTE(review): `alpha_metrics` is not defined in the visible cells; it must
# exist before this cell runs.
for a_metric in alpha_metrics:
    # alpha_test takes the metric and the per-group row labels.
    results['%s_p' % a_metric] = alpha_test(a_metric, partition_samples)
    results['%s_power' % a_metric] = subsample_power(
        test=lambda x: alpha_test(a_metric, x),
        samples=partition_samples,
    )

In [None]:
# Inspect the per-group collections of row labels for the current category.
map_.groupby(group.name).groups.values()

In [9]:
def alpha_test(metric, ids):
    """Compares one mapping column between groups with a Kruskal-Wallis test

    Parameters
    ----------
    metric : str
        The column of the notebook-level `map_` to compare.
    ids : list of array-like
        The row labels of `map_` belonging to each group.

    Returns
    -------
    float
        The second value returned by `scipy.stats.kruskal` (the p-value).
    """
    per_group = [map_.loc[sample_ids, metric] for sample_ids in ids]
    _, p_value = scipy.stats.kruskal(*per_group)
    return p_value

In [56]:
from skbio.stats.power import confidence_bound

In [57]:
# Display the documentation of confidence_bound, which the power cells use to
# put a bound around the mean effect size.
help(confidence_bound)

Help on function confidence_bound in module skbio.stats.power:

confidence_bound(vec, alpha=0.05, df=None, axis=None)
    Calculates a confidence bound assuming a normal distribution
    
    State: Experimental as of 0.4.0.
    
    Parameters
    ----------
    vec : array_like
        The array of values to use in the bound calculation.
    alpha : float, optional
        The critical value, used for the confidence bound calculation.
    df : float, optional
        The degrees of freedom associated with the
        distribution. If None is given, df is assumed to be the number of
        elements in specified axis.
    axis : positive int, optional
        The axis over which to take the deviation. When axis
        is None, a single value will be calculated for the whole matrix.
    
    Returns
    -------
    bound : float
        The confidence bound around the mean. The confidence interval is
        [mean - bound, mean + bound].

