The purpose of this notebook is to evaluate a set of metadata categories to identify factors which may be interesting to pursue further.

In [1]:
import os
import pickle

import numpy as np
import scipy.stats
import skbio

from multiprocessing import Pool

from skbio.stats.power import subsample_power, confidence_bound

from americangut.ag_data_dictionary import ag_data_dictionary
from americangut.ag_data import AgData

Let's select a dataset to use.

In [2]:
# Dataset selection; these values are passed to the AgData constructor in the
# following cell as `bodysite`, `trim` and `depth`.
bodysite = 'fecal'
sequence_trim = '100nt'
rarefaction_depth = '10k'

# Forwarded to AgData as `sub_participants` and `one_sample` respectively.
use_subset = False
use_one_sample = True

Next, let's select a list of groups to interrogate.

In [3]:
# Build the dataset object from the configuration values chosen above.
# `fecal_data` is used as a global by beta_test and generate_summary.
fecal_data = AgData(bodysite=bodysite, 
                    trim=sequence_trim, 
                    depth=rarefaction_depth, 
                    sub_participants=use_subset, 
                    one_sample=use_one_sample)
# NOTE(review): presumably these drop samples with outlying alpha diversity
# and BMI values from `fecal_data` in place -- confirm against AgData.
fecal_data.drop_alpha_outliers()

fecal_data.drop_bmi_outliers()

In [4]:
def alpha_test(metric, map_, ids):
    """Compares an alpha diversity metric between groups with Kruskal-Wallis

    Parameters
    ----------
    metric : str
        The column in `map_` holding the values to compare.
    map_ : DataFrame
        The table the values are looked up in, indexed by the labels
        contained in `ids`.
    ids : iterable
        One collection of index labels per group.

    Returns
    -------
    float
        The p-value of the Kruskal-Wallis test across the groups.

    """
    per_group = []
    for labels in ids:
        per_group.append(map_.loc[labels, metric])
    _, p_value = scipy.stats.kruskal(*per_group)
    return p_value

In [5]:
def beta_test(metric, group, ids, permutations=249):
    """Tests for a difference in beta diversity between groups with PERMANOVA

    The distance matrix and metadata are read from the global `fecal_data`.

    Parameters
    ----------
    metric : str
        The key of the distance matrix in `fecal_data.beta`.
    group : object
        The question being tested; `group.name` is used as the grouping
        column.
    ids : iterable
        One collection of sample ids per group. The collections are
        concatenated before testing.
    permutations : int, optional
        The number of permutations passed to PERMANOVA.

    Returns
    -------
    float
        The `'p-value'` entry of the PERMANOVA result.

    """
    sample_ids = np.hstack(ids)

    distances = fecal_data.beta[metric].filter(sample_ids)
    metadata = fecal_data.map_.loc[sample_ids]

    permanova_result = skbio.stats.distance.permanova(
        distance_matrix=distances,
        grouping=metadata,
        column=group.name,
        permutations=permutations,
    )
    return permanova_result['p-value']

In [25]:
def _save_pickle(path, payload):
    """Pickles `payload` to `path`, creating the parent directory if needed"""
    out_dir = os.path.dirname(path)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Pickle streams are binary, so the file has to be opened in 'wb' mode.
    with open(path, 'wb') as f_:
        pickle.dump(payload, f_)


def _power_effect(test, samples, min_counts, path, group, metric):
    """Runs the subsampled power analysis, saves it and returns the effect

    Returns a tuple of the mean z-based effect size and its confidence bound.
    """
    power, counts = subsample_power(
        test=test,
        samples=samples,
        min_counts=min_counts,
        counts_interval=10,
        max_counts=50,
        num_runs=5,
        num_iter=500,
        )
    _save_pickle(path, (group.name, group.order, metric, power, counts))
    effect = z_effect(counts, power)
    return effect.mean(), confidence_bound(effect)


def generate_summary(group):
    """Summarizes the alpha and beta diversity results for a metadata group

    For each alpha diversity metric (Kruskal-Wallis) and each UniFrac metric
    (PERMANOVA), the p-value and a power-based effect size are calculated
    twice: across all the groups in `group.order`, and across only the groups
    in `group.extremes`. The raw power results and the final summary are
    pickled under the global `save_dir`.

    This function relies on the globals `fecal_data`, `save_dir` and
    `z_effect`; the cells defining them must be run before it is called.

    Parameters
    ----------
    group : object
        The question to summarize. It must expose `name`, `order` and
        `extremes`, and be accepted by `fecal_data.clean_up_column` and
        `fecal_data.return_dataset`.

    Returns
    -------
    dict
        The summary which was written to `summary/<group.name>.p`.

    """
    fecal_data.clean_up_column(group)

    order = group.order

    results = {'name': group.name,
               'groups': order,
               'extremes': group.extremes}

    map_, _otu, _beta = fecal_data.return_dataset(group)

    grouped = map_.groupby(group.name).groups
    group_ids = [grouped[o] for o in order]
    extreme_ids = [grouped[o] for o in group.extremes]

    for metric in ['PD_whole_tree_10k', 'shannon_10k']:
        results['%s_p_all' % metric] = alpha_test(metric, map_, group_ids)
        results['%s_p_extreme' % metric] = alpha_test(metric, map_,
                                                      extreme_ids)

        results['%s_eff_all' % metric] = _power_effect(
            test=lambda x: alpha_test(metric, map_, x),
            samples=group_ids,
            min_counts=5,
            path=os.path.join(save_dir,
                              'all_power/%s/%s.p' % (metric, group.name)),
            group=group,
            metric=metric,
            )
        results['%s_eff_ext' % metric] = _power_effect(
            test=lambda x: alpha_test(metric, map_, x),
            samples=extreme_ids,
            min_counts=5,
            path=os.path.join(save_dir,
                              'extreme_power/%s/%s.p' % (metric, group.name)),
            group=group,
            metric=metric,
            )

    # NOTE(review): the metric names already end in "_unifrac", so the keys
    # and directories below contain "unifrac" twice. They are left unchanged
    # so that anything reading these results keeps working.
    for metric in ['unweighted_unifrac', 'weighted_unifrac']:
        results['%s_unifrac_p_all' % metric] = beta_test(metric, group,
                                                         group_ids, 999)
        results['%s_unifrac_p_extreme' % metric] = beta_test(metric, group,
                                                             extreme_ids, 999)

        # The test must be applied to the subsample `x` handed over by
        # subsample_power; testing the full id lists would make every
        # iteration identical and the power estimate meaningless.
        results['%s_unifrac_eff_all' % metric] = _power_effect(
            test=lambda x: beta_test(metric, group, x, 99),
            samples=group_ids,
            min_counts=10,
            path=os.path.join(
                save_dir,
                'all_power/%s_unifrac/%s.p' % (metric, group.name)),
            group=group,
            metric=metric,
            )
        results['%s_unifrac_eff_ext' % metric] = _power_effect(
            test=lambda x: beta_test(metric, group, x, 99),
            samples=extreme_ids,
            min_counts=10,
            path=os.path.join(
                save_dir,
                'extreme_power/%s_unifrac/%s.p' % (metric, group.name)),
            group=group,
            metric=metric,
            )

    _save_pickle(os.path.join(save_dir, 'summary/%s.p' % group.name), results)

    return results

In [None]:
%%timeit
generate_summary(ag_data_dictionary['AGE_CAT'])

In [None]:
# Scratch setup: builds the same intermediate values generate_summary uses,
# for a single question, so they can be inspected interactively.
# for name, group in ag_data_dictionary.iteritems():
#     if group.type == 'AgContinous':
#         continue
name = 'AGE_CAT'
# The dictionary is imported as `ag_data_dictionary`; `data_dictionary` is
# never defined in this notebook.
group = ag_data_dictionary[name]

if group.type in {'Ordinal', 'Frequency'}:
    order = group.order
elif group.type in {'Categorical', 'Bool', 'Clinical'}:
    order = list(group.groups)
else:
    # Fail here rather than with a confusing NameError on `order` below.
    raise ValueError('Cannot determine the group order for a question of '
                     'type %s' % group.type)

results = {'name': group.name,
           'groups': order,
           'extremes': group.extremes}
map_, otu_, beta = fecal_data.return_dataset(group)

# Sample ids for every group, and for the extreme groups only
grouped = map_.groupby(group.name).groups
group_ids = [grouped[o] for o in order]
extreme_ids = [grouped[o] for o in group.extremes]

In [15]:
# Root directory for the pickled results. generate_summary reads this global
# when it is called, so this cell has to be run before calling it.
# NOTE(review): machine-specific absolute path -- adjust before running
# anywhere else.
save_dir = '/Users/jwdebelius/Desktop/ag_summary/'

In [None]:
# Interactive check: does the `all_power/` output directory exist yet?
os.path.exists(os.path.join(save_dir, 'all_power/'))

In [None]:
# Interactive check: show the signature and documentation of beta_test.
help(beta_test)

In [None]:
# Interactive check: list the keys of the `beta` object returned by
# fecal_data.return_dataset(group).
beta.keys()

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

In [22]:
import numpy as np

import scipy.stats

from statsmodels.stats.power import FTestAnovaPower
from scipy.stats import norm as z
# Shared ANOVA F-test power solver; extrapolate_f calls `ft.solve_power`.
ft = FTestAnovaPower()


def extrapolate_f(counts, pwr_, alpha=0.05):
    """Estimates the F-test effect size behind each empirical power value

    Despite its name, this function does not extrapolate a power curve: it
    returns the effect size solved for each observed power value.

    Parameters
    ----------
    counts : array
        The number of observations drawn to calculate the observed power.
        Each entry corresponds to a column in `pwr_`.
    pwr_ : array
        The observed power. Each column corresponds to the number of
        observations given in `counts`. The rows correspond to different
        runs.
    alpha : float, optional
        The critical value for power calculations.

    Returns
    -------
    effs : array
        The effect size solved from each power value, with the same shape as
        `pwr_`. Positions where the solver failed are left as NaN.

    """
    effs = np.full(pwr_.shape, np.nan)
    for idx, pwr in enumerate(pwr_):
        for idy, cnt in enumerate(counts):
            try:
                effs[idx, idy] = ft.solve_power(effect_size=None,
                                                nobs=cnt,
                                                alpha=alpha,
                                                power=pwr[idy])
            except Exception:
                # Best effort: leave NaN where the solver cannot find an
                # effect size, but let KeyboardInterrupt and SystemExit
                # propagate.
                continue
    return effs

def z_effect(counts, power, alpha=0.05):
    """Estimates the effect size from power using the z distribution

    This is based on the equations in
        Lui, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.
    The equation assumes a positive magnitude to the effect size and a
    two-tailed test.

    Parameters
    ----------
    counts : array
        The number of observations for each power depth
    power : array
        The statistical power at the depth specified by `counts`
    alpha : float
        The critical value used to calculate the power

    Returns
    -------
    effect : array
        A standard measure of the difference between the underlying
        populations. Infinite values are removed, so the result is
        one-dimensional and may be shorter than the input.

    """
    quantile_sum = z.ppf(power) + z.ppf(1 - alpha / 2)
    effect = np.sqrt(2 * np.square(quantile_sum) / counts)
    # Infinite effects are dropped before the result is returned
    finite_mask = np.logical_not(np.isinf(effect))
    return effect[finite_mask]


def z_power(counts, eff, alpha=0.05):
    """Estimates power for a z distribution from an effect size

    This is based on the equations in
        Lui, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.
    The equation assumes a positive magnitude to the effect size and a
    two-tailed test.

    Parameters
    ----------
    counts : array
        The number of observations for each power depth
    eff : float
        A standard measure of the difference between the underlying
        populations
    alpha : float
        The critical value used to calculate the power

    Returns
    -------
    power : array
        The statistical power at the depth specified by `counts`

    """
    # Divide by a float so integer counts are not floor-divided on Python 2
    shift = eff * np.sqrt(np.asarray(counts) / 2.)
    power = (z.cdf(shift - z.ppf(1 - alpha / 2.)) +
             z.cdf(z.ppf(alpha / 2.) - shift))
    return power


In [None]:
# Interactive check: show the documentation for the PERMANOVA function called
# by beta_test.
help(skbio.stats.distance.permanova)

In [None]:
# Interactive check: mapping of each AGE_CAT value to the index labels of the
# rows in that group.
map_.groupby('AGE_CAT').groups

In [None]:
# NOTE(review): reconstructed from an unfinished draft cell which did not
# parse. It assumes `map_` and `group` come from the scratch setup cell and
# that the three-argument alpha_test(metric, map_, ids) is the one in scope;
# confirm the intended metrics and sample partition.
results = {}
alpha_metrics = ['PD_whole_tree_10k', 'shannon_10k']
partition_samples = list(map_.groupby(group.name).groups.values())
for a_metric in alpha_metrics:
    results['%s_p' % a_metric] = alpha_test(a_metric, map_, partition_samples)
    results['%s_power' % a_metric] = subsample_power(
        test=lambda x: alpha_test(a_metric, map_, x),
        samples=partition_samples,
    )

In [None]:
# Interactive check: the index labels belonging to each value of the current
# group's column.
map_.groupby(group.name).groups.values()

In [None]:
def alpha_test(metric, ids):
    """Kruskal-Wallis p-value for `metric`, read from the global `map_`

    NOTE(review): running this cell replaces the three-argument
    alpha_test(metric, map_, ids) defined earlier in the notebook, which
    generate_summary calls with three arguments. Only one of the two
    definitions should be kept.

    Parameters
    ----------
    metric : str
        The column in the global `map_` holding the values to compare.
    ids : iterable
        One collection of index labels per group.

    Returns
    -------
    float
        The p-value of the Kruskal-Wallis test across the groups.

    """
    per_group = []
    for labels in ids:
        per_group.append(map_.loc[labels, metric])
    _, p_value = scipy.stats.kruskal(*per_group)
    return p_value

In [None]:
from skbio.stats.power import confidence_bound

In [None]:
# Interactive check: show the documentation for confidence_bound, which
# generate_summary applies to the effect sizes.
help(confidence_bound)