In [1]:
# Select matplotlib's 'Agg' backend before the remaining imports run.
from matplotlib import use
use('Agg')  #noqa

In [26]:
import os
import pickle

from multiprocessing import Pool

import numpy as np
import pandas as pd
import scipy
from scipy.stats import norm as z
import skbio

from skbio.stats.power import subsample_power, confidence_bound

from americangut.ag_data import AgData, ag_data_dictionary

In [3]:
# Dataset selection constants. Each one is forwarded to the AgData
# constructor (bodysite=, trim=, depth=, one_sample=) when the data is loaded.
BODYSITE = 'fecal'
TRIM = '100nt'
DEPTH = '10k'
ONE_SAMPLE = True

In [75]:
def _alpha_test(metric, map_, ids):
    """Tests alpha diversity with a krusal wallis test"""
    alpha = [map_.loc[i, metric] for i in ids]
    return scipy.stats.kruskal(*alpha)[1]

In [76]:
def _beta_test(metric, group, data, ids, permutations=249):
    """Compares beta diversity between groups with a PERMANOVA

    Parameters
    ----------
    metric : str
        Key into `data.beta` selecting the distance matrix.
    group : object
        Supplies `name`, the column of `data.map_` used for grouping.
    data : object
        Provides `beta` (distance matrices keyed by metric) and `map_`
        (a table indexed by the ids).
    ids : list
        Collections of ids, one per group; they are concatenated with
        `np.hstack` before the distance matrix and table are subset.
    permutations : int, optional
        Number of permutations passed to the PERMANOVA.

    Returns
    -------
    The 'p-value' entry of the `skbio.stats.distance.permanova` result.
    """
    sample_ids = np.hstack(ids)
    distances = data.beta[metric].filter(sample_ids)
    metadata = data.map_.loc[sample_ids]
    permanova_res = skbio.stats.distance.permanova(
        distance_matrix=distances,
        grouping=metadata,
        column=group.name,
        permutations=permutations,
    )
    return permanova_res['p-value']

In [6]:
def z_effect(counts, power, alpha=0.05):
    """Estimates the effect size for power based on the z distribution

    This is based on the equations in
        Liu, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.
    The equation assumes a positive magnitude to the effect size and a
    two-tailed test.

    Parameters
    ----------
    counts : array
        The number of observations for each power depth
    power : array
        The statistical power at the depth specified by `counts`
    alpha : float
        The critical value used to calculate the power

    Returns
    -------
    effect : array
        A standard measure of the difference between the underlying
        populations. Infinite estimates are removed, so the result is
        one-dimensional and may hold fewer values than `power`.
    """
    # z-score matching the observed power plus the two-tailed critical z-score
    z_diff = z.ppf(power) + z.ppf(1 - alpha / 2)
    eff = np.sqrt(2 * np.square(z_diff) / counts)
    # A power of exactly 0 or 1 has an infinite z-score; drop those estimates
    return eff[~np.isinf(eff)]

In [12]:
# Metadata category analysed by the cells below.
group = ag_data_dictionary['AGE_CAT']
# Forwarded to AgData as `sub_participants`.
subset = False
# NOTE(review): absolute, user-specific output directory; point this at a
# local path before running the notebook elsewhere.
save_dir = '/Users/jwdebelius/Desktop/test/'

In [9]:
%%time 
# Load the dataset selected by the configuration constants and `subset`.
data = AgData(bodysite=BODYSITE,
              trim=TRIM,
              depth=DEPTH,
              one_sample=ONE_SAMPLE,
              sub_participants=subset,
              )

CPU times: user 7.55 s, sys: 333 ms, total: 7.88 s
Wall time: 7.97 s


In [11]:
%%time
# Outlier removal. The return values are discarded, so both calls presumably
# modify `data` in place; the criteria are defined in AgData (not shown here).
data.drop_alpha_outliers()
data.drop_bmi_outliers()

CPU times: user 944 ms, sys: 137 ms, total: 1.08 s
Wall time: 1.08 s


In [14]:
%%time
# Prepare the column described by `group`; presumably an in-place clean-up of
# `data` (return value unused) - confirm against AgData.clean_group.
data.clean_group(group)

CPU times: user 466 ms, sys: 33.7 ms, total: 500 ms
Wall time: 493 ms


In [16]:
%%time
# Accumulator for every statistic computed in the following cells, seeded with
# the category name and its ordered / extreme values.
results = {'name': group.name,
           'groups': group.order,
           'extremes': group.extremes}

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 12.2 µs


In [20]:
%%time
# Only `map_` is referenced by later cells in the visible part of this
# notebook; `otu_` and `beta` are not used again.
map_, otu_, beta = data.return_dataset(group)

CPU times: user 1.14 s, sys: 60.7 ms, total: 1.2 s
Wall time: 1.2 s


In [22]:
%%time
# Index labels of `data.map_` for each value of the group column, collected
# once for every category in `group.order` and once for `group.extremes`.
grouped = data.map_.groupby(group.name).groups
group_ids = [grouped[o] for o in group.order]
extreme_ids = [grouped[o] for o in group.extremes]

CPU times: user 613 µs, sys: 34 µs, total: 647 µs
Wall time: 630 µs


In [23]:
# Alpha diversity column tested by the next cells.
metric = 'PD_whole_tree_10k'

In [27]:
%%time
# Kruskal-Wallis p-value for `metric` across every category in group.order.
results['%s_p_all' % metric] = _alpha_test(metric, map_, group_ids)

CPU times: user 7.64 ms, sys: 1.12 ms, total: 8.76 ms
Wall time: 7.86 ms


In [29]:
%%time
# Same test restricted to the categories listed in group.extremes.
results['%s_p_extreme' % metric] = _alpha_test(metric, map_,
                                                      extreme_ids)

CPU times: user 3 ms, sys: 500 µs, total: 3.5 ms
Wall time: 3.02 ms


In [34]:
%%time
 a_power, a_counts = subsample_power(
                    test=lambda x: _alpha_test(metric, data.map_, x),
                    samples=group_ids,
                    min_counts=5,
                    counts_interval=10,
                    max_counts=50,
                    num_runs=5,
                    num_iter=500,
                    )

CPU times: user 46 s, sys: 106 ms, total: 46.1 s
Wall time: 46.1 s


In [36]:
%%time
a_eff = z_effect(a_counts, a_power)
results['%s_eff_all' % metric] = a_eff.mean()
results['%s_eff_lo' % metric] = a_eff.mean() - confidence_bound(a_eff)
results['%s_eff_hi' % metric] = a_eff.mean() + confidence_bound(a_eff)

CPU times: user 1.48 ms, sys: 696 µs, total: 2.17 ms
Wall time: 6.16 ms


In [38]:
%%time
e_power, e_counts = subsample_power(
                    test=lambda x: _alpha_test(metric, data.map_, x),
                    samples=group_ids,
                    min_counts=5,
                    counts_interval=10,
                    max_counts=50,
                    num_runs=5,
                    num_iter=500,
                    )

CPU times: user 46.9 s, sys: 158 ms, total: 47 s
Wall time: 47.1 s


In [40]:
%%time
e_eff = z_effect(a_counts, a_power)
results['%s_eff_exe' % metric] = e_eff.mean()
results['%s_eff_exe_lo' % metric] = e_eff.mean() - \
    confidence_bound(e_eff)
results['%s_eff_exe_hi' % metric] = e_eff.mean() + \
    confidence_bound(e_eff)

CPU times: user 1.27 ms, sys: 270 µs, total: 1.54 ms
Wall time: 1.35 ms


In [41]:
# Beta diversity metric (key into data.beta) tested by the next cells.
metric = 'unweighted_unifrac'

In [45]:
# PERMANOVA p-values (999 permutations) for all categories and for the
# extreme categories. With metric = 'unweighted_unifrac' the keys expand to
# 'unweighted_unifrac_unifrac_p_all' and 'unweighted_unifrac_unifrac_p_extreme'.
%time results['%s_unifrac_p_all' % metric] = _beta_test(metric, group, data, group_ids, 999)
%time results['%s_unifrac_p_extreme' % metric] = _beta_test(metric, group, data, extreme_ids, 999)

CPU times: user 2min 28s, sys: 59.9 s, total: 3min 28s
Wall time: 3min 28s
CPU times: user 19.6 s, sys: 4.43 s, total: 24 s
Wall time: 24.1 s


In [48]:
%%time
# Trial run of the beta diversity power analysis with num_iter=99. The next
# cell repeats the call with num_iter=500 and overwrites a_power / a_counts.
a_power, a_counts = subsample_power(
                    test=lambda x: _beta_test(metric, group, data, x, 99),
                    samples=group_ids,
                    min_counts=5,
                    counts_interval=10,
                    max_counts=50,
                    num_runs=5,
                    num_iter=99,
                    )

CPU times: user 2min 34s, sys: 2.11 s, total: 2min 36s
Wall time: 2min 36s


In [49]:
%%time
# Empirical power of the PERMANOVA on `metric` across every category in
# group_ids; each test call uses 99 permutations (last _beta_test argument).
a_power, a_counts = subsample_power(
                    test=lambda x: _beta_test(metric, group, data, x, 99),
                    samples=group_ids,
                    min_counts=5,
                    counts_interval=10,
                    max_counts=50,
                    num_runs=5,
                    num_iter=500,
                    )

CPU times: user 13min 6s, sys: 10.3 s, total: 13min 16s
Wall time: 13min 19s


In [50]:
%%time
# Mean effect size over the all-category power curve, stored with the lower
# and upper bounds obtained from confidence_bound.
a_eff = z_effect(a_counts, a_power)
results['%s_eff_all' % metric] = a_eff.mean()
results['%s_eff_all_lo' % metric] = a_eff.mean() - \
    confidence_bound(a_eff)
results['%s_eff_all_hi' % metric] = a_eff.mean() + \
    confidence_bound(a_eff)

CPU times: user 1.58 ms, sys: 381 µs, total: 1.96 ms
Wall time: 1.69 ms


In [53]:
%%time
# Same power analysis restricted to the extreme categories (extreme_ids).
e_power, e_counts = subsample_power(
                    test=lambda x: _beta_test(metric, group, data, x, 99),
                    samples=extreme_ids,
                    min_counts=5,
                    counts_interval=10,
                    max_counts=50,
                    num_runs=5,
                    num_iter=500,
                    )

CPU times: user 5min 4s, sys: 3.06 s, total: 5min 7s
Wall time: 5min 8s


In [54]:
%%time
e_eff = z_effect(a_counts, a_power)
results['%s_eff_exe' % metric] = e_eff.mean()
results['%s_eff_exe_lo' % metric] = e_eff.mean() - \
    confidence_bound(e_eff)
results['%s_eff_exe_hi' % metric] = e_eff.mean() + \
    confidence_bound(e_eff)

CPU times: user 1.68 ms, sys: 411 µs, total: 2.09 ms
Wall time: 1.84 ms


In [56]:
%%time
if not os.path.exists(os.path.join(save_dir, 'summary')):
    os.makedirs(os.path.join(save_dir, 'summary'))
for metric in ['PD_whole_tree_10k', 'shannon_10k', 'unweighted_unifrac',
               'weighted_unifrac']:
    if not os.path.exists(os.path.join(save_dir, 'all_power/%s') % metric):
        os.makedirs(os.path.join(save_dir, 'all_power/%s') % metric)
    if not os.path.exists(
            os.path.join(save_dir, 'extreme_power/%s' % metric)):
        os.makedirs(os.path.join(save_dir, 'extreme_power/%s') % metric)

CPU times: user 406 µs, sys: 1.15 ms, total: 1.55 ms
Wall time: 1.14 ms


In [61]:
%%time
for metric in ['PD_whole_tree_10k', 'shannon_10k']:
    results['%s_p_all' % metric] = _alpha_test(metric, map_, group_ids)
    results['%s_p_extreme' % metric] = _alpha_test(metric, map_,
                                                  extreme_ids)

    a_power, a_counts = subsample_power(
                test=lambda x: _alpha_test(metric, data.map_, x),
                samples=group_ids,
                min_counts=5,
                counts_interval=10,
                max_counts=50,
                num_runs=5,
                num_iter=500,
                )
    with open(os.path.join(save_dir, 'all_power/%s/%s.p'
                           % (metric, group.name)), 'w') as f_:
        pickle.dump((group.name, group.order, metric, a_power, a_counts),
                    f_)
    a_eff = z_effect(a_counts, a_power)
    results['%s_eff_all' % metric] = a_eff.mean()
    results['%s_eff_all_lo' % metric] = a_eff.mean() - \
        confidence_bound(a_eff)
    results['%s_eff_all_hi' % metric] = a_eff.mean() + \
        confidence_bound(a_eff)

    e_power, e_counts = subsample_power(
                test=lambda x: _alpha_test(metric, data.map_, x),
                samples=group_ids,
                min_counts=5,
                counts_interval=10,
                max_counts=50,
                num_runs=5,
                num_iter=500,
                )
    with open(os.path.join(save_dir, 'extreme_power/%s/%s.p'
                           % (metric, group.name)), 'w') as f_:
        pickle.dump((group.name, group.order, metric, a_power, a_counts),
                    f_)
    e_eff = z_effect(a_counts, a_power)
    results['%s_eff_exe' % metric] = e_eff.mean()
    results['%s_eff_exe_lo' % metric] = e_eff.mean() - \
        confidence_bound(e_eff)
    results['%s_eff_exe_hi' % metric] = e_eff.mean() + \
        confidence_bound(e_eff)

CPU times: user 3min 8s, sys: 750 ms, total: 3min 8s
Wall time: 3min 9s


In [63]:
%%time
    for metric in ['unweighted_unifrac', 'weighted_unifrac']:
        results['%s_unifrac_p_all' % metric] = \
            _beta_test(metric, group, data, group_ids, 999)
        results['%s_unifrac_p_extreme' % metric] = \
            _beta_test(metric, group, data, extreme_ids, 999)

        a_power, a_counts = subsample_power(
                    test=lambda x: _beta_test(metric, group, data, x, 99),
                    samples=group_ids,
                    min_counts=5,
                    counts_interval=10,
                    max_counts=50,
                    num_runs=5,
                    num_iter=500,
                    )
        with open(os.path.join(save_dir, 'all_power/%s/%s.p'
                               % (metric, group.name)), 'w') as f_:
            pickle.dump((group.name, group.order, metric, a_power, a_counts),
                        f_)

        a_eff = z_effect(a_counts, a_power)
        results['%s_eff_all' % metric] = a_eff.mean()
        results['%s_eff_all_lo' % metric] = a_eff.mean() - \
            confidence_bound(a_eff)
        results['%s_eff_all_hi' % metric] = a_eff.mean() + \
            confidence_bound(a_eff)

        e_power, e_counts = subsample_power(
                    test=lambda x: _beta_test(metric, group, data, x, 99),
                    samples=extreme_ids,
                    min_counts=5,
                    counts_interval=10,
                    max_counts=50,
                    num_runs=5,
                    num_iter=500,
                    )
        with open(os.path.join(save_dir, 'extreme_power/%s/%s.p'
                               % (metric, group.name)), 'w') as f_:
            pickle.dump((group.name, group.order, metric, e_power, e_counts),
                        f_)

        e_eff = z_effect(a_counts, a_power)
        results['%s_eff_exe' % metric] = e_eff.mean()
        results['%s_eff_exe_lo' % metric] = e_eff.mean() - \
            confidence_bound(e_eff)
        results['%s_eff_exe_hi' % metric] = e_eff.mean() + \
            confidence_bound(e_eff)


CPU times: user 42min 38s, sys: 3min 4s, total: 45min 43s
Wall time: 1h 10min 22s


In [77]:
def _make_output_dirs(save_dir, metrics):
    """Creates summary/ and the per-metric power directories in `save_dir`"""
    needed = [os.path.join(save_dir, 'summary')]
    for metric in metrics:
        needed.append(os.path.join(save_dir, 'all_power', metric))
        needed.append(os.path.join(save_dir, 'extreme_power', metric))
    for dir_ in needed:
        if not os.path.exists(dir_):
            os.makedirs(dir_)


def _store_power_effects(results, test, group_ids, extreme_ids, save_dir,
                         metric, group):
    """Runs the power analyses for `metric`, pickles them and fills `results`

    The analysis is run first on all categories (`group_ids`, files under
    all_power/, keys `<metric>_eff_all*`) and then on the extreme categories
    (`extreme_ids`, files under extreme_power/, keys `<metric>_eff_exe*`).
    """
    for power_dir, label, samples in [('all_power', 'all', group_ids),
                                      ('extreme_power', 'exe', extreme_ids)]:
        power, counts = subsample_power(
            test=test,
            samples=samples,
            min_counts=5,
            counts_interval=10,
            max_counts=50,
            num_runs=5,
            num_iter=500,
        )
        # Pickle streams are binary, so the file is opened with 'wb'
        with open(os.path.join(save_dir, '%s/%s/%s.p'
                               % (power_dir, metric, group.name)),
                  'wb') as f_:
            pickle.dump((group.name, group.order, metric, power, counts),
                        f_)

        eff = z_effect(counts, power)
        eff_mean = eff.mean()
        eff_bound = confidence_bound(eff)
        results['%s_eff_%s' % (metric, label)] = eff_mean
        results['%s_eff_%s_lo' % (metric, label)] = eff_mean - eff_bound
        results['%s_eff_%s_hi' % (metric, label)] = eff_mean + eff_bound


def generate_summary(group, subset, save_dir):
    """Computes p-values, power curves and effect sizes for one category

    The dataset selected by BODYSITE, TRIM, DEPTH and ONE_SAMPLE is loaded,
    outliers are dropped and `group` is cleaned. Two alpha diversity metrics
    are compared with `_alpha_test` and two UniFrac metrics with
    `_beta_test`, each across all categories and across the extreme
    categories. Power curves are pickled under `save_dir`.

    Parameters
    ----------
    group : object
        Category description providing `name`, `order` and `extremes`
        (an entry of `ag_data_dictionary`).
    subset : bool
        Passed to `AgData` as `sub_participants`.
    save_dir : str
        Directory in which `summary/`, `all_power/<metric>/` and
        `extreme_power/<metric>/` are created if missing.

    Returns
    -------
    dict
        The category name, its groups and extremes, plus the p-values and
        effect sizes (mean, `_lo` and `_hi` bounds) for every metric.
    """
    alpha_metrics = ['PD_whole_tree_10k', 'shannon_10k']
    beta_metrics = ['unweighted_unifrac', 'weighted_unifrac']
    _make_output_dirs(save_dir, alpha_metrics + beta_metrics)

    # Loads the dataset
    data = AgData(bodysite=BODYSITE,
                  trim=TRIM,
                  depth=DEPTH,
                  one_sample=ONE_SAMPLE,
                  sub_participants=subset,
                  )
    data.drop_alpha_outliers()
    data.drop_bmi_outliers()
    # Cleans up the column of interest
    data.clean_group(group)

    results = {'name': group.name,
               'groups': group.order,
               'extremes': group.extremes}

    # Everything below reads the freshly loaded `data.map_`; nothing depends
    # on a `map_` variable left over in the notebook namespace.
    grouped = data.map_.groupby(group.name).groups
    group_ids = [grouped[o] for o in group.order]
    extreme_ids = [grouped[o] for o in group.extremes]

    for metric in alpha_metrics:
        results['%s_p_all' % metric] = \
            _alpha_test(metric, data.map_, group_ids)
        results['%s_p_extreme' % metric] = \
            _alpha_test(metric, data.map_, extreme_ids)
        _store_power_effects(
            results,
            lambda x: _alpha_test(metric, data.map_, x),
            group_ids, extreme_ids, save_dir, metric, group)

    for metric in beta_metrics:
        results['%s_unifrac_p_all' % metric] = \
            _beta_test(metric, group, data, group_ids, 999)
        results['%s_unifrac_p_extreme' % metric] = \
            _beta_test(metric, group, data, extreme_ids, 999)
        # Fewer permutations (99) per test keep the power analysis tractable
        _store_power_effects(
            results,
            lambda x: _beta_test(metric, group, data, x, 99),
            group_ids, extreme_ids, save_dir, metric, group)

    return results







# pool = Pool(32)


In [78]:
# Timed end-to-end run for the selected group; the return value is not kept.
%time generate_summary(group, subset, save_dir)

CPU times: user 45min 56s, sys: 2min 48s, total: 48min 45s
Wall time: 49min


In [82]:
# Whole number of categories per fifth of the dictionary. Floor division keeps
# the integer result shown below on Python 3 as well as Python 2.
len(ag_data_dictionary.keys()) // 5

10

In [89]:
# Category names from position 45 onward in the sorted key list.
sorted(ag_data_dictionary.keys())[45:]

['SLEEP_DURATION',
 'SMOKING_FREQUENCY',
 'SUGARY_SWEETS_FREQUENCY',
 'TYPES_OF_PLANTS',
 'VEGETABLE_FREQUENCY',
 'VITAMIN_B_SUPPLEMENT_FREQUENCY',
 'VITAMIN_D_SUPPLEMENT_FREQUENCY',
 'WEIGHT_CHANGE']