In [34]:
from __future__ import division

import os
import matplotlib.pyplot as plt
import numpy as np
import skbio
import scipy.stats
import statsmodels.api as sms
import statsmodels.formula.api as smf

import absloute_power.traditional as trad
import absloute_power.utils as ap


import pandas as pd
import biom

from skbio.stats.power import subsample_power

from absloute_power.traditional import calc_ttest_1

import pickle

In [2]:
# Prints the shell's current working directory for this notebook session.
!pwd

/Users/jwdebelius/Repositories/Absloute-Power/ipython_notebooks/2015_9


In [3]:
# Root of the Absloute-Power checkout. Defaults to the original author's
# location; set the ABSLOUTE_POWER_DIR environment variable to run the
# notebook on another machine without editing this cell.
base_dir = os.environ.get('ABSLOUTE_POWER_DIR',
                          '/Users/jwdebelius/Repositories/Absloute-Power')
# Directory holding the merged BMI data set
data_dir = os.path.join(base_dir, 'data/merged_otu_table_and_mapping_bmi')
# Tab-delimited mapping (metadata) file
map_fp = os.path.join(data_dir, 'merged_bmi_mapping_final.txt')
# OTU table in biom format
otu_fp = os.path.join(data_dir, 'merged_bmi_otu_1k.biom')
# Unweighted UniFrac distance matrix
uud_fp = os.path.join(data_dir, '1k/unweighted_unifrac_dm.txt')

In [4]:
# Sample metadata: every column is read as a string and the rows are keyed
# by the '#SampleID' column.
map_ = pd.read_csv(map_fp, sep='\t', dtype=str).set_index('#SampleID')
# Distance matrix loaded from the unweighted UniFrac file
uud = skbio.DistanceMatrix.read(uud_fp)

In [5]:
def test_beta_permanova(ids, meta, dm, cat, num_iter=499):
    """Returns the PERMANOVA p-value for `cat` over the supplied samples.

    Parameters
    ----------
    ids : sequence of array-like
        Groups of sample ids. The groups are flattened with `np.hstack`
        into one array before the test is run.
    meta : DataFrame
        Metadata indexed by sample id and containing the column `cat`.
    dm : skbio.DistanceMatrix
        Distance matrix that contains every id in `ids`.
    cat : str
        Name of the metadata column used as the grouping variable.
    num_iter : int, optional
        Passed as the fourth positional argument to
        `skbio.stats.distance.permanova` (its permutation count in the
        scikit-bio API).

    Returns
    -------
    The 'p-value' entry of the PERMANOVA results.
    """
    # Pools the per-group ids into a single flat array
    sample_ids = np.hstack(ids)
    # Restricts the distance matrix and the metadata to those samples
    dm_subset = dm.filter(sample_ids)
    meta_subset = meta.loc[sample_ids]
    # Runs the test and pulls out the p value
    results = skbio.stats.distance.permanova(dm_subset, meta_subset, cat,
                                             num_iter)
    return results['p-value']

In [6]:
# Lists the metadata columns available in the mapping file.
map_.columns

Index([u'BarcodeSequence', u'LinkerPrimerSequence', u'BMI',
       u'bmi_group_binned', u'bmi_group_coded', u'original_study',
       u'combined_study_bmi_group', u'merged_category_bmi',
       u'merged_weight_cats_study', u'PCR_PRIMERS', u'TARGET_SUBFRAGMENT',
       u'AGE', u'ELEVATION', u'LONGITUDE', u'COUNTRY', u'SEQUENCING_METH',
       u'SAMPLE_CENTER', u'Description_duplicate', u'ReversePrimer',
       u'COLLECTION_DATE', u'SEX', u'FAMILY_RELATIONSHIP_GG', u'STUDY_CENTER',
       u'EXPERIMENT_CENTER', u'bmi_group_amish', u'RUN_CENTER', u'LATITUDE',
       u'Description'],
      dtype='object')

In [9]:
# For each TARGET_SUBFRAGMENT value, the largest non-null count found in any
# metadata column (a proxy for the number of samples in that group).
map_.groupby('TARGET_SUBFRAGMENT').count().max(axis=1)

TARGET_SUBFRAGMENT
V13    111
V2     576
V35     97
V4     273
dtype: int64

In [13]:
# Same tally, broken down by study and TARGET_SUBFRAGMENT together.
study_region_cols = ['original_study', 'TARGET_SUBFRAGMENT']
map_.groupby(study_region_cols).count().max(axis=1)

original_study         TARGET_SUBFRAGMENT
COMBO_Wu               V2                     97
HMP                    V13                   111
                       V35                    97
Turnbaugh_mz_dz_twins  V2                    154
Yatsunenko_GG          V4                    273
amish_Fraser           V2                    325
dtype: int64

In [15]:
# Sample ids present in both the mapping file and the distance matrix.
# Computed in this cell so it does not rely on `inter_ids` being defined by a
# cell further down the notebook; a top-to-bottom run would otherwise raise
# a NameError here.
inter_ids = list(set(map_.index).intersection(set(uud.ids)))
# Maps each study name to the ids of its samples
map_study_groups = map_.loc[inter_ids].groupby('original_study').groups
# One group of sample ids for each of the three studies whose
# TARGET_SUBFRAGMENT is V2
v2_ids = [map_study_groups[g] for g in ['COMBO_Wu', 'amish_Fraser', 'Turnbaugh_mz_dz_twins']]

In [17]:
def study_test(x):
    """PERMANOVA p-value for 'original_study' on the unweighted UniFrac matrix."""
    return test_beta_permanova(x, map_, uud, 'original_study')


def obese_test(x):
    """PERMANOVA p-value for 'bmi_group_coded' on the unweighted UniFrac matrix."""
    return test_beta_permanova(x, map_, uud, 'bmi_group_coded')

In [14]:
# Sample ids shared by the mapping file and the distance matrix
inter_ids = list(set(map_.index) & set(uud.ids))

In [72]:
# Lists of sample-id groups, one entry per study and one per bmi_group_coded
# value, restricted to samples that are also in the distance matrix.
# `.values()` is used instead of the Python-2-only `.itervalues()` so the cell
# runs unchanged on Python 2 and Python 3.
study_ids = list(map_.loc[inter_ids].groupby('original_study').groups.values())
obese_ids = list(map_.loc[inter_ids].groupby('bmi_group_coded').groups.values())

In [24]:
# Estimates power for the study-effect PERMANOVA across the three V2 studies.
# NOTE: the alpha_pwr value (0.012) is repeated in the z_effect call and in
# the pickled metadata elsewhere in this notebook; keep the three in sync.
v2_power, v2_counts = subsample_power(study_test,
                                      v2_ids,
                                      min_counts=3,
                                      max_counts=8,
                                      counts_interval=1,
                                      alpha_pwr=0.012)

In [25]:
# Displays the V2 power estimates.
# NOTE(review): rows look like runs and columns like subsample sizes - confirm.
v2_power

array([[ 0.876,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.896,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.916,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.926,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.906,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.898,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.906,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.92 ,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.904,  1.   ,  1.   ,  1.   ,  1.   ],
       [ 0.878,  1.   ,  1.   ,  1.   ,  1.   ]])

In [35]:
# Saves the V2 power array, its counts and the alpha value used as one pickle
v2_pickle_fp = os.path.join(data_dir, 'v2.p')
with open(v2_pickle_fp, 'wb') as f_:
    pickle.dump([v2_power, v2_counts, {'alpha': 0.012}], f_)

In [36]:
# Metadata restricted to the samples that also appear in the distance matrix
# (narrowed to the HMP study in the following cell).
hmp_map = map_.loc[inter_ids]

In [40]:
# Keeps only the HMP rows, then tallies them per TARGET_SUBFRAGMENT
hmp_map = hmp_map[hmp_map['original_study'] == 'HMP']
hmp_map.groupby('TARGET_SUBFRAGMENT').count().max(axis=1)

TARGET_SUBFRAGMENT
V13    107
V35     94
dtype: int64

In [29]:
# Effect sizes derived from the V2 power results. The third argument (0.012)
# matches the alpha_pwr passed to subsample_power for the V2 run.
# NOTE(review): presumably z_effect's third parameter is alpha - confirm.
eff = ap.z_effect(v2_counts, v2_power, 0.012) 

In [30]:
# Mean of the V2 effect-size estimates
eff.mean()

3.1140459307141137

In [31]:
# Standard deviation of the V2 effect-size estimates
eff.std()

0.073483738648768795

In [67]:
# PERMANOVA p-value for 'original_study' using every sample in each study group
study_test(study_ids)

0.001

In [68]:
# PERMANOVA p-value for 'bmi_group_coded' using every sample in each group.
# The call previously used the misspelled name `obese_testst`, which is not
# defined anywhere and raised a NameError.
obese_test(obese_ids)

0.001

In [79]:
# Estimates power for the study-effect PERMANOVA across all studies, using
# 5 runs and an alpha_pwr of 0.005.
study_power, study_counts = subsample_power(study_test, 
                                            study_ids, 
                                            min_counts=3, 
                                            max_counts=8, 
                                            counts_interval=1,
                                            num_runs=5,
                                            alpha_pwr=0.005)

In [58]:
# Estimates power for the bmi_group_coded PERMANOVA. alpha_pwr and num_runs
# are not passed here, so subsample_power's own defaults apply.
obese_power, obese_counts = subsample_power(obese_test, 
                                            obese_ids, 
                                            min_counts=10, 
                                            max_counts=100, 
                                            counts_interval=10)

### ids

In [74]:
# Saves the study and bmi_group_coded power results together in 'obese.p'.
# `pickle` is already imported in the notebook's first cell, so the redundant
# mid-notebook import has been dropped.
with open(os.path.join(data_dir, 'obese.p'), 'wb') as f_:
    pickle.dump([study_power, study_counts, obese_power, obese_counts], f_)

In [80]:
# Displays the power estimates for the study effect
study_power

array([[ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.]])

In [108]:
# Effect sizes are computed once and reused for both the mean and its
# standard error (the identical z_effect call used to be evaluated twice).
# NOTE(review): baby_counts / baby_power are not defined in the visible part
# of this notebook - confirm where they come from.
# NOTE(review): the SEM sample size is taken from baby_power[:, :5] while the
# effects are computed from baby_power[:, 1:] - confirm this is intended.
baby_z = ap.z_effect(baby_counts[1:], baby_power[:, 1:])
baby_eff = baby_z.mean()
baby_sem = baby_z.std() / np.sqrt(np.prod(baby_power[:, :5].shape))

In [116]:
# Effect sizes are computed once and reused for both the mean and its
# standard error (the identical z_effect call used to be evaluated twice).
# NOTE(review): child_counts / child_power are not defined in the visible part
# of this notebook - confirm where they come from.
# NOTE(review): the SEM sample size is taken from child_power[:, :5] while the
# effects are computed from child_power[:, 1:] - confirm this is intended.
child_z = ap.z_effect(child_counts[1:], child_power[:, 1:])
child_eff = child_z.mean()
child_sem = child_z.std() / np.sqrt(np.prod(child_power[:, :5].shape))

In [None]:
%matplotlib inline
plt.plot(study_counts, study_power.mean(0), 'bo-')
plt.plot(obese_counts, obese_power.mean(0), 'go-')
# plt.plot(np.arange(1, 15, 0.5), ap.z_power(np.arange(1, 15, 0.5), baby_eff))
# plt.fill_between(np.arange(1, 15, 0.5), 
#                  ap.z_power(np.arange(1, 15, 0.5), baby_eff - baby_sem),
#                  ap.z_power(np.arange(1, 15, 0.5), baby_eff + baby_sem))