In [3]:
import os
import pickle

import numpy as np
import scipy.stats
import skbio
import statsmodels.formula.api as smf

from multiprocessing import Pool

from skbio.stats.power import subsample_power, confidence_bound

from americangut.ag_data_dictionary import ag_data_dictionary
from americangut.ag_data import AgData

In [4]:
bodysite = 'fecal'
sequence_trim = '100nt'
rarefaction_depth = '10k'

use_subset = False
use_one_sample = True

In [5]:
fecal_data = AgData(bodysite=bodysite, 
                    trim=sequence_trim, 
                    depth=rarefaction_depth, 
                    sub_participants=use_subset, 
                    one_sample=use_one_sample)
fecal_data.drop_alpha_outliers()

fecal_data.drop_bmi_outliers()

In [6]:
frequency = ag_data_dictionary['BOWEL_MOVEMENT_FREQUENCY']
frequency.remap_groups(fecal_data.map_)
frequency.label_order(fecal_data.map_)

In [7]:
quality = ag_data_dictionary['BOWEL_MOVEMENT_QUALITY']
quality.remove_ambiguity(fecal_data.map_)

In [8]:
map_ = fecal_data.map_

In [9]:
set(fecal_data.map_['BOWEL_MOVEMENT_QUALITY'])

{nan,
 'I tend to be constipated (have difficulty passing stool)',
 'I tend to have diarrhea (watery stool)',
 'I tend to have normal formed stool',
 'nan'}

In [10]:
quality.remap_groups(fecal_data.map_)

In [11]:
set(fecal_data.map_['BOWEL_MOVEMENT_QUALITY'])

{'Constipated', 'Diarrhea', 'Normal', 'nan'}

In [12]:
quality.order

['Normal', 'Constipated', 'Diarrhea']

In [13]:
quality.label_order(fecal_data.map_)

In [14]:
fit = smf.ols('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + BOWEL_MOVEMENT_QUALITY', data=fecal_data.map_).fit()
print fit.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     7.835
Date:                Sun, 10 Jan 2016   Prob (F-statistic):           2.51e-08
Time:                        22:27:10   Log-Likelihood:                -5012.5
No. Observations:                1539   AIC:                         1.004e+04
Df Residuals:                    1532   BIC:                         1.008e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------

In [15]:
map_['AGE_CORRECTED'] = map_['AGE_CORRECTED'].astype(float)
map_['lnAGE'] = np.log(map_['AGE_CORRECTED'])

In [16]:
fit = smf.ols('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + BOWEL_MOVEMENT_QUALITY + AGE_CORRECTED', data=fecal_data.map_).fit()
print fit.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.069
Model:                            OLS   Adj. R-squared:                  0.065
Method:                 Least Squares   F-statistic:                     15.71
Date:                Sun, 10 Jan 2016   Prob (F-statistic):           5.53e-20
Time:                        22:27:11   Log-Likelihood:                -4798.4
No. Observations:                1488   AIC:                             9613.
Df Residuals:                    1480   BIC:                             9655.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------

In [17]:
fit = smf.ols('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + BOWEL_MOVEMENT_QUALITY + lnAGE', data=fecal_data.map_).fit()
print fit.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     16.75
Date:                Sun, 10 Jan 2016   Prob (F-statistic):           2.16e-21
Time:                        22:27:12   Log-Likelihood:                -4795.0
No. Observations:                1488   AIC:                             9606.
Df Residuals:                    1480   BIC:                             9649.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------

In [18]:
map_.loc[map_.SEX == 'other', 'SEX'] = np.nan

In [None]:
fit = smf.ols('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + BOWEL_MOVEMENT_QUALITY + lnAGE * SEX', data=fecal_data.map_).fit()
print fit.summary()

In [None]:
fit = smf.ols('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + BOWEL_MOVEMENT_QUALITY + AGE_CORRECTED * SEX', data=fecal_data.map_).fit()
print fit.summary()

In [None]:
fit = smf.ols('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + BOWEL_MOVEMENT_QUALITY + lnAGE + FRUIT_FREQUENCY', data=fecal_data.map_).fit()
print fit.summary()

In [None]:
fit = smf.ols(('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + '
                                   'BOWEL_MOVEMENT_QUALITY + '
                                   'lnAGE + '
                                   'FRUIT_FREQUENCY +'
                                   'ALCOHOL_FREQUENCY'), 
              data=fecal_data.map_).fit()
print fit.summary()

In [None]:
fit = smf.ols(('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + '
                                   'BOWEL_MOVEMENT_QUALITY + '
                                   'lnAGE + '
                                   'FRUIT_FREQUENCY +'
                                   'ALCOHOL_FREQUENCY +' 
                                   'CHICKENPOX'), 
              data=fecal_data.map_).fit()
print fit.summary()

In [None]:
country = ag_data_dictionary['COUNTRY']
country.drop_infrequent(map_)

In [None]:
country.order = ['USA', 'Canada', 'Australia', 'United Kingdom']
country.label_order(map_)

In [None]:
fit = smf.ols(('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + '
                                   'BOWEL_MOVEMENT_QUALITY + '
                                   'lnAGE + '
                                   'FRUIT_FREQUENCY +'
                                   'ALCOHOL_FREQUENCY +' 
                                   'CHICKENPOX + '
                                   'COUNTRY'), 
              data=fecal_data.map_).fit()
print fit.summary()

In [20]:
groups = ['BOWEL_MOVEMENT_FREQUENCY', 'BOWEL_MOVEMENT_QUALITY', 'FRUIT_FREQUENCY',
          'ALCOHOL_FREQUENCY', 'CHICKENPOX', 'COUNTRY', 'EXERCISE_FREQUENCY',
          'EXERCISE_LOCATION', 'FLOSSING_FREQUENCY', 'LOWGRAIN_DIET_TYPE', 'MIGRAINE',
          'RACE', 'SLEEP_DURATION']

In [None]:
group = ag_data_dictionary['BOWEL_MOVEMENT_FREQUENCY']

In [22]:
group_watch = []
for group_name in groups:
    group = ag_data_dictionary[group_name]
    try:
        group.drop_infrequent(map_)
        group.remove_ambiguity(map_)
    except:
        pass
    if group.type == 'Clinical':
        group.remap_clinical(map_)
    group.convert_to_numeric(map_)

In [27]:
fit = smf.ols('PD_whole_tree_10k ~ BOWEL_MOVEMENT_FREQUENCY + BOWEL_MOVEMENT_QUALITY', data=fecal_data.map_).fit()
#                                    'lnAGE + '
#                                    'FRUIT_FREQUENCY +'
#                                    'ALCOHOL_FREQUENCY +' 
#                                    'CHICKENPOX + '
#                                    'COUNTRY + '
#                                    'EXERCISE_FREQUENCY +'
#                                    'EXERCISE_LOCATION +'
#                'FLOSSING_FREQUENCY + LOWGRAIN_DIET_TYPE + MIGRAINE + RACE + SLEEP_DURATION' 
              
print fit.summary()

ValueError: zero-size array to reduction operation maximum which has no identity