In [42]:


import numpy as np
import skbio
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sms

import matplotlib.pyplot as plt
% matplotlib inline

import americangut.ag_dictionary as agdic
import americangut.diversity_analysis as agdiv
import americangut.notebook_environment as agenv
from americangut.ag_data import AgData

In [2]:
bodysite = 'fecal'
sequence_trim = '100nt'
rarefaction_depth = '10k'

use_subset = False
use_one_sample = True

In [3]:
fecal_data = AgData(bodysite=bodysite, 
                    trim=sequence_trim, 
                    depth=rarefaction_depth, 
                    sub_participants=use_subset, 
                    one_sample=use_one_sample)

In [4]:
fecal_data.drop_alpha_outliers()
fecal_data.drop_bmi_outliers()
fecal_data.clean_age()

In [7]:
vios_skip = {'VIOSCREEN_FINISHED',
             'VIOSCREEN_DATABASE',
             'VIOSCREEN_MULTIVITAMIN',
             'VIOSCREEN_QUESTIONNAIRE',
             'VIOSCREEN_GENDER',
             'VIOSCREEN_SCFV',
             'VIOSCREEN_PROTOCOL',
             'VIOSCREEN_CALCIUM_FREQ',
             'VIOSCREEN_CALCIUM',
             'VIOSCREEN_STARTED',
             'VIOSCREEN_NUTRIENT_RECOMMENDATION',
             'VIOSCREEN_PROCDATE',
             'VIOSCREEN_DOB',
             'VIOSCREEN_SCF',
             'VIOSCREEN_MULTIVITAMIN_FREQ'}
vios_cols = [x for x in fecal_data.map_.columns if 'VIOSCREEN' in x and not x in vios_skip]
for col in vios_cols:
    fecal_data.map_[col] = fecal_data.map_[col].astype(float)

In [29]:
vios_map = fecal_data.map_.loc[fecal_data.map_.VIOSCREEN_FINISHED.dropna().index, vios_cols]
vios_map.replace(np.nan, 0, inplace=True)

In [45]:
vios_cols[0]

'VIOSCREEN_MANNITOL'

In [20]:
vios_otus = pd.DataFrame(np.vstack([fecal_data.otu_.data(id_, 'observation') 
                                    for id_ in fecal_data.otu_.ids('observation')]),
                         columns=fecal_data.otu_.ids('sample'), 
                         index=fecal_data.otu_.ids('observation')).transpose()

In [30]:
vios_map.shape

(326, 250)

In [32]:
vios_drop = (vios_otus.sum(0) > 0.1*vios_map.shape[0]) & (vios_otus.sum(0) < 0.9*vios_map.shape[0])

(3867, 23060)

In [49]:
vios_sub = (vios_otus.loc[vios_map.index, vios_drop.loc[vios_drop].index] > 0).astype(int)

In [44]:
vios_sub.columns[0]

u'1081058'

In [52]:
mannitol_fits = {}
for otu_id in vios_sub.columns:
    try:
        model = sms.Logit(vios_sub[otu_id], sms.add_constant(vios_map['VIOSCREEN_MANNITOL'])).fit()
        mannitol_fits[otu_id] = {'model': model,
                                 'p-value': model.llr_pvalue,
                                 'pseudo-r2': model.prsquared,
                                 'OR': 
                                 
                                 }
    except:
        mannitol_fits[otu_id] = 'no fit'

Optimization terminated successfully.
         Current function value: 0.051710
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.103619
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.051971
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.037342
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.091147
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.079233
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.014932
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.034170
         Iterations 11
Optimization terminated successfully.
         Current function value: 0.019054
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.228605

In [72]:
model.conf_int().rename(columns={0: 'lower', 1: 'upper'}).apply(lambda x: np.exp(x))

Unnamed: 0,lower,upper
const,0.019905,0.170052
VIOSCREEN_MANNITOL,0.02231,3.362067


In [56]:
model = mannitol_fits['274619']

In [59]:
print mannitol_fits['274619'].summary()

                           Logit Regression Results                           
Dep. Variable:                 274619   No. Observations:                  326
Model:                          Logit   Df Residuals:                      324
Method:                           MLE   Df Model:                            1
Date:                Sun, 07 Feb 2016   Pseudo R-squ.:                 0.01220
Time:                        13:24:13   Log-Likelihood:                -47.505
converged:                       True   LL-Null:                       -48.091
                                        LLR p-value:                    0.2787
                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------
const                 -2.8442      0.547     -5.197      0.000        -3.917    -1.772
VIOSCREEN_MANNITOL    -1.2951      1.279     -1.012      0.311        -3.803     1.213


In [17]:
map_[watch_cols].dropna()

Unnamed: 0_level_0,IBD,SEX,ANTIBIOTIC_HISTORY,COUNTRY,RACE,BOWEL_MOVEMENT_QUALITY,BOWEL_MOVEMENT_FREQUENCY,AGE_CORRECTED,VIOSCREEN_MANNITOL,VIOSCREEN_SFA100,...,VIOSCREEN_MALTOSE,VIOSCREEN_HEI_OILS,VIOSCREEN_JOULES,VIOSCREEN_FORMONTN,VIOSCREEN_SWEET_SERVINGS,VIOSCREEN_CHOLEST,VIOSCREEN_METHHIS3,VIOSCREEN_NATOCO,PD_whole_tree_10k,lnAGE
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10317.000010218,(0) No,(0) female,(3) More than a year,(0) USA,(1) Caucasian,(1) Constipated,(0) Less than one,54,0.915014,1.050411,...,8.222494,10.0,13781.92877,0.002192,0.59344,420.087414,7.668383,20.822109,35.700206,3.988984
10317.000031464,(0) No,(0) female,(2) Year,(0) USA,(1) Caucasian,(2) Diarrhea,(3) Three,63,0.438356,0.502438,...,2.987973,10.0,7581.827397,0.001973,0.95342,240.025771,19.297508,11.613071,37.246311,4.143135
10317.000028712,(0) No,(0) female,(1) 6 months,(0) USA,(1) Caucasian,(0) Normal,(1) One,54,0.265206,0.682712,...,3.029315,10.0,7879.437671,0.003808,0.70137,264.724358,8.61937,9.106273,29.965419,3.988984
10317.000031298,(0) No,(0) female,(3) More than a year,(0) USA,(1) Caucasian,(0) Normal,(1) One,55,0.202329,0.220164,...,2.085452,10.0,7637.134247,0.00274,0.02555,314.928874,25.388,14.546933,40.717236,4.007333
10317.000012905,(0) No,(0) female,(1) 6 months,(0) USA,(1) Caucasian,(1) Constipated,(0) Less than one,31,0.182685,0.250685,...,2.72526,10.0,5620.835274,0.002384,0.61793,209.869242,15.558493,6.863426,33.996324,3.433987
10317.000028809,(0) No,(0) female,(3) More than a year,(0) USA,(1) Caucasian,(2) Diarrhea,(2) Two,59,0.240767,1.253206,...,7.01989,5.3,9473.146575,0.00326,1.40149,315.375535,12.753179,9.631615,30.297911,4.077537
10317.000013266,(0) No,(0) female,(1) 6 months,(0) USA,(1) Caucasian,(0) Normal,(1) One,72,0.276247,0.149808,...,0.690877,10.0,4527.97226,0.005507,0.0,137.725021,1.107973,9.475945,36.06962,4.276666
10317.000017274,(0) No,(0) female,(2) Year,(0) USA,(1) Caucasian,(0) Normal,(1) One,62,0.525671,0.623616,...,2.38126,9.4,9096.455479,0.003178,0.0,659.170462,16.150685,13.076192,34.409303,4.127134
10317.000011281,(0) No,(0) female,(3) More than a year,(0) USA,(1) Caucasian,(0) Normal,(2) Two,47,0.585836,0.132384,...,2.480027,6.5,4450.930822,0.00274,0.0,117.846993,16.050082,6.766905,45.989492,3.850148
10317.000026662,(0) No,(1) male,(3) More than a year,(0) USA,(1) Caucasian,(2) Diarrhea,(3) Three,49,0.241699,0.455699,...,1.55337,10.0,5721.994178,0.000575,0.43711,161.201595,4.212055,9.289261,25.260554,3.89182


In [18]:
age_ = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=map_[watch_cols]).fit()
lage = smf.ols('PD_whole_tree_10k ~ lnAGE', data=map_[watch_cols]).fit()

In [19]:
print age_.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     62.42
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           3.71e-15
Time:                        12:57:32   Log-Likelihood:                -11093.
No. Observations:                3426   AIC:                         2.219e+04
Df Residuals:                    3424   BIC:                         2.220e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------
Intercept        28.4180      0.382     74.326

In [20]:
print lage.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     71.43
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           4.20e-17
Time:                        12:57:33   Log-Likelihood:                -11088.
No. Observations:                3426   AIC:                         2.218e+04
Df Residuals:                    3424   BIC:                         2.219e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     20.2344      1.316     15.375      0.0

In [21]:
map_.groupby('SEX').count().max(1)

SEX
(0) female    1794
(1) male      1466
dtype: int64

In [22]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     25.02
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           5.33e-16
Time:                        12:57:45   Log-Likelihood:                -10555.
No. Observations:                3260   AIC:                         2.112e+04
Df Residuals:                    3256   BIC:                         2.114e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                21.51

In [23]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.059
Model:                            OLS   Adj. R-squared:                  0.057
Method:                 Least Squares   F-statistic:                     33.27
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           4.04e-39
Time:                        12:57:51   Log-Likelihood:                -10312.
No. Observations:                3208   AIC:                         2.064e+04
Df Residuals:                    3201   BIC:                         2.068e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------
Intercept     

In [24]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.076
Model:                            OLS   Adj. R-squared:                  0.074
Method:                 Least Squares   F-statistic:                     35.03
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           2.89e-47
Time:                        12:57:57   Log-Likelihood:                -9557.5
No. Observations:                2989   AIC:                         1.913e+04
Df Residuals:                    2981   BIC:                         1.918e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------
Intercept     

In [25]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.086
Model:                            OLS   Adj. R-squared:                  0.083
Method:                 Least Squares   F-statistic:                     30.26
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           5.96e-51
Time:                        12:58:30   Log-Likelihood:                -9238.2
No. Observations:                2896   AIC:                         1.850e+04
Df Residuals:                    2886   BIC:                         1.856e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------

In [26]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE + ANTIBIOTIC_HISTORY', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     31.78
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           2.17e-69
Time:                        12:59:09   Log-Likelihood:                -9122.7
No. Observations:                2875   AIC:                         1.827e+04
Df Residuals:                    2862   BIC:                         1.835e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------

In [27]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE + COUNTRY + IBD + RACE + ANTIBIOTIC_HISTORY', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.107
Method:                 Least Squares   F-statistic:                     37.35
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           1.15e-69
Time:                        12:59:47   Log-Likelihood:                -9596.9
No. Observations:                3022   AIC:                         1.922e+04
Df Residuals:                    3011   BIC:                         1.928e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------

In [28]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE + ANTIBIOTIC_HISTORY + BOWEL_MOVEMENT_QUALITY', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.109
Method:                 Least Squares   F-statistic:                     12.09
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           4.79e-25
Time:                        13:00:28   Log-Likelihood:                -3780.1
No. Observations:                1184   AIC:                             7588.
Df Residuals:                    1170   BIC:                             7659.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------

In [29]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE + COUNTRY + IBD + RACE + ANTIBIOTIC_HISTORY + BOWEL_MOVEMENT_QUALITY', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.115
Model:                            OLS   Adj. R-squared:                  0.107
Method:                 Least Squares   F-statistic:                     13.95
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           1.71e-25
Time:                        13:01:13   Log-Likelihood:                -3796.7
No. Observations:                1189   AIC:                             7617.
Df Residuals:                    1177   BIC:                             7678.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------

In [30]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + ANTIBIOTIC_HISTORY + BOWEL_MOVEMENT_QUALITY', data=map_[watch_cols]).fit()
print age2.summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.111
Model:                            OLS   Adj. R-squared:                  0.103
Method:                 Least Squares   F-statistic:                     13.69
Date:                Sun, 07 Feb 2016   Prob (F-statistic):           5.14e-25
Time:                        13:01:29   Log-Likelihood:                -3899.6
No. Observations:                1219   AIC:                             7823.
Df Residuals:                    1207   BIC:                             7884.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------

In [32]:
map_[watch_cols].groupby(('IBD', 'BOWEL_MOVEMENT_QUALITY')).count().max(1)

IBD     BOWEL_MOVEMENT_QUALITY
(0) No  (0) Normal                905
        (1) Constipated           176
        (2) Diarrhea              184
dtype: int64

In [24]:
questions

['IBD',
 'SEX',
 'ANTIBIOTIC_HISTORY',
 'COUNTRY',
 'RACE',
 'BOWEL_MOVEMENT_QUALITY',
 'BOWEL_MOVEMENT_FREQUENCY',
 'AGE_CORRECTED',
 'VIOSCREEN_MANNITOL',
 'VIOSCREEN_SFA100',
 'VIOSCREEN_SFA170',
 'VIOSCREEN_VEGETABLE_SERVINGS',
 'VIOSCREEN_SORBITOL',
 'VIOSCREEN_SFA80',
 'VIOSCREEN_BETACAR',
 'VIOSCREEN_LOW_FAT_DAIRY_SERVING',
 'VIOSCREEN_NIACINEQ',
 'VIOSCREEN_GAMMTOCO',
 'VIOSCREEN_LYSINE',
 'VIOSCREEN_ISOMALT',
 'VIOSCREEN_SUCROSE',
 'VIOSCREEN_CLAC9T11',
 'VIOSCREEN_FIBER',
 'VIOSCREEN_LACTITOL',
 'VIOSCREEN_HEI_NON_JUICE_FRT',
 'VIOSCREEN_OXALICM',
 'VIOSCREEN_ALPHACAR',
 'VIOSCREEN_LYCOPENE',
 'VIOSCREEN_VEG5_DAY',
 'VIOSCREEN_HEI_MILK',
 'VIOSCREEN_CLAT10C12',
 'VIOSCREEN_HEI_MEAT_BEANS',
 'VIOSCREEN_HEI_SOL_FAT_ALC_ADD_SUG',
 'VIOSCREEN_COPPER',
 'VIOSCREEN_HEI2010_REFINED_GRAINS',
 'VIOSCREEN_LUTZEAX',
 'VIOSCREEN_SALAD_VEGETABLE_SERVINGS',
 'VIOSCREEN_SFA200',
 'VIOSCREEN_HEI2010_DAIRY',
 'VIOSCREEN_FIBH2O',
 'VIOSCREEN_GLAC',
 'VIOSCREEN_SFA140',
 'VIOSCREEN_TAGATOSE',
 

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=map_.loc[map_.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map = map_.loc[map_[['AGE_CORRECTED', 'VIOSCREEN_FIBER']].dropna().index]
sub_map['COUNTRY'] = sub_map.COUNTRY.apply(lambda x: x if x in {'USA', 'United Kingdom'} else np.nan)

In [None]:
f_ = lambda x: x if x in {'USA', 'United Kingdom'} else np.nan

In [None]:
abx = agdic.ag_dictionary('ANTIBIOTIC_HISTORY')
abx.remap_groups(sub_map)
abx.label_order(sub_map)

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map['lnAge'] = sub_map.AGE_CORRECTED.apply(lambda x: np.log(x))
sub_map.loc[sub_map.AGE_CORRECTED < 20, 'lnAge'] = np.nan

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge + COUNTRY', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge + COUNTRY + ANTIBIOTIC_HISTORY', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED + COUNTRY + VIOSCREEN_HEI_SCORE', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map.loc[sub_map.AGE_CORRECTED > 20].shape

In [None]:
plt.plot(map_['VIOSCREEN_FIBER'], map_['PD_whole_tree_10k'], 'o')

In [None]:
sorted(vios_cols)

In [None]:
vios_cols