This notebook is yet another attempt at multivariate statitics. We'll start by importing everything and the kitchen sink, in case we need it. 

In [1]:
import numpy as np
import skbio
import scipy

import pandas as pd
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
% matplotlib inline


import americangut.ag_dictionary as agdic
import americangut.diversity_analysis as agdiv
import americangut.notebook_environment as agenv
from americangut.ag_data import AgData



Next, let's select the data set and rarefaction depth we wish to use.

In [2]:
bodysite = 'fecal'
sequence_trim = '100nt'
rarefaction_depth = '10k'

use_subset = True
use_one_sample = True

Next, we'll load the data, and remove outliers.

In [3]:
data = AgData(bodysite=bodysite, 
                    trim=sequence_trim, 
                    depth=rarefaction_depth, 
                    sub_participants=use_subset, 
                    one_sample=use_one_sample)

data.drop_alpha_outliers()
data.drop_bmi_outliers()
data.clean_age()

Finally, let's pick our response varaible.

In [4]:
metric = 'PD_whole_tree'
response = '%s_%s' % (metric, rarefaction_depth)

Next, we're going to loop through our dataset and clean up the question responses and number them.

In [5]:
for group_name in agdic.dictionary.iterkeys():
    if group_name == 'AGE_CORRECTED':
        continue
    question = agdic.ag_dictionary(group_name)
    if group_name == 'COUNTRY':
        question.order = ['USA', 'Australia', 'Canada', 'United Kingdom']
    data.clean_group(question)
    question.label_order(data.map_)

Let's also add a column for the natural log of age, since this may be a better shape for the relationship with alpha diversity.

In [6]:
data.map_['lnAGE'] = data.map_['AGE_CORRECTED'].apply(lambda x: np.log(x))

In [66]:
columns1 = ['AGE_CORRECTED', 'IBD', ''
           ]

In [147]:
vars1 = [('AGE_CORRECTED', 'lnAGE'),
         'SEX', 
         'COUNTRY',
         'CHICKENPOX',
         'TYPES_OF_PLANTS',
         'ALCOHOL_FREQUENCY',
         'EXERCISE_FREQUENCY',
         'EXERCISE_LOCATION',
         'FLOSSING_FREQUENCY',
         'MIGRAINE',
         'SLEEP_DURATION',
         ]
#          'ALCOHOL_FREQUENCY', 
#          'TYPES_OF_PLANTS',
#          'EXERCISE_FREQUENCY',
#          'FLOSSING_FREQUENCY',
#          'MIGRAINE',
#          'SLEEP_DURATION', 
#         ]
# vars2 = [('AGE_CORRECTED', 'lnAGE'), 'SEX', 'COUNTRY', 'IBD', 'CHICKENPOX',
#          'ALCOHOL_FREQUENCY', 'ANTIBIOTIC_HISTORY', 'IBS',
#          'BOWEL_MOVEMENT_FREQUENCY', 'BOWEL_MOVEMENT_QUALITY',
#          'COUNTRY', 'CSECTION', 'DIABETES', 'DIET_TYPE',
#          'DRINKING_WATER_SOURCE', 'EXERCISE_FREQUENCY', 'EXERCISE_LOCATION',
#          'FLOSSING_FREQUENCY', 'MIGRAINE', 'RACE', 'SEX', 'SLEEP_DURATION', 
#          'SMOKING_FREQUENCY', # 'TYPES_OF_PLANTS',
#         ]

In [148]:
def _equation_builder(vars_, last_eq=None, response=None):
    """Builds a set of equations trying multiple predictor options"""
    # Checks enough variables are defined
    if last_eq is None and response is None:
        raise ValueError('A response or last equation must be specified.')

    # Checks the class of vars
    if isinstance(vars_, str):
        vars_ = [vars_]

    # Builds the equation
    if last_eq is None:
        return ['%s ~ %s' % (response, pred) for pred in vars_]
    else:
        eqs = []
        for pred in vars_:
            if '&' in pred:
                eqs.append(last_eq.replace(' + %s' % pred[1:], ''))
            else:
                eqs.append('%s + %s' % (last_eq, pred))

    return eqs

In [149]:
def _populate_fit_check(fits, var_, id_=1, dname=None):
    """Updates the fit_check information"""
    fit_check = []
    if isinstance(var_, str):
        var_ = [var_]

    for idy, fit_ in enumerate(fits):
        n = fit_.nobs
        k = fit_.df_model
        aicc = fit_.aic + (2 * k * (k + 1) / (n - k - 1))
        d_score = scipy.stats.kstest(fit_.resid.values, 'norm')[0]

        fit_check.append(pd.Series({'data_set': dname,
                                    'equation':  fit_.model.formula,
                                    'var': var_[idy],
                                    'n': n,
                                    'k': k,
                                    'aic': fit_.aic,
                                    'aicc': aicc,
                                    'D_score': d_score,
                                    'pearson_r2': fit_.rsquared,
                                    'adj_r2': fit_.rsquared_adj,
                                    'cond no': fit_.condition_number},
                                   name=id_ + idy))
    return pd.DataFrame(fit_check)

In [150]:
def _check_watch(model_watch, prev_id):
    """..."""
    if model_watch is None:
        model_watch = pd.DataFrame(data=np.zeros((0, 11)),
                                   columns=['data_set', 'equation', 'var', 'n',
                                            'k', 'aic', 'aicc', 'D_score',
                                            'pearson_r2', 'adj_r2', 'cond no'])
        id_ = 1
        prev_id = None
        prev_eq = None
    else:
        id_ = max(model_watch.index) + 1
        if prev_id is None:
            models = pd.DataFrame(model_watch).transpose()
            ranked = models.sort(['aicc'], inplace=False).index
            prev_id = ranked[0]
        prev_eq = model_watch.loc[prev_id, 'equation']
    return model_watch, id_, prev_id, prev_eq

In [151]:
def _identify_best_model(fit_check, prev_id):
    """..."""
    fit_check['ref'] = prev_id
    prev_r2 = fit_check.loc[prev_id, 'adj_r2']
    prev_aicc = fit_check.loc[prev_id, 'aicc']
    fit_check['score'] = -10000*(((fit_check.adj_r2 - prev_r2) *
                                 (fit_check.aicc - prev_aicc)) /
                                  fit_check['cond no'])

    prev_id = fit_check.loc[fit_check.score == fit_check.score.max()].index[0]
        
    return prev_id

In [152]:
def olf_build_model(response, predictors, data, dname=None, model_watch=None,
                    prev_id=None):
    """Builds up a series of Ordinary Least Squares Models

    """
    # Looks for the last model being referenced as the best
    model_watch, id_, prev_id, prev_eq = _check_watch(model_watch, prev_id)

    # Builds up the model
    for var_ in predictors:
        # Builds up the list of equations to be added
        eqs = _equation_builder(var_, prev_eq, response)
        num_var = len(eqs)

        # Fits the equations
        fits = [smf.ols(eq, data=data).fit() for eq in eqs]

        # Summarizes the fit
        fit_check = _populate_fit_check(fits, var_, id_, dname)

        # Updates the fit check with the best previous model
        check_ids = fit_check.index.values
        if prev_id is None:
            prev_id = min(fit_check.index.values)
        else:
            fit_check.loc[prev_id] = model_watch.loc[prev_id]

        # Identifies the best model
        prev_id = _identify_best_model(fit_check, prev_id)

        model_watch = pd.concat((model_watch, fit_check.loc[check_ids]))

        # Gets the best fit equation
        prev_eq = model_watch.loc[prev_id, 'equation']

        # Advances the counter
        id_ = id_ + num_var

    return model_watch, prev_id


In [153]:
model_watch, prev_id = olf_build_model(
        response=response,
        predictors=vars1, 
        data=data.map_.loc[data.map_[columns1].dropna().index],
        dname='all_data',
        model_watch=None,
        prev_id=None
        )

In [154]:
sort_ = model_watch.sort_values('adj_r2', ascending=False)

In [155]:
sort_.loc[sort_.index[0], 'equation']

'PD_whole_tree_10k ~ lnAGE + COUNTRY + CHICKENPOX + TYPES_OF_PLANTS + ALCOHOL_FREQUENCY + EXERCISE_FREQUENCY + EXERCISE_LOCATION + FLOSSING_FREQUENCY'

In [156]:
print smf.ols(sort_.loc[sort_.index[0], 'equation'],
        data.map_.loc[data.map_[columns1].dropna().index],
       ).fit().summary()

                            OLS Regression Results                            
Dep. Variable:      PD_whole_tree_10k   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.038
Method:                 Least Squares   F-statistic:                     2.113
Date:                Mon, 08 Feb 2016   Prob (F-statistic):            0.00229
Time:                        15:05:46   Log-Likelihood:                -1894.7
No. Observations:                 617   AIC:                             3835.
Df Residuals:                     594   BIC:                             3937.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                                     coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------------

In [157]:
model_watch, prev_id = olf_build_model(
        response=response,
        predictors=['&CSECTION'], 
        data=data.map_.loc[data.map_[vars1[1:]].dropna().index],
        dname='all_data',
        model_watch=model_watch,
        prev_id=13
        )

KeyError: 'the label [13] is not in the [index]'

In [51]:
model_watch[['var', 'score', 'adj_r2', 'cond no']]

Unnamed: 0,var,score,adj_r2,cond no
1,AGE_CORRECTED,-0.0,0.015425,192.438709
2,lnAGE,0.492993,0.017531,56.462439
3,SEX,-0.013083,0.017609,57.088401
4,COUNTRY,0.017035,0.019227,56.462759
5,CHICKENPOX,0.364094,0.018032,57.947843
6,ALCOHOL_FREQUENCY,4.105386,0.01422,60.306225
7,TYPES_OF_PLANTS,4.846804,0.025279,63.302277
8,CSECTION,0.282913,0.024221,64.63992
9,RACE,3.94997,0.031496,65.424034
10,DRINKING_WATER_SOURCE,2.534563,0.028164,70.250151


In [56]:
model_watch.loc[[12, 13, 20]].transpose()

Unnamed: 0,12,13,20
D_score,0.340893,0.341518,0.347631
adj_r2,0.0363381,0.0403305,0.0415517
aic,3837.68,3838.94,3837.2
aicc,3839.54,3841.51,3839.58
cond no,237.093,239.374,239.312
data_set,all_data,all_data,all_data
equation,PD_whole_tree_10k ~ lnAGE + COUNTRY + CHICKENP...,PD_whole_tree_10k ~ lnAGE + COUNTRY + CHICKENP...,PD_whole_tree_10k ~ lnAGE + COUNTRY + CHICKENP...
k,23,27,26
n,617,617,617
pearson_r2,0.072319,0.0823939,0.0820057
