In [1]:


import numpy as np
import skbio
import pandas as pd
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
% matplotlib inline

import americangut.ag_dictionary as agdic
import americangut.diversity_analysis as agdiv
import americangut.notebook_environment as agenv
from americangut.ag_data import AgData



In [2]:
import numpy as np
import pandas as pd
import scipy
import statsmodels.formula.api as smf


def olf_build_model(response, predictors, data, dname=None, model_watch=None,
                    prev_id=None):
    """Builds up a series of Ordinary Least Squares Models

    """

    # Looks for the last model being referenced as the best
    model_watch, id_, prev_id, prev_eq = _check_watch(model_watch, prev_id)

    if model_watch is None:
        model_watch = pd.DataFrame(data=np.zeros((0, 11)),
                                   columns=['data_set', 'equation', 'var', 'n',
                                            'k', 'aic', 'aicc', 'D_score',
                                            'pearson_r2', 'adj_r2', 'cond no'])
    # Builds up the model
    for var_ in predictors:
        # Builds up the list of equations to be added
        eqs = _equation_builder(var_, prev_eq, response)
        num_var = len(eqs)

        # Fits the equations
        fits = [smf.ols(eq, data=data).fit() for eq in eqs]

        # Summarizes the fit
        fit_check = _populate_fit_check(fits, var_, id_, dname)

        # Updates the fit check with the best previous model
        check_ids = fit_check.index.values
        if prev_id is None:
            prev_id = min(fit_check.index.values)
        else:
            fit_check.loc[prev_id] = model_watch.loc[prev_id]

        # Identifies the best model
        prev_id = _identify_best_model(fit_check, prev_id)

        model_watch = pd.concat((model_watch, fit_check.loc[check_ids]))

        # Gets the best fit equation
        prev_eq = model_watch.loc[prev_id, 'equation']

        # Advances the counter
        id_ = id_ + num_var

    return model_watch, prev_eq


def control_cat_order(df, category, old_order=None, new_order=None,
                      counts=None, drop=None):
    """..."""
    # Sets up default variables
    if old_order is None:
        old_order = sorted(df.groupby(category).groups.keys())
    if new_order is None:
        new_order = old_order
    if counts is None:
        counts = np.arange(0, len(new_order))
    if isinstance(drop, str):
        drop = [drop]
    elif drop is None:
        drop = []

    # Sets the dropped categories to nans
    for cat in drop:
        if cat in old_order:
            raise ValueError('%s cannot be dropped and categorized' % cat)
        df.loc[df[category] == cat, category] = np.nan

    # Orders the new category
    for (old_, new_, count) in zip(*[old_order, new_order, counts]):
        df.loc[df[category] == old_, category] = '(%i)%s' % (count, old_)


def _check_watch(model_watch, prev_id):
    """..."""
    if model_watch is None:
        model_watch = {}
        id_ = 1
        prev_id = None
        prev_eq = None
    else:
        id_ = max(model_watch.keys()) + 1
        if prev_id is None:
            models = pd.DataFrame(model_watch).transpose()
            ranked = models.sort(['aicc'], inplace=False).index
            prev_id = ranked[0]
        prev_eq = model_watch[prev_id]['equation']
    return model_watch, id_, prev_id, prev_eq


def _equation_builder(vars_, last_eq=None, response=None):
    """Builds a set of equations trying multiple predictor options"""
    # Checks enough variables are defined
    if last_eq is None and response is None:
        raise ValueError('A response or last equation must be specified.')

    # Checks the class of vars
    if isinstance(vars_, str):
        vars_ = [vars_]

    # Builds the equation
    if last_eq is None:
        return ['%s ~ %s' % (response, pred) for pred in vars_]
    else:
        eqs = []
        for pred in vars_:
            if '&' in pred:
                eqs.append(last_eq.replace(' + %s' % pred[1:], ''))
            else:
                eqs.append('%s + %s' % (last_eq, pred))

    return eqs


def _populate_fit_check(fits, var_, id_=1, dname=None):
    """Updates the fit_check information"""
    fit_check = []
    if isinstance(var_, str):
        var_ = [var_]

    for idy, fit_ in enumerate(fits):
        n = fit_.nobs
        k = fit_.df_model
        aicc = fit_.aic + (2 * k * (k + 1) / (n - k - 1))
        d_score = scipy.stats.kstest(fit_.resid.values, 'norm')[0]

        fit_check.append(pd.Series({'data_set': dname,
                                    'equation':  fit_.model.formula,
                                    'var': var_[idy],
                                    'n': n,
                                    'k': k,
                                    'aic': fit_.aic,
                                    'aicc': aicc,
                                    'D_score': d_score,
                                    'pearson_r2': fit_.rsquared,
                                    'adj_r2': fit_.rsquared_adj,
                                    'cond no': fit_.condition_number},
                                   name=id_ + idy))
    return pd.DataFrame(fit_check)


def _identify_best_model(fit_check, prev_id):
    """..."""
    fit_check['ref'] = prev_id
    prev_r2 = fit_check.loc[prev_id, 'adj_r2']
    prev_aicc = fit_check.loc[prev_id, 'aicc']
    fit_check['score'] = -10000*(((fit_check.adj_r2 - prev_r2) *
                                 (fit_check.aicc - prev_aicc)) /
                                 fit_check['cond no'])
    prev_id = fit_check.loc[fit_check.score == fit_check.score.max()].index[0]
    return prev_id


In [3]:
bodysite = 'fecal'
sequence_trim = '100nt'
rarefaction_depth = '10k'

use_subset = False
use_one_sample = True

In [4]:
fecal_data = AgData(bodysite=bodysite, 
                    trim=sequence_trim, 
                    depth=rarefaction_depth, 
                    sub_participants=use_subset, 
                    one_sample=use_one_sample)

In [5]:
fecal_data.drop_alpha_outliers()
fecal_data.drop_bmi_outliers()
fecal_data.clean_age()

In [7]:
ibd = {u'10317.000012951', u'10317.000017746', u'10317.000001620',
       u'10317.000001748', u'10317.000009382', u'10317.000001544',
       u'10317.000005953', u'10317.000009004', u'10317.000013010',
       u'10317.000014579', u'10317.000001827', u'10317.000013023',
       u'10317.000009157', u'10317.000001362', u'10317.000001135',
       u'10317.000004855', u'10317.000005878', u'10317.000007719',
       u'10317.000009126', u'10317.000007068', u'10317.000010134',
       u'10317.000010112', u'10317.000003300', u'10317.000002830',
       u'10317.000014236', u'10317.000012936', u'10317.000013575',
       u'10317.000001047', u'10317.000005360', u'10317.000001333',
       u'10317.000015581', u'10317.000003189', u'10317.000001128',
       u'10317.000015907', u'10317.000017338', u'10317.000013062',
       u'10317.000013595', u'10317.000009533', u'10317.000010546',
       u'10317.000001685', u'10317.000004163', u'10317.000001751',
       u'10317.000013551', u'10317.000005889', u'10317.000002036',
       u'10317.000002783', u'10317.000004157', u'10317.000001364',
       u'10317.000001622', u'10317.000004783', u'10317.000008999',
       u'10317.000009149', u'10317.000001895', u'10317.000011375',
       u'10317.000014880', u'10317.000001291', u'10317.000010876',
       u'10317.000014607', u'10317.000014987', u'10317.000002271',
       u'10317.000003898', u'10317.000009144', u'10317.000002482',
       u'10317.000014608', u'10317.000013134', u'10317.000001647',
       u'10317.000011959', u'10317.000009626', u'10317.000004025',
       u'10317.000014291', u'10317.000001351', u'10317.000002859',
       u'10317.000014118', u'10317.000004612', u'10317.000018383',
       u'10317.000001575', u'10317.000004161', u'10317.000015873',
       u'10317.000005971', u'10317.000004192', u'10317.000001322',
       u'10317.000004162', u'10317.000002336', u'10317.000010147',
       u'10317.000006673', u'10317.000004752', u'10317.000005851',
       u'10317.000009236', u'10317.000001363', u'10317.000014458',
       u'10317.000011093', u'10317.000009164', u'10317.000005810',
       u'10317.000003047', u'10317.000015849', u'10317.000004790',
       }
actual = set(ibd).intersection(fecal_data.map_.index)
# len(actual)
fecal_data.map_.loc[actual, 'IBD_DIAGONOSIS'] = 'Yes'

In [8]:
map_ = fecal_data.map_.loc[fecal_data.map_.AGE_CORRECTED > 20]

In [15]:
questions = ['IBD', 'SEX', 'ANTIBIOTIC_HISTORY', 'COUNTRY', 'RACE', 'SEX', 'BOWEL_MOVEMENT_QUALITY', 'BOWEL_MOVEMENT_FREQUENCY']

In [16]:
for name in questions:
    question = agdic.ag_dictionary(name)
    fecal_data.clean_group(question)
    question.label_order(fecal_data.map_)
map_ = fecal_data.map_.loc[fecal_data.map_['AGE_CORRECTED'] > 19]

In [None]:
map_.

In [17]:
map_.groupby('IBD').count().max(1)

IBD
(0) Yes      76
(1) No     3404
dtype: int64

In [27]:
vios_skip = {'VIOSCREEN_FINISHED',
             'VIOSCREEN_DATABASE',
             'VIOSCREEN_MULTIVITAMIN',
             'VIOSCREEN_QUESTIONNAIRE',
             'VIOSCREEN_GENDER',
             'VIOSCREEN_SCFV',
             'VIOSCREEN_PROTOCOL',
             'VIOSCREEN_CALCIUM_FREQ',
             'VIOSCREEN_CALCIUM',
             'VIOSCREEN_STARTED',
             'VIOSCREEN_NUTRIENT_RECOMMENDATION',
             'VIOSCREEN_PROCDATE',
             'VIOSCREEN_DOB',
             'VIOSCREEN_SCF',
             'VIOSCREEN_MULTIVITAMIN_FREQ'}
vios_cols = [x for x in map_.columns if 'VIOSCREEN' in x and not x in vios_skip]
for col in vios_cols:
    map_[col] = map_[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [28]:
map_['lnAGE'] = map_['AGE_CORRECTED'].apply(lambda x: np.log(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [35]:
olf_build_model('PD_whole_tree_10k', ['AGE_CORRECTED', 'lnAGE', 'VIOSCREEN_HEI'], map_)

TypeError: cannot concatenate a non-NDFrame object

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=map_.loc[map_.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map = map_.loc[map_[['AGE_CORRECTED', 'VIOSCREEN_FIBER']].dropna().index]
sub_map['COUNTRY'] = sub_map.COUNTRY.apply(lambda x: x if x in {'USA', 'United Kingdom'} else np.nan)

In [None]:
f_ = lambda x: x if x in {'USA', 'United Kingdom'} else np.nan

In [None]:
abx = agdic.ag_dictionary('ANTIBIOTIC_HISTORY')
abx.remap_groups(sub_map)
abx.label_order(sub_map)

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map['lnAge'] = sub_map.AGE_CORRECTED.apply(lambda x: np.log(x))
sub_map.loc[sub_map.AGE_CORRECTED < 20, 'lnAge'] = np.nan

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge + COUNTRY', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge + COUNTRY + ANTIBIOTIC_HISTORY', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED + COUNTRY + VIOSCREEN_HEI_SCORE', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map.loc[sub_map.AGE_CORRECTED > 20].shape

In [None]:
plt.plot(map_['VIOSCREEN_FIBER'], map_['PD_whole_tree_10k'], 'o')

In [None]:
sorted(vios_cols)

In [None]:
vios_cols