In [5]:
import numpy as np
import skbio
import pandas as pd
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
% matplotlib inline

import americangut.ag_dictionary as agdic
import americangut.diversity_analysis as agdiv
import americangut.notebook_environment as agenv
from americangut.ag_data import AgData



Let's load the full dataset, and explore it.

In [6]:
bodysite = 'fecal'
sequence_trim = '100nt'
rarefaction_depth = '10k'

use_subset = False
use_one_sample = True

In [7]:
fecal_data = AgData(bodysite=bodysite, 
                    trim=sequence_trim, 
                    depth=rarefaction_depth, 
                    sub_participants=use_subset, 
                    one_sample=use_one_sample)

In [8]:
fecal_data.drop_alpha_outliers()
fecal_data.drop_bmi_outliers()
fecal_data.clean_age()

In [9]:
ibd = {u'10317.000012951', u'10317.000017746', u'10317.000001620',
       u'10317.000001748', u'10317.000009382', u'10317.000001544',
       u'10317.000005953', u'10317.000009004', u'10317.000013010',
       u'10317.000014579', u'10317.000001827', u'10317.000013023',
       u'10317.000009157', u'10317.000001362', u'10317.000001135',
       u'10317.000004855', u'10317.000005878', u'10317.000007719',
       u'10317.000009126', u'10317.000007068', u'10317.000010134',
       u'10317.000010112', u'10317.000003300', u'10317.000002830',
       u'10317.000014236', u'10317.000012936', u'10317.000013575',
       u'10317.000001047', u'10317.000005360', u'10317.000001333',
       u'10317.000015581', u'10317.000003189', u'10317.000001128',
       u'10317.000015907', u'10317.000017338', u'10317.000013062',
       u'10317.000013595', u'10317.000009533', u'10317.000010546',
       u'10317.000001685', u'10317.000004163', u'10317.000001751',
       u'10317.000013551', u'10317.000005889', u'10317.000002036',
       u'10317.000002783', u'10317.000004157', u'10317.000001364',
       u'10317.000001622', u'10317.000004783', u'10317.000008999',
       u'10317.000009149', u'10317.000001895', u'10317.000011375',
       u'10317.000014880', u'10317.000001291', u'10317.000010876',
       u'10317.000014607', u'10317.000014987', u'10317.000002271',
       u'10317.000003898', u'10317.000009144', u'10317.000002482',
       u'10317.000014608', u'10317.000013134', u'10317.000001647',
       u'10317.000011959', u'10317.000009626', u'10317.000004025',
       u'10317.000014291', u'10317.000001351', u'10317.000002859',
       u'10317.000014118', u'10317.000004612', u'10317.000018383',
       u'10317.000001575', u'10317.000004161', u'10317.000015873',
       u'10317.000005971', u'10317.000004192', u'10317.000001322',
       u'10317.000004162', u'10317.000002336', u'10317.000010147',
       u'10317.000006673', u'10317.000004752', u'10317.000005851',
       u'10317.000009236', u'10317.000001363', u'10317.000014458',
       u'10317.000011093', u'10317.000009164', u'10317.000005810',
       u'10317.000003047', u'10317.000015849', u'10317.000004790',
       }
actual = set(ibd).intersection(fecal_data.map_.index)
# len(actual)
fecal_data.map_.loc[actual, 'IBD_DIAGONOSIS'] = 'Yes'

In [10]:
questions = ['IBD', 'SEX', 'ANTIBIOTIC_HISTORY', 'COUNTRY', 'RACE', 'BOWEL_MOVEMENT_QUALITY',
             'BOWEL_MOVEMENT_FREQUENCY', 'AGE_CORRECTED']

In [11]:
questions = ['AGE_CORRECTED', 'ALCOHOL_FREQUENCY', 
             'ANTIBIOTIC_HISTORY', 'BMI_CAT',
             'BOWEL_MOVEMENT_FREQUENCY', 'BOWEL_MOVEMENT_QUALITY', 'CAT',
             'CHICKENPOX', 'COLLECTION_MONTH', 'CONSUME_ANIMAL_PRODUCTS_ABX',
             'CONTRACEPTIVE', 'COUNTRY', 'CSECTION', 'DIABETES', 'DIET_TYPE',
             'DRINKING_WATER_SOURCE', 'EXERCISE_FREQUENCY', 'EXERCISE_LOCATION',
             'FED_AS_INFANT', 'FERMENTED_PLANT_FREQUENCY', 'FLOSSING_FREQUENCY',
             'FRUIT_FREQUENCY', 'GLUTEN', 'HOMECOOKED_MEALS_FREQUENCY', 'IBD',
             'LACTOSE', 'LAST_TRAVEL', 'LOWGRAIN_DIET_TYPE', 'LUNG_DISEASE',
             'MIGRAINE', 'MULTIVITAMIN', 'OLIVE_OIL',
             'ONE_LITER_OF_WATER_A_DAY_FREQUENCY', 'POOL_FREQUENCY',
             'PREPARED_MEALS_FREQUENCY', 'PROBIOTIC_FREQUENCY', 'RACE',
             'SEASONAL_ALLERGIES', 'SEX', 'SLEEP_DURATION', 'SMOKING_FREQUENCY',
             'SUGARY_SWEETS_FREQUENCY', 'TYPES_OF_PLANTS', 'VEGETABLE_FREQUENCY',
             'VITAMIN_B_SUPPLEMENT_FREQUENCY', 'VITAMIN_D_SUPPLEMENT_FREQUENCY',
             'WEIGHT_CHANGE']

In [12]:
import numpy as np
import pandas as pd
import scipy
import statsmodels.formula.api as smf


def olf_build_model(response, predictors, data, dname=None, model_watch=None,
                    prev_id=None):
    """Builds up a series of Ordinary Least Squares Models

    """

    # Looks for the last model being referenced as the best
    id_, prev_id, prev_eq = _check_watch(model_watch, prev_id)

    if model_watch is None:
        model_watch = pd.DataFrame(data=np.zeros((0, 11)),
                                   columns=['data_set', 'equation', 'var', 'n',
                                            'k', 'aic', 'aicc', 'D_score',
                                            'pearson_r2', 'adj_r2', 'cond no'])
    # Builds up the model
    for var_ in predictors:
        # Builds up the list of equations to be added
        eqs = _equation_builder(var_, prev_eq, response)
        num_var = len(eqs)

        # Fits the equations
        fits = [smf.ols(eq, data=data).fit() for eq in eqs]

        # Summarizes the fit
        fit_check = _populate_fit_check(fits, var_, id_, dname)

        # Updates the fit check with the best previous model
        check_ids = fit_check.index.values
        if prev_id is None:
            prev_id = min(fit_check.index.values)
        else:
            fit_check.loc[prev_id] = model_watch.loc[prev_id]

        # Identifies the best model
        prev_id = identify_best_model(fit_check, prev_id)

        model_watch = pd.concat((model_watch, fit_check.loc[check_ids]))

        # Gets the best fit equation
        prev_eq = model_watch.loc[prev_id, 'equation']

        # Advances the counter
        id_ = id_ + num_var

    return model_watch, prev_eq, id_


def control_cat_order(df, category, old_order=None, new_order=None,
                      counts=None, drop=None):
    """..."""
    # Sets up default variables
    if old_order is None:
        old_order = sorted(df.groupby(category).groups.keys())
    if new_order is None:
        new_order = old_order
    if counts is None:
        counts = np.arange(0, len(new_order))
    if isinstance(drop, str):
        drop = [drop]
    elif drop is None:
        drop = []

    # Sets the dropped categories to nans
    for cat in drop:
        if cat in old_order:
            raise ValueError('%s cannot be dropped and categorized' % cat)
        df.loc[df[category] == cat, category] = np.nan

    # Orders the new category
    for (old_, new_, count) in zip(*[old_order, new_order, counts]):
        df.loc[df[category] == old_, category] = '(%i)%s' % (count, old_)


def _check_watch(model_watch, prev_id):
    """..."""
    if model_watch is None:
        model_watch = pd.DataFrame(data=np.zeros((0, 11)),
                                   columns=['data_set', 'equation', 'var', 'n',
                                            'k', 'aic', 'aicc', 'D_score',
                                            'pearson_r2', 'adj_r2', 'cond no'])
        id_ = 1
        prev_id = None
        prev_eq = None
    else:
        id_ = max(model_watch.keys()) + 1
        if prev_id is None:
            models = pd.DataFrame(model_watch).transpose()
            ranked = models.sort(['aicc'], inplace=False).index
            prev_id = ranked[0]
        prev_eq = model_watch[prev_id]['equation']
    return model_watch, id_, prev_id, prev_eq


def _equation_builder(vars_, last_eq=None, response=None):
    """Builds a set of equations trying multiple predictor options"""
    # Checks enough variables are defined
    if last_eq is None and response is None:
        raise ValueError('A response or last equation must be specified.')

    # Checks the class of vars
    if isinstance(vars_, str):
        vars_ = [vars_]

    # Builds the equation
    if last_eq is None:
        return ['%s ~ %s' % (response, pred) for pred in vars_]
    else:
        eqs = []
        for pred in vars_:
            if '&' in pred:
                eqs.append(last_eq.replace(' + %s' % pred[1:], ''))
            else:
                eqs.append('%s + %s' % (last_eq, pred))

    return eqs


def _populate_fit_check(fits, var_, id_=1, dname=None):
    """Updates the fit_check information"""
    fit_check = []
    if isinstance(var_, str):
        var_ = [var_]

    for idy, fit_ in enumerate(fits):
        n = fit_.nobs
        k = fit_.df_model
        aicc = fit_.aic + (2 * k * (k + 1) / (n - k - 1))
        d_score = scipy.stats.kstest(fit_.resid.values, 'norm')[0]

        fit_check.append(pd.Series({'data_set': dname,
                                    'equation':  fit_.model.formula,
                                    'var': var_[idy],
                                    'n': n,
                                    'k': k,
                                    'aic': fit_.aic,
                                    'aicc': aicc,
                                    'D_score': d_score,
                                    'pearson_r2': fit_.rsquared,
                                    'adj_r2': fit_.rsquared_adj,
                                    'cond no': fit_.condition_number},
                                   name=id_ + idy))
    return pd.DataFrame(fit_check)


def _identify_best_model(fit_check, prev_id):
    """..."""
    fit_check['ref'] = prev_id
    prev_r2 = fit_check.loc[prev_id, 'adj_r2']
    prev_aicc = fit_check.loc[prev_id, 'aicc']
    fit_check['score'] = -10000*(((fit_check.adj_r2 - prev_r2) *
                                 (fit_check.aicc - prev_aicc)) /
                                 fit_check['cond no'])
    prev_id = fit_check.loc[fit_check.score == fit_check.score.max()].index[0]
    return prev_id


In [13]:
fecal_data.map_.groupby('CAT').count().max(1)

CAT
false    2672
true     1079
dtype: int64

In [14]:
fecal_data.map_.AGE_CORRECTED.min()

1.0

In [15]:
for name in questions[:-1]:
    question = agdic.ag_dictionary(name)
    if name == 'COUNTRY':
        question.order = ['USA', 'Australia', 'Canada', 'United Kingdom']
    elif name == 'IBD':
        question.order = ['No', 'Yes']
    fecal_data.clean_group(question)
    if name == 'AGE_CORRECTED':
        continue
    question.label_order(fecal_data.map_)
map_ = fecal_data.map_

In [16]:
actual = set(ibd).intersection(map_.index)
map_.loc[actual, 'IBD'] = '(0) Yes'

In [17]:
map_.groupby('IBD').count().max(1)

IBD
(0) No     3480
(0) Yes      87
dtype: int64

In [18]:
map_.AGE_CORRECTED.min()

1.0

In [24]:
map_['lnAGE'] = map_['AGE_CORRECTED'].apply(lambda x: np.log(x))

In [29]:
watch_cols = questions
watch_cols.append('lnAGE')
watch_cols.append('PD_whole_tree_10k')

In [26]:
map_[watch_cols].dropna()

Unnamed: 0_level_0,AGE_CORRECTED,ALCOHOL_FREQUENCY,ANTIBIOTIC_HISTORY,BMI_CAT,BOWEL_MOVEMENT_FREQUENCY,BOWEL_MOVEMENT_QUALITY,CAT,CHICKENPOX,COLLECTION_MONTH,CONSUME_ANIMAL_PRODUCTS_ABX,...,SLEEP_DURATION,SMOKING_FREQUENCY,SUGARY_SWEETS_FREQUENCY,TYPES_OF_PLANTS,VEGETABLE_FREQUENCY,VITAMIN_B_SUPPLEMENT_FREQUENCY,VITAMIN_D_SUPPLEMENT_FREQUENCY,WEIGHT_CHANGE,PD_whole_tree_10k,PD_whole_tree_10k
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10317.000010102,33,(3) 3-5 times/week,(3) More than a year,(1) Normal,(1) One,(0) Normal,(1) no,(1) Yes,(9) October,(1) Yes,...,(1) 6-7 hours,(0) Never,(3) 3-5 times/week,(4) More than 30,(3) Daily,(0) Never,(0) Never,Remained stable,29.260214,29.260214
10317.000007728,73,(3) 3-5 times/week,(3) More than a year,(1) Normal,(1) One,(0) Normal,(1) no,(1) Yes,(6) July,(0) No,...,(2) 7-8 hours,(0) Never,(2) 1-2 times/week,(2) 11 to 20,(3) Daily,(0) Never,(3) 3-5 times/week,Remained stable,32.076176,32.076176
10317.000014890,56,(3) 3-5 times/week,(3) More than a year,(1) Normal,(1) One,(2) Diarrhea,(1) no,(1) Yes,(6) July,(0) No,...,(1) 6-7 hours,(0) Never,(2) 1-2 times/week,(4) More than 30,(3) Daily,(0) Never,(3) 3-5 times/week,Remained stable,29.193615,29.193615
10317.000018362,51,(0) Never,(3) More than a year,(2) Overweight,(1) One,(0) Normal,(1) no,(1) Yes,(8) September,(0) No,...,(3) 8 or more hours,(0) Never,(2) 1-2 times/week,(4) More than 30,(3) Daily,(3) 3-5 times/week,(3) 3-5 times/week,Remained stable,40.706463,40.706463
10317.000001892,50,(1) A few times/month,(3) More than a year,(2) Overweight,(1) One,(0) Normal,(1) no,(0) No,(5) June,(0) No,...,(1) 6-7 hours,(0) Never,(2) 1-2 times/week,(1) 6 to 10,(3) Daily,(0) Never,(4) Daily,Remained stable,34.031797,34.031797
10317.000013378,74,(2) 1-2 times/week,(3) More than a year,(1) Normal,(1) One,(0) Normal,(0) yes,(1) Yes,(1) February,(1) Yes,...,(2) 7-8 hours,(0) Never,(2) 1-2 times/week,(2) 11 to 20,(2) 3-5 times/week,(0) Never,(0) Never,Remained stable,43.490282,43.490282
10317.000009148,45,(2) 1-2 times/week,(3) More than a year,(1) Normal,(2) Two,(2) Diarrhea,(0) yes,(1) Yes,(10) November,(1) Yes,...,(1) 6-7 hours,(0) Never,(2) 1-2 times/week,(1) 6 to 10,(1) 1-2 times/week,(0) Never,(0) Never,Remained stable,34.775081,34.775081
10317.000015594,55,(3) 3-5 times/week,(3) More than a year,(1) Normal,(1) One,(0) Normal,(0) yes,(1) Yes,(5) June,(0) No,...,(2) 7-8 hours,(0) Never,(1) Less than once/week,(2) 11 to 20,(3) Daily,(0) Never,(0) Never,Remained stable,40.007861,40.007861
10317.000011191,32,(3) 3-5 times/week,(3) More than a year,(1) Normal,(2) Two,(0) Normal,(1) no,(1) Yes,(2) March,(0) No,...,(2) 7-8 hours,(0) Never,(2) 1-2 times/week,(3) 21 to 30,(3) Daily,(0) Never,(0) Never,Remained stable,30.839628,30.839628
10317.000003001,45,(2) 1-2 times/week,(1) 6 months,(1) Normal,(1) One,(0) Normal,(1) no,(1) Yes,(4) May,(1) Yes,...,(3) 8 or more hours,(0) Never,(3) 3-5 times/week,(2) 11 to 20,(2) 3-5 times/week,(0) Never,(3) 3-5 times/week,Remained stable,27.079434,27.079434


In [27]:
eq = None
model_watch = None
id_=0
prev_id = None
for var in watch_cols[:-1]:
    eqs = _equation_builder(var,
                           response='PD_whole_tree_10k',
                           last_eq=eq
                           )
    fits = [smf.ols(eq, data=map_[watch_cols]).fit() for eq in eqs]
    fit_check = _populate_fit_check(fits, var, id_, '')
    
    check_ids = fit_check.index.values
    if prev_id is None:
        prev_id = min(fit_check.index.values)
#     else:
#         fit_check.loc[prev_id] = model_watch.loc[prev_id]


ValueError: operands could not be broadcast together with shapes (3746,) (3746,2) 

In [36]:
age_ = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=map_[watch_cols]).fit()

In [37]:
print age_.summary()

ValueError: shapes (3746,3) and (3746,3) not aligned: 3 (dim 1) != 3746 (dim 0)

In [32]:
print lage.summary()

ValueError: shapes (3746,3) and (3746,3) not aligned: 3 (dim 1) != 3746 (dim 0)

In [None]:
map_.groupby('SEX').count().max(1)

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX', data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY', data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD', data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE', data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE + ANTIBIOTIC_HISTORY', data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE + ANTIBIOTIC_HISTORY', data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE + ANTIBIOTIC_HISTORY + TYPES_OF_PLANTS', data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE + '
               'ANTIBIOTIC_HISTORY + TYPES_OF_PLANTS + BMI_CAT', 
               data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + RACE + '
               'ANTIBIOTIC_HISTORY + TYPES_OF_PLANTS + BMI_CAT + EXERCISE_FREQUENCY', 
               data=map_[watch_cols]).fit()
print age2.summary()

In [None]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + ANTIBIOTIC_HISTORY + BOWEL_MOVEMENT_QUALITY', data=map_[watch_cols]).fit()
print age2.summary()

In [23]:
age2 = smf.ols('PD_whole_tree_10k ~ lnAGE*SEX + COUNTRY + IBD + ANTIBIOTIC_HISTORY + CHICKENPOX', data=map_[watch_cols]).fit()
print age2.summary()

NameError: name 'lnAGE' is not defined

In [24]:
questions

['IBD',
 'SEX',
 'ANTIBIOTIC_HISTORY',
 'COUNTRY',
 'RACE',
 'BOWEL_MOVEMENT_QUALITY',
 'BOWEL_MOVEMENT_FREQUENCY',
 'AGE_CORRECTED',
 'VIOSCREEN_MANNITOL',
 'VIOSCREEN_SFA100',
 'VIOSCREEN_SFA170',
 'VIOSCREEN_VEGETABLE_SERVINGS',
 'VIOSCREEN_SORBITOL',
 'VIOSCREEN_SFA80',
 'VIOSCREEN_BETACAR',
 'VIOSCREEN_LOW_FAT_DAIRY_SERVING',
 'VIOSCREEN_NIACINEQ',
 'VIOSCREEN_GAMMTOCO',
 'VIOSCREEN_LYSINE',
 'VIOSCREEN_ISOMALT',
 'VIOSCREEN_SUCROSE',
 'VIOSCREEN_CLAC9T11',
 'VIOSCREEN_FIBER',
 'VIOSCREEN_LACTITOL',
 'VIOSCREEN_HEI_NON_JUICE_FRT',
 'VIOSCREEN_OXALICM',
 'VIOSCREEN_ALPHACAR',
 'VIOSCREEN_LYCOPENE',
 'VIOSCREEN_VEG5_DAY',
 'VIOSCREEN_HEI_MILK',
 'VIOSCREEN_CLAT10C12',
 'VIOSCREEN_HEI_MEAT_BEANS',
 'VIOSCREEN_HEI_SOL_FAT_ALC_ADD_SUG',
 'VIOSCREEN_COPPER',
 'VIOSCREEN_HEI2010_REFINED_GRAINS',
 'VIOSCREEN_LUTZEAX',
 'VIOSCREEN_SALAD_VEGETABLE_SERVINGS',
 'VIOSCREEN_SFA200',
 'VIOSCREEN_HEI2010_DAIRY',
 'VIOSCREEN_FIBH2O',
 'VIOSCREEN_GLAC',
 'VIOSCREEN_SFA140',
 'VIOSCREEN_TAGATOSE',
 

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=map_.loc[map_.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map = map_.loc[map_[['AGE_CORRECTED', 'VIOSCREEN_FIBER']].dropna().index]
sub_map['COUNTRY'] = sub_map.COUNTRY.apply(lambda x: x if x in {'USA', 'United Kingdom'} else np.nan)

In [None]:
f_ = lambda x: x if x in {'USA', 'United Kingdom'} else np.nan

In [None]:
abx = agdic.ag_dictionary('ANTIBIOTIC_HISTORY')
abx.remap_groups(sub_map)
abx.label_order(sub_map)

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map['lnAge'] = sub_map.AGE_CORRECTED.apply(lambda x: np.log(x))
sub_map.loc[sub_map.AGE_CORRECTED < 20, 'lnAge'] = np.nan

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge + COUNTRY', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ lnAge + COUNTRY + ANTIBIOTIC_HISTORY', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
mod = smf.ols('PD_whole_tree_10k ~ AGE_CORRECTED + COUNTRY + VIOSCREEN_HEI_SCORE', data=sub_map.loc[sub_map.AGE_CORRECTED > 20]).fit()
print mod.summary()

In [None]:
sub_map.loc[sub_map.AGE_CORRECTED > 20].shape

In [None]:
plt.plot(map_['VIOSCREEN_FIBER'], map_['PD_whole_tree_10k'], 'o')

In [None]:
sorted(vios_cols)

In [None]:
vios_cols