In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import imp
import pymc3 as pm
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Apply seaborn's 'talk' plotting context to every figure drawn below
sn.set_context(context='talk')

In [2]:
# Optionally suppress warnings in the final version of the notebook
import warnings
warnings.filterwarnings('ignore')

# ECOREG full analysis (part 3: relationships)

This notebook follows on from the work described [here](http://nbviewer.jupyter.org/github/JamesSample/ECOREG/blob/master/ecoreg_full_analysis_1.ipynb) and [here](http://nbviewer.jupyter.org/github/JamesSample/ECOREG/blob/master/ecoreg_full_analysis_2.ipynb). In the first notebook, PCA was used to identify a subset of potentially interesting parameters with reduced collinearity; in the second, potentially interesting patterns highlighted by the PCA were explored by testing for differences between regulated and unregulated sites. In this notebook, I want to explore relationships between the variables, following up on hypotheses developed in notebooks 1 and 2.

In [3]:
# Read basic datasets
#
# The worksheet name is passed positionally (second argument of
# pd.read_excel). The keyword was spelled `sheetname` in older pandas and
# `sheet_name` in newer releases, so the positional form is the only spelling
# that works on both.

# Hydro indicators
in_xls = r'C:\Data\James_Work\Staff\Susi_S\ECOREG\Stats_Input_Data\hydro_indic.xlsx'
hi_df = pd.read_excel(in_xls, 'hydro_indic', index_col=0)

# Keep only the rows for the "pb" dataset and time period 3.
# NOTE(review): these pb-filtered indicators are joined to mzb_df in later
# cells - confirm that "pb" (rather than an MZB-specific subset) is intended.
hi_df = hi_df.query('(eco_dataset == "pb") and (time_per == 3)')

# Site props
in_xls = r'C:\Data\James_Work\Staff\Susi_S\ECOREG\Stats_Input_Data\site_props.xlsx'
site_df = pd.read_excel(in_xls, 'site_props', index_col=0)

# MZB
in_xls = r'C:\Data\James_Work\Staff\Susi_S\ECOREG\Stats_Input_Data\mzb_chem_ecol.xlsx'
mzb_df = pd.read_excel(in_xls, 'mzb_data', index_col=0)

# PB
in_xls = r'C:\Data\James_Work\Staff\Susi_S\ECOREG\Stats_Input_Data\pb_chem_ecol.xlsx'
pb_df = pd.read_excel(in_xls, 'pb_data', index_col=0)

In [4]:
# Import custom functions
# Load the project's helper functions straight from their source file and
# expose them as the module object `ecoreg`.
# NOTE(review): `imp` is deprecated in Python 3; this cell assumes the
# Python 2 environment used by the rest of the notebook.
func_path = (r'C:\Data\James_Work\Staff\Susi_S\ECOREG'
             r'\Python\ECOREG\ecoreg_code.py')
ecoreg = imp.load_source('ecoreg_code', func_path)

## 6. Relationships to explore

### 6.1. Germany

Based on the results in the previous two notebooks, it seems reasonable to concentrate on the following reduced set of response and explanatory variables for Germany:

*(TODO: the list of variables for Germany has not been added yet.)*



### 6.2. Norway

#### 6.2.1. Number of taxa (`n_taxa`) — TODO: confirm section title

In [None]:
# Columns to carry forward, plus the two grouping columns
# ('country', 'regulated')
cols = [
    'abund', 'n_taxa', 'n_genera', 'sessil', 'acti_filt_feed', 'swim_div',
    'p50', 'cv', 'revs_per_yr', 'av_rise_rt', 'av_fall_rt',
    'tn', 'toc',
    'country', 'regulated',
]

# Join the three tables side by side on their shared index, keep the chosen
# columns, then restrict to rows with country code "N"
df = pd.concat([site_df, mzb_df, hi_df], axis=1)[cols].query('country == "N"')

df.head()

In [None]:
# Columns to carry forward for the n_taxa analysis, plus the two grouping
# columns ('country', 'regulated'). Note that this replaces the `df` built
# in the previous cell.
cols = [
    'n_taxa',
    'tn', 'toc', 'cond',
    'mean', 'range', 'max12', 'revs_per_yr', 'days_to_p95', 'days_to_max',
    'max10', 'cv', 'p50',
    'country', 'regulated',
]

# Join the three tables side by side on their shared index, keep the chosen
# columns, then restrict to rows with country code "N"
df = pd.concat([site_df, mzb_df, hi_df], axis=1)[cols].query('country == "N"')

df.head()

In [None]:
# Lists of explan and resp vars
# Response variable and the full list of candidate explanatory variables
resp_var = 'n_taxa'
exp_vars = [
    'tn', 'toc', 'cond', 'mean', 'range', 'max12',
    'revs_per_yr', 'days_to_p95', 'days_to_max', 'max10', 'cv',
]

# Lasso path for the response against all candidates
res_df, fig = ecoreg.plot_lasso_path(df, resp_var, exp_vars)

# NOTE(review): `mpld3` is not imported in any cell visible here - it must be
# imported before this line will run on a fresh kernel.
mpld3.display(fig)

In [None]:
# Response variable and a reduced list of five explanatory variables
resp_var = 'n_taxa'
exp_vars = [
    'tn',
    'cv',
    'toc',
    'revs_per_yr',
    'mean',
]

# Lasso path for the response against the reduced set
res_df, fig = ecoreg.plot_lasso_path(df, resp_var, exp_vars)

# NOTE(review): `mpld3` is not imported in any cell visible here - it must be
# imported before this line will run on a fresh kernel.
mpld3.display(fig)

In [None]:
# Lists of explan and resp vars
exp_vars = ['tn', 'toc', 'cond', 'mean', 'range', 'max12', 'revs_per_yr', 
            'days_to_p95', 'days_to_max', 'max10', 'cv']
resp_var = 'n_taxa'

# `best_lasso` is not defined or imported in this notebook, so a bare call
# raises NameError on a fresh kernel. It is called through the `ecoreg`
# helper module here, in the same way as the other project functions
# (plot_lasso_path, robust_lin_reg).
# NOTE(review): confirm that ecoreg_code.py defines best_lasso.
# kcv=3 presumably selects 3-fold cross-validation - verify against the helper.
params = ecoreg.best_lasso(df, resp_var, exp_vars, kcv=3, cv_path=True,
                           hists=True)
params

In [None]:
# Model formula: n_taxa against tn only ('+ cv' is a candidate extra term
# that is currently switched off)
mod_str = 'n_taxa ~ tn'

# Regression. Pass alpha=0 for OLS. Larger alpha gives a bigger penalty on the
# size of the parameter estimates.
# The penalty-mixing keyword is spelled `L1_wt` (capital L) in statsmodels;
# the lower-case spelling is not a recognised argument, so the requested
# weighting was never applied. L1_wt=0 puts all of the penalty on the L2
# (ridge) term.
# NOTE(review): on some newer statsmodels releases the object returned by
# fit_regularized may not provide summary(); if plain OLS is all that is
# needed, `.fit()` is the safer call - confirm against the installed version.
model = smf.ols(mod_str, data=df).fit_regularized(alpha=0, L1_wt=0)

# Parenthesised so the cell parses under both Python 2 and Python 3
print(model.summary())

# Plot: grid of partial regression plots, one panel per model term
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(model, fig=fig)

In [None]:
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Best-subset selection: fit an OLS model for every non-empty combination of
# the explanatory variables and keep the combination with the lowest BIC.

# Lists of explan and resp vars
exp_vars = ['tn', 'toc', 'cond', 'mean', 'range', 'max12', 'revs_per_yr', 
            'days_to_p95', 'days_to_max', 'max10', 'cv']
resp_var = 'n_taxa'

# Use one complete-case table for every candidate model. BIC values are only
# comparable between models fitted to the same observations, and a missing
# value would otherwise propagate into the fit and make the `min` below
# unreliable. When the data contain no missing values this is a no-op.
bic_df = df[[resp_var] + exp_vars].dropna()

# Dict to store BIC values, keyed by the tuple of variable names in the model
bics = {}

# Loop over all combinations (2**len(exp_vars) - 1 models in total)
for k in range(1, len(exp_vars)+1):
    for variables in itertools.combinations(exp_vars, k):
        # Design matrix: the chosen columns plus an intercept term
        preds = sm.add_constant(bic_df[list(variables)])

        # Compute OLS results
        res = sm.OLS(bic_df[[resp_var]], preds).fit()

        # Add result to dict
        bics[variables] = res.bic

# Get the combination with lowest BIC
best_vars = list(min(bics, key=bics.get))

# Refit the winning model so that its full results can be reported
preds = sm.add_constant(bic_df[best_vars])
res = sm.OLS(bic_df[[resp_var]], preds).fit()

# Parenthesised so the cell parses under both Python 2 and Python 3
print('Regression results for the model with the lowest BIC:\n')
print(res.summary())

In [None]:
# Robust Bayesian regression
# Robust Bayesian regression of n_taxa on toc.
# var_map presumably tells the helper which df columns play the x and y
# roles - verify against ecoreg_code.py.
var_map = dict(x='toc', y='n_taxa')

# NOTE(review): no random seed is set in the visible cells, so the sampled
# results may differ between runs.
res = ecoreg.robust_lin_reg(
    df, var_map,
    plot_trace=True,
    plot_vars=True,
    mcmc='slice',
    steps=10000
)