In [1]:
import os

import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from pystan import StanModel
from sklearn.preprocessing import scale

# Default locations for the PyStan cache and the Stan model sources.
# They are set before `rethinking` is imported below -- presumably that
# package reads them at import time (TODO confirm in rethinking).
# `setdefault` keeps any value already exported in the environment, so the
# notebook can run on another machine without editing these paths.
os.environ.setdefault('PYSTAN_CACHE_PATH', '/home/gianluca/git/rethinking/cache')
os.environ.setdefault('PYSTAN_MODEL_PATH', '/home/gianluca/git/rethinking/models')

from rethinking import StanCache
from rethinking.plotting import summaryplot

# Configure Bokeh so that subsequent `show(...)` calls render inline in the
# notebook instead of writing to a standalone HTML file.
output_notebook()

In [2]:
# Folder holding the raw datasets, relative to the notebook's working directory.
DATA_FOLDER = '../data'

# File name of the divorce dataset, and its full path inside DATA_FOLDER.
DIVORCE_FILENAME = 'WaffleDivorce.csv'
DIVORCE_FULLPATH = os.path.join(DATA_FOLDER, DIVORCE_FILENAME)

In [3]:
# Read the divorce dataset into a DataFrame, then preview its first five
# rows as the cell output.
divorce = pd.read_csv(filepath_or_buffer=DIVORCE_FULLPATH)
divorce.head(n=5)

Unnamed: 0,Location,Loc,Population,MedianAgeMarriage,Marriage,Marriage SE,Divorce,Divorce SE,WaffleHouses,South,Slaves1860,Population1860,PropSlaves1860
0,Alabama,AL,4.78,25.3,20.2,1.27,12.7,0.79,128,1,435080,964201,0.45
1,Alaska,AK,0.71,25.2,26.0,2.93,12.5,2.05,0,0,0,0,0.0
2,Arizona,AZ,6.33,25.8,20.3,0.98,10.8,0.74,18,0,0,0,0.0
3,Arkansas,AR,2.92,24.3,26.4,1.7,13.5,1.22,41,1,111115,435450,0.26
4,California,CA,37.25,26.8,19.1,0.39,8.0,0.24,0,0,0,379994,0.0


$$
\begin{align*}
    D_i &\sim Normal(\mu_i, \sigma) \\
    \mu_i &= \alpha + \beta_A A_i \\
    \alpha &\sim Normal(10, 10) \\
    \beta_A &\sim Normal(0, 1) \\
    \sigma &\sim Uniform(0, 10)
\end{align*}
$$

In [4]:
# Build the model named 'm5_1' through the project's StanCache helper
# (presumably it reuses a cached compilation when one exists -- confirm in
# rethinking.StanCache), then show its Stan source as the cell output.
m5_1_cache = StanCache(model_name='m5_1')
m5_1 = m5_1_cache.compile()
m5_1.model_code

data {
    int<lower=0> N;
    vector[N] X;
    vector<lower=0, upper=100>[N] divorce_rate;
}
parameters {
    real alpha;
    real beta;
    real<lower=0> sigma;
}
transformed parameters {
    vector<lower=-100, upper=100>[N] mu;

    mu = alpha + X * beta;
}
model {
    alpha ~ normal(10, 10);
    beta ~ normal(0, 1);
    sigma ~ uniform(0, 10);
    divorce_rate ~ normal(mu, sigma);
}



In [5]:
# Standardise median age at marriage (centre on the mean, divide by the
# standard deviation) and store it as a new column.
median_age = divorce.loc[:, ['MedianAgeMarriage']]
divorce['MedianAgeMarriage_s'] = scale(X=median_age, with_mean=True, with_std=True)

# Divorce column divided by 100. NOTE(review): no other cell visible in this
# notebook reads `Divorce_pct`.
divorce['Divorce_pct'] = divorce.loc[:, 'Divorce'] / 100

In [6]:
# Data for model m5_1. Only the entries declared in the model's `data`
# block are passed: N (number of rows), X (standardised median age at
# marriage) and divorce_rate (the outcome).
divorce_data = dict(
    N=divorce.shape[0],
    X=divorce.MedianAgeMarriage_s,
    divorce_rate=divorce.Divorce
)

# A fixed seed makes the posterior draws reproducible across
# "Restart & Run All".
fit5_1 = m5_1.sampling(data=divorce_data, iter=500, seed=42)
# stansummary() returns a pre-formatted multi-line string, hence print().
print(fit5_1.stansummary(pars=['alpha', 'beta', 'sigma']))

Inference for Stan model: anon_model_069dd64a1ee6b8cdacde859166ae13ef.
4 chains, each with iter=500; warmup=250; thin=1; 
post-warmup draws per chain=250, total post-warmup draws=1000.

        mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha   9.68  7.2e-3   0.23   9.25   9.53   9.68   9.84  10.12    991    1.0
beta   -1.03  7.1e-3   0.19  -1.41  -1.16  -1.04  -0.91  -0.65    715    1.0
sigma   1.51  5.2e-3   0.15   1.23    1.4   1.49    1.6   1.88    879    1.0

Samples were drawn using NUTS at Sun Mar  4 12:48:40 2018.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).


  elif np.issubdtype(np.asarray(v).dtype, float):


In [7]:
# Plot alpha, beta and sigma from the m5_1 fit on median age at marriage,
# using the project's `summaryplot` helper (presumably a posterior summary
# plot -- see rethinking.plotting).
summaryplot(fit5_1, pars=['alpha', 'beta', 'sigma'])

In [9]:
# Standardise the marriage rate (centre on the mean, divide by the standard
# deviation) and store it as a new column.
marriage_rate = divorce.loc[:, ['Marriage']]
divorce['Marriage_s'] = scale(X=marriage_rate, with_mean=True, with_std=True)

In [10]:
# Data for model m5_1, this time with the standardised marriage rate as the
# single predictor. Only the entries declared in the model's `data` block
# are passed: N, X and divorce_rate.
divorce2_data = dict(
    N=divorce.shape[0],
    X=divorce.Marriage_s,
    divorce_rate=divorce.Divorce
)

# A fixed seed makes the posterior draws reproducible across
# "Restart & Run All".
fit5_2 = m5_1.sampling(data=divorce2_data, iter=500, seed=42)
# stansummary() returns a pre-formatted multi-line string, hence print().
print(fit5_2.stansummary(pars=['alpha', 'beta', 'sigma']))

  elif np.issubdtype(np.asarray(v).dtype, float):


Inference for Stan model: anon_model_069dd64a1ee6b8cdacde859166ae13ef.
4 chains, each with iter=500; warmup=250; thin=1; 
post-warmup draws per chain=250, total post-warmup draws=1000.

        mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha   9.69  9.5e-3   0.26   9.18   9.51    9.7   9.87  10.16    720    1.0
beta    0.63  8.4e-3   0.24   0.16   0.49   0.63   0.79   1.09    792    1.0
sigma   1.75  6.9e-3   0.18   1.44   1.62   1.74   1.86   2.14    710    1.0

Samples were drawn using NUTS at Sun Mar  4 12:59:02 2018.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(results, f)


# Multivariate linear model


$$
\begin{align}
    D_i &\sim Normal(\mu_i, \sigma) \\
    \mu_i &= \alpha + \sum_{j=1}^{n} \beta_j x_{ji} \\
    \alpha &\sim Normal(10, 10) \\
    \beta_R &\sim Normal(0, 1) \\
    \beta_A &\sim Normal(0, 1) \\
    \sigma &\sim Uniform(0, 10)
\end{align}
$$

In [11]:
# Build the model named 'm5_3' through the project's StanCache helper
# (presumably it reuses a cached compilation when one exists -- confirm in
# rethinking.StanCache), then show its Stan source as the cell output.
m5_3_cache = StanCache(model_name='m5_3')
m5_3 = m5_3_cache.compile()
m5_3.model_code

data {
    int<lower=0> N;
    int<lower=0> K;
    matrix[N, K] X;
    vector<lower=0, upper=100>[N] divorce_rate;
}
parameters {
    real alpha;
    vector[K] beta;
    real<lower=0> sigma;
}
transformed parameters {
    vector<lower=-100, upper=100>[N] mu;

    mu = alpha + X * beta;
}
model {
    alpha ~ normal(10, 10);
    beta ~ normal(0, 1);
    sigma ~ uniform(0, 10);
    divorce_rate ~ normal(mu, sigma);
}



In [12]:
# Design matrix for model m5_3: one column per standardised predictor.
predictors = divorce.loc[:, ['Marriage_s', 'MedianAgeMarriage_s']]

# K is taken from the design matrix itself so that it cannot drift out of
# sync with the column list above.
divorce_multi_data = dict(
    N=divorce.shape[0],
    K=predictors.shape[1],
    X=predictors,
    divorce_rate=divorce.Divorce
)

# A fixed seed makes the posterior draws reproducible across
# "Restart & Run All".
fit5_3 = m5_3.sampling(data=divorce_multi_data, iter=500, seed=42)
# stansummary() returns a pre-formatted multi-line string, hence print().
print(fit5_3.stansummary(pars=['alpha', 'beta', 'sigma']))

Inference for Stan model: anon_model_623c071ba8c404bae8e2868021df5aa8.
4 chains, each with iter=500; warmup=250; thin=1; 
post-warmup draws per chain=250, total post-warmup draws=1000.

          mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha     9.69  7.5e-3   0.22   9.25   9.56   9.69   9.84  10.11    844    1.0
beta[0]  -0.14    0.02    0.3  -0.72  -0.34  -0.15   0.06   0.46    340   1.02
beta[1]  -1.13    0.02   0.31  -1.71  -1.34  -1.14  -0.93   -0.5    316   1.02
sigma     1.53  5.8e-3   0.16   1.25   1.42   1.51   1.62   1.88    811    1.0

Samples were drawn using NUTS at Sun Mar  4 12:44:15 2018.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).


  elif np.issubdtype(np.asarray(v).dtype, float):


In [17]:
# Plot alpha, the beta vector and sigma from the multivariate m5_3 fit,
# using the project's `summaryplot` helper (presumably a posterior summary
# plot -- see rethinking.plotting).
summaryplot(fit5_3, pars=['alpha', 'beta', 'sigma'])

# Predictor residual plots

In [21]:
# NOTE(review): `figure`, `output_notebook` and `show` are already imported,
# and `output_notebook()` is already called, in the notebook's first cell.
# On a top-to-bottom run this cell is redundant.
from bokeh.plotting import figure, output_notebook, show

# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [57]:
import numpy as np

# Posterior draws of the multivariate model. The original referenced `fit3`,
# which is not defined anywhere in the notebook and raised NameError on a
# fresh kernel; `fit5_3` is the fit of model m5_3 produced above.
samples = fit5_3.extract(permuted=True)

# Mean and standard deviation of the posterior draws for each listed
# parameter; after the loop `par_mean` / `par_std` hold the values for the
# last parameter in the list.
for par in ['alpha']:
    par_samples = samples[par]

    par_mean, par_std = np.mean(par_samples), np.std(par_samples)

In [29]:
from bokeh.plotting import figure, show, output_file

# Category labels for the y axis and the value plotted for each of them.
factors = ["a", "b", "c", "d", "e", "f", "g", "h"]
x = [50, 40, 65, 10, 25, 37, 80, 60]

# Tool-less figure with a categorical y range and a fixed 0-100 x range.
dot = figure(
    title="Categorical Dot Plot",
    tools="",
    toolbar_location=None,
    y_range=factors,
    x_range=[0, 100],
)

# Horizontal stem from x=0 to each value ...
dot.segment(
    x0=0, y0=factors, x1=x, y1=factors,
    line_width=2, line_color="green",
)
# ... capped by a marker at the value itself.
dot.circle(
    x, factors,
    size=15, fill_color="orange", line_color="green", line_width=3,
)

show(dot)

