In [43]:
import os

import pandas as pd
from pystan import StanModel
from sklearn.preprocessing import scale

In [44]:
# Directory that holds the input datasets (a relative path, so it is resolved
# against the notebook's working directory).
DATA_FOLDER = '../data'
# Full path of the WaffleDivorce CSV file read by the loading cell.
DIVORCE_FULLPATH = os.path.join(DATA_FOLDER, 'WaffleDivorce.csv')

In [45]:
# Read the divorce dataset into a DataFrame and preview its first rows.
divorce = pd.read_csv(DIVORCE_FULLPATH)
divorce.head()

Unnamed: 0,Location,Loc,Population,MedianAgeMarriage,Marriage,Marriage SE,Divorce,Divorce SE,WaffleHouses,South,Slaves1860,Population1860,PropSlaves1860
0,Alabama,AL,4.78,25.3,20.2,1.27,12.7,0.79,128,1,435080,964201,0.45
1,Alaska,AK,0.71,25.2,26.0,2.93,12.5,2.05,0,0,0,0,0.0
2,Arizona,AZ,6.33,25.8,20.3,0.98,10.8,0.74,18,0,0,0,0.0
3,Arkansas,AR,2.92,24.3,26.4,1.7,13.5,1.22,41,1,111115,435450,0.26
4,California,CA,37.25,26.8,19.1,0.39,8.0,0.24,0,0,0,379994,0.0


In [53]:
# Stan program: Gaussian linear regression of the divorce rate on a single
# predictor X, with mean mu = alpha + beta * X and residual scale sigma.
divorce_code = """
data {
    int<lower=0> N;                               // number of observations
    vector[N] X;                                  // predictor, one value per observation
    vector<lower=0, upper=100>[N] divorce_rate;   // outcome, validated to lie in [0, 100]
}
parameters {
    real alpha;
    real beta;
    // The declared bounds must match the support of the uniform(0, 10) prior
    // below. With only lower=0 the sampler could propose sigma > 10, where
    // the prior density is zero.
    real<lower=0, upper=10> sigma;
}
transformed parameters {
    // Bounds on a transformed parameter are validated, not enforced by a
    // transform: a draw whose mu leaves [0, 100] is rejected.
    vector<lower=0, upper=100>[N] mu;

    mu = alpha + X * beta;
}
model {
    alpha ~ normal(10, 10);
    beta ~ normal(0, 1);
    sigma ~ uniform(0, 10);
    divorce_rate ~ normal(mu, sigma);
}"""

# Compile once; the compiled model is reused for every single-predictor fit.
sm = StanModel(model_code=divorce_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_5a2125a5deb1fc80b367d8bed6609b38 NOW.


In [54]:
# Standardise the median age at marriage (centre on the mean, divide by the
# standard deviation) and store it as a new column.
divorce['MedianAgeMarriage_s'] = scale(
    divorce[['MedianAgeMarriage']],
    with_mean=True,
    with_std=True,
)
# Divorce rate divided by 100; not referenced by the cells visible here.
divorce['Divorce_pct'] = divorce['Divorce'].div(100)

In [56]:
# Data for the single-predictor model: divorce rate regressed on the
# standardised median age at marriage. Only the variables declared in the
# model's data block (N, X, divorce_rate) are passed; that model declares no K.
divorce_data = dict(
    N=divorce.shape[0],
    X=divorce.MedianAgeMarriage_s,
    divorce_rate=divorce.Divorce
)

# A fixed seed makes the posterior draws reproducible across
# "Restart Kernel -> Run All".
fit = sm.sampling(data=divorce_data, seed=42)
fit

Inference for Stan model: anon_model_5a2125a5deb1fc80b367d8bed6609b38.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

         mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha    9.69  3.9e-3   0.22   9.26   9.54   9.69   9.84  10.12   3049    1.0
beta    -1.03  3.7e-3   0.22  -1.45  -1.18  -1.03  -0.88   -0.6   3488    1.0
sigma    1.52  2.7e-3   0.16   1.23    1.4   1.51   1.62   1.88   3788    1.0
mu[0]   10.32  4.4e-3   0.26   9.82  10.15  10.32  10.49  10.83   3375    1.0
mu[1]   10.41  4.6e-3   0.27   9.89  10.23  10.41  10.59  10.94   3383    1.0
mu[2]     9.9  4.1e-3   0.22   9.47   9.75    9.9  10.05  10.34   2984    1.0
mu[3]   11.16  6.5e-3   0.38  10.41   10.9  11.16  11.42   11.9   3438    1.0
mu[4]    9.06  4.3e-3   0.25   8.57   8.89   9.06   9.24   9.56   3360    1.0
mu[5]    9.99  4.2e-3   0.23   9.54   9.84   9.98  10.14  10.43   2971    1.0
mu[6]    8.39  5.9e-3   0.34   

In [57]:
# Standardise the marriage rate (centre on the mean, divide by the standard
# deviation) and store it as a new column.
divorce['Marriage_s'] = scale(
    divorce[['Marriage']],
    with_mean=True,
    with_std=True,
)

In [58]:
# Data for the single-predictor model: divorce rate regressed on the
# standardised marriage rate. Only the variables declared in the model's
# data block (N, X, divorce_rate) are passed; that model declares no K.
divorce2_data = dict(
    N=divorce.shape[0],
    X=divorce.Marriage_s,
    divorce_rate=divorce.Divorce
)

# A fixed seed makes the posterior draws reproducible across
# "Restart Kernel -> Run All".
fit2 = sm.sampling(data=divorce2_data, seed=42)
fit2

Inference for Stan model: anon_model_5a2125a5deb1fc80b367d8bed6609b38.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

         mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha    9.68  4.3e-3   0.25    9.2   9.52   9.69   9.85  10.17   3338    1.0
beta     0.63  3.8e-3   0.24   0.16   0.47   0.64   0.79    1.1   4000    1.0
sigma    1.75  3.1e-3   0.18   1.44   1.62   1.74   1.87   2.15   3386    1.0
mu[0]     9.7  4.3e-3   0.25   9.22   9.53    9.7   9.86  10.18   3338    1.0
mu[1]   10.68  7.4e-3   0.45   9.79  10.38  10.67  10.99  11.57   3785    1.0
mu[2]    9.71  4.3e-3   0.25   9.23   9.55   9.72   9.88   10.2   3339    1.0
mu[3]   10.74  7.7e-3   0.48   9.82  10.43  10.73  11.07  11.68   3796    1.0
mu[4]    9.51  4.3e-3   0.25   9.01   9.34   9.52   9.68  10.01   3533    1.0
mu[5]   10.25  5.5e-3   0.33    9.6  10.03  10.25  10.48  10.91   3680    1.0
mu[6]    9.18  5.1e-3   0.31   

# Multivariate linear model


$$
\begin{align}
    D_i &\sim Normal(\mu_i, \sigma) \\
    \mu_i &= \alpha + \sum_{j=1}^{n} \beta_j x_{ji} \\
    \alpha &\sim Normal(10, 10) \\
    \beta_R &\sim Normal(0, 1) \\
    \beta_A &\sim Normal(0, 1) \\
    \sigma &\sim Uniform(0, 10)
\end{align}
$$

In [62]:
# Stan program: Gaussian linear regression of the divorce rate on K
# predictors, with mean mu = alpha + X * beta and residual scale sigma.
divorce_multi_code = """
data {
    int<lower=0> N;                               // number of observations
    int<lower=0> K;                               // number of predictors (columns of X)
    matrix[N, K] X;                               // design matrix, one row per observation
    vector<lower=0, upper=100>[N] divorce_rate;   // outcome, validated to lie in [0, 100]
}
parameters {
    real alpha;
    vector[K] beta;                               // one slope per column of X
    // The declared bounds must match the support of the uniform(0, 10) prior
    // below. With only lower=0 the sampler could propose sigma > 10, where
    // the prior density is zero.
    real<lower=0, upper=10> sigma;
}
transformed parameters {
    // Bounds on a transformed parameter are validated, not enforced by a
    // transform: a draw whose mu leaves [0, 100] is rejected.
    vector<lower=0, upper=100>[N] mu;

    mu = alpha + X * beta;
}
model {
    alpha ~ normal(10, 10);
    beta ~ normal(0, 1);       // same prior applied to every slope
    sigma ~ uniform(0, 10);
    divorce_rate ~ normal(mu, sigma);
}"""

# Compile once; the compiled model can be reused for any number of predictors.
sm2 = StanModel(model_code=divorce_multi_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_923c83bda6fff13675df80db77ee0171 NOW.


In [63]:
# Predictor columns of the design matrix. Their order fixes which element of
# beta belongs to which predictor (first column -> first element, and so on).
multi_predictors = ['Marriage_s', 'MedianAgeMarriage_s']

divorce_multi_data = dict(
    N=divorce.shape[0],
    # Derived from the column list so K cannot drift out of sync with X.
    K=len(multi_predictors),
    X=divorce.loc[:, multi_predictors],
    divorce_rate=divorce.Divorce
)

# A fixed seed makes the posterior draws reproducible across
# "Restart Kernel -> Run All".
fit3 = sm2.sampling(data=divorce_multi_data, seed=42)
fit3

Inference for Stan model: anon_model_923c83bda6fff13675df80db77ee0171.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

          mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha     9.69  3.7e-3   0.21   9.26   9.55   9.69   9.83  10.12   3439    1.0
beta[0]  -0.13  5.6e-3   0.29  -0.72  -0.32  -0.12   0.07   0.44   2783    1.0
beta[1]  -1.12  5.6e-3   0.29  -1.68  -1.31  -1.12  -0.92  -0.56   2740    1.0
sigma     1.53  2.7e-3   0.17   1.24   1.41   1.52   1.63   1.91   4000    1.0
mu[0]    10.37  4.8e-3   0.28   9.82  10.19  10.37  10.56  10.89   3228    1.0
mu[1]    10.27  7.0e-3   0.41   9.45   9.99  10.28  10.54  11.07   3514    1.0
mu[2]     9.91  3.8e-3   0.22   9.47   9.77   9.92  10.06  10.35   3448    1.0
mu[3]    11.07  6.7e-3   0.42  10.22  10.79  11.07  11.35   11.9   4000    1.0
mu[4]     9.04  4.4e-3   0.25   8.54   8.88   9.04   9.22   9.55   3339    1.0
mu[5]      9.9  5.3e-

# Predictor residual plots