In [1]:
import numpy as np
import pandas as pd

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
bpath = '~/source/rethinking/'

In [4]:
df = pd.read_csv(bpath + 'data/Howell1.csv', sep=';')

In [5]:
df.head()

Unnamed: 0,height,weight,age,male
0,151.765,47.825606,63.0,1
1,139.7,36.485807,63.0,0
2,136.525,31.864838,65.0,0
3,156.845,53.041914,41.0,1
4,145.415,41.276872,51.0,0


In [6]:
df_a = df[df['age'] >= 18].copy()

In [7]:
xbar = np.mean(df_a['weight'])

In [8]:
df_a['centred_weight'] = df_a['weight'] - xbar

In [9]:
import stan

In [10]:
model_code = """
data {
  int<lower=0> N;
  vector[N] x;
  vector[N] y;
}
parameters {
  real alpha;
  real beta;
  real<lower=0, upper=50> sigma;
}
model {
  y ~ normal(alpha + beta * x, sigma);
  alpha ~ normal(178, 20);
  beta ~ normal(0, 1);
}
"""

In [11]:
model_data = {"N": len(df_a),
                "x": df_a['weight'].to_numpy(),
                "y": df_a['height'].to_numpy()}


In [12]:
posterior = stan.build(model_code, data=model_data, random_seed=123)

Building...

Building: found in cache, done.Messages from stanc:
  The parameter sigma has no priors.
  Your Stan program has a parameter sigma with a lower and upper bound in its
  declaration. These hard constraints are not recommended, for two reasons:
  (a) Except when there are logical or physical constraints, it is very
  unusual for you to be sure that a parameter will fall inside a specified
  range, and (b) The infinite gradient induced by a hard constraint can cause
  difficulties for Stan's sampling algorithm. As a consequence, we recommend
  soft constraints rather than hard constraints; for example, instead of
  constraining an elasticity parameter to fall between 0, and 1, leave it
  unconstrained and give it a normal(0.5,0.5) prior distribution.
  Argument 178 suggests there may be parameters that are not unit scale;
  consider rescaling with a multiplier (see manual section 22.12).
  Argument 20 suggests there may be parameters that are not unit scale;
  consider rescal

In [13]:
fit = posterior.sample(num_chains=4, num_samples=1000)

Sampling:   0%
Sampling:   1% (100/8000)
Sampling:   2% (200/8000)
Sampling:   3% (201/8000)
Sampling:   4% (301/8000)
Sampling:   5% (401/8000)
Sampling:   6% (501/8000)
Sampling:   8% (600/8000)
Sampling:   9% (700/8000)
Sampling:  10% (800/8000)
Sampling:  10% (800/8000)
Sampling:  11% (900/8000)
Sampling:  12% (1000/8000)
Sampling:  14% (1100/8000)
Sampling:  15% (1200/8000)
Sampling:  16% (1300/8000)
Sampling:  18% (1400/8000)
Sampling:  21% (1700/8000)
Sampling:  24% (1900/8000)
Sampling:  25% (2000/8000)
Sampling:  28% (2200/8000)
Sampling:  30% (2400/8000)
Sampling:  49% (3900/8000)
Sampling:  69% (5500/8000)
Sampling:  86% (6900/8000)
Sampling: 100% (8000/8000)
Sampling: 100% (8000/8000)
Sampling: 100% (8000/8000), done.
Messages received during sampling:
  Gradient evaluation took 0.000335 seconds
  1000 transitions using 10 leapfrog steps per transition would take 3.35 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000113 seconds
  1000 transit

In [14]:
preds = fit['alpha'] + fit['beta'] * 65 + np.random.normal(0, fit['sigma'])

In [15]:
def predict_height(weight):
    height_preds = fit['alpha'] + fit['beta'] * weight + np.random.normal(0, fit['sigma'])
    mean_height = np.mean(height_preds)
    height_interval = np.quantile(height_preds, [0.05, 0.95])

    return mean_height, height_interval[0], height_interval[1]

In [16]:
predict_height(45)

(154.55821384903706, 146.13030571976185, 162.8655278607931)

In [17]:
results = pd.DataFrame()

In [18]:
results['weight'] = [45, 40, 65, 31, 53]

In [19]:
results['expected'], results['lower'], results['upper'] = \
    zip(*results['weight'].map(predict_height))

In [20]:
results

Unnamed: 0,weight,expected,lower,upper
0,45,154.607845,146.098309,162.964043
1,40,150.115766,141.793166,158.521893
2,65,172.401891,164.021841,180.781648
3,31,142.293703,133.712671,150.834229
4,53,161.712176,153.345335,169.861623
