In [1]:
import itertools
import pandas as pd
import numpy as np
import toytree
import toyplot
import arviz as az
import pymc3 as pm
from pymc3.distributions.dist_math import normal_lccdf, normal_lcdf

### Tree and dataframe setup

In [2]:
# generate a random tree with NSPECIES tips; the fixed seed makes the
# topology repeatable, and the root height is rescaled to 1.0 so that
# distances measured on the tree are relative to tree height
# (NOTE(review): bdtree presumably simulates a birth-death tree -- confirm)
NSPECIES = 80
TREE = toytree.rtree.bdtree(
    ntips=NSPECIES,
    seed=444,
).mod.node_scale_root_height(1.0)

# node idxs that delimit several distinct clades on this tree
# (hand-picked for this particular tree; presumably they must be re-chosen
# if NSPECIES or the seed changes -- verify)
CLADES = [152, 153, 154, 155]

# draw and color the four major clades: the i-th entry of CLADES is mapped
# to the i-th toytree color
TREE.draw(
    layout='d', 
    width=500,
    tip_labels=False,
    edge_colors=TREE.get_edge_values_mapped({
        j: toytree.colors[i] for i, j in enumerate(CLADES)
    }),
    scalebar=True,
);

In [3]:
# Build the per-species group index (gidx): entry k holds the position in
# CLADES of the clade that contains the k-th tip label of TREE.
crown_dict = {i: TREE.get_tip_labels(i) for i in CLADES}
tip_order = TREE.get_tip_labels()

# tips that fall in none of the clades keep the default group 0; when a tip
# is listed under several clades, the clade that comes last in CLADES wins
gidx = np.zeros(TREE.ntips, dtype=int)
for cidx, members in enumerate(crown_dict.values()):
    gidx[np.isin(tip_order, members)] = cidx
gidx

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [5]:
# True parameter values used to simulate the data, as (mean, std) pairs
𝛼_mean, 𝛼_std = 0.05, 0.02     # intercept
𝛽_mean, 𝛽_std = 1.5, 0.05      # baseline rate
𝜓_mean, 𝜓_std = 0.0, 0.33      # species effect without clade structure
𝜎_std = 0.05                   # std of the observation error

# 4 different clade effects on rate of RI (used for partial-pooling data),
# one (mean, std) pair per clade index
𝜓_0_mean, 𝜓_0_std = 1.0, 0.1
𝜓_1_mean, 𝜓_1_std = 0.5, 0.05
𝜓_2_mean, 𝜓_2_std = -0.5, 0.05
𝜓_3_mean, 𝜓_3_std = -1.0, 0.1

In [6]:
# Seed NumPy's global RNG so the simulated values drawn with np.random in
# this cell and the cells below can be reproduced after a kernel restart.
np.random.seed(12345)

# per-clade (mean, std) of the clade-structured species effect; position k
# holds the values for group index k
psi_x_means = np.array([𝜓_0_mean, 𝜓_1_mean, 𝜓_2_mean, 𝜓_3_mean])
psi_x_stds = np.array([𝜓_0_std, 𝜓_1_std, 𝜓_2_std, 𝜓_3_std])

# species dataframe: one row per tip of TREE
SPECIES_DATA = pd.DataFrame({
    "gidx": gidx,
    "b": np.random.normal(𝛽_mean, 𝛽_std, TREE.ntips),
    "psi": np.random.normal(𝜓_mean, 𝜓_std, TREE.ntips),
    # Draw every species' effect from its own clade's distribution by
    # indexing the parameter arrays with gidx. Each row therefore matches
    # its "gidx" entry even when gidx is not sorted into contiguous blocks.
    "psi_x": np.random.normal(psi_x_means[gidx], psi_x_stds[gidx]),
})
SPECIES_DATA.head()

Unnamed: 0,gidx,b,psi,psi_x
0,0,1.53641,0.229932,1.115651
1,0,1.552872,0.026556,0.90466
2,0,1.48789,-0.321123,1.075748
3,0,1.528174,-0.086356,1.094559
4,0,1.503244,0.444541,1.042192


### Generate crossing data

In [7]:
def get_dist(tree, idx0, idx1):
    """
    Return the distance between two nodes of `tree`.

    `idx0` and `idx1` are keys of `tree.idx_dict`; the looked-up nodes are
    handed to `tree.treenode.get_distance` and its result is returned as is.
    """
    node_a = tree.idx_dict[idx0]
    node_b = tree.idx_dict[idx1]
    return tree.treenode.get_distance(node_a, node_b)

In [8]:
# enumerate every unordered pair of species indices
a, b = zip(*itertools.combinations(range(NSPECIES), 2))

# one row per species pair; "dist" is half of the tree distance between them
DATA = pd.DataFrame({
    "sidx0": a,
    "sidx1": b,
    "dist": [get_dist(TREE, i, j) / 2 for i, j in zip(a, b)],
})

# per-pair baseline rate
DATA['b'] = np.random.normal(𝛽_mean, 𝛽_std, len(DATA))

# rate used for the "unpooled" data: baseline plus both species' effects
DATA['velo'] = (
    DATA['b']
    + SPECIES_DATA.loc[DATA['sidx0'], 'psi'].to_numpy()
    + SPECIES_DATA.loc[DATA['sidx1'], 'psi'].to_numpy()
)

# rate used for the "partpooled" data: same, with clade-structured effects
DATA['velo_x'] = (
    DATA['b']
    + SPECIES_DATA.loc[DATA['sidx0'], 'psi_x'].to_numpy()
    + SPECIES_DATA.loc[DATA['sidx1'], 'psi_x'].to_numpy()
)

# per-pair intercept and observation error
DATA['intercept'] = np.random.normal(𝛼_mean, 𝛼_std, len(DATA))
DATA['error'] = np.random.normal(0.0, 𝜎_std, len(DATA))

# RI = error + (intercept + rate * dist) / (1 + rate * dist), once per rate
for ri_col, rate_col in [
    ('RI_pooled', 'b'),
    ('RI_unpooled', 'velo'),
    ('RI_partpooled', 'velo_x'),
]:
    scaled = DATA[rate_col] * DATA['dist']
    DATA[ri_col] = DATA['error'] + (DATA['intercept'] + scaled) / (1 + scaled)

In [9]:
# clamp the three simulated RI columns into [0, 1]: anything below 0 becomes
# 0 and anything above 1 becomes 1
ri_columns = ['RI_pooled', 'RI_unpooled', 'RI_partpooled']
DATA[ri_columns] = DATA[ri_columns].clip(lower=0, upper=1)

In [10]:
# show the full table of species pairs with all simulated columns
DATA

Unnamed: 0,sidx0,sidx1,dist,b,velo,velo_x,intercept,error,RI_pooled,RI_unpooled,RI_partpooled
0,0,1,0.014093,1.569123,1.825611,3.589434,0.058843,0.003385,0.082589,0.085834,0.107543
1,0,2,0.093817,1.475324,1.384133,3.666723,0.060876,-0.006604,0.168453,0.162206,0.294644
2,0,3,0.232354,1.524398,1.667974,3.734608,0.063433,-0.013034,0.295365,0.311992,0.485525
3,0,4,0.232354,1.568007,2.242480,3.725850,0.080916,0.064646,0.390995,0.460403,0.572029
4,0,5,0.234463,1.587113,2.505924,3.761738,0.074592,-0.058734,0.266829,0.358349,0.449548
...,...,...,...,...,...,...,...,...,...,...,...
3155,76,78,0.057491,1.515498,2.497094,-0.487928,0.030432,-0.105419,0.002719,0.046731,0.000000
3156,76,79,0.194995,1.430819,1.861327,-0.579221,0.045074,-0.024966,0.228417,0.274402,0.000000
3157,77,78,0.057491,1.455528,2.218625,-0.729714,0.018721,0.122620,0.217113,0.252345,0.098371
3158,77,79,0.194995,1.432789,1.644798,-0.759066,0.117312,0.051167,0.361236,0.382832,0.015131


In [11]:
# number of species pairs that are subsampled for model fitting
NSAMPLES = 1000

# Draw NSAMPLES random rows of DATA and renumber them from 0. The fresh
# 0..N-1 index matters: the model functions below use these row labels to
# index plain numpy arrays.
SAMPLE = DATA.sample(n=NSAMPLES).reset_index(drop=True)
SAMPLE.head()

Unnamed: 0,sidx0,sidx1,dist,b,velo,velo_x,intercept,error,RI_pooled,RI_unpooled,RI_partpooled
0,1,26,0.523417,1.570259,1.690114,3.555288,0.037579,-0.025917,0.445831,0.463415,0.637677
1,33,53,0.555803,1.505988,0.773893,3.082311,0.001843,-0.02069,0.435957,0.281362,0.611415
2,34,63,1.0,1.564016,1.02706,1.963146,0.06547,-0.01529,0.620231,0.523682,0.669325
3,13,67,1.0,1.458742,1.518098,1.991522,0.042036,0.031772,0.642156,0.65134,0.711545
4,18,45,0.541446,1.532731,1.904726,3.538284,0.025705,0.091338,0.558905,0.611699,0.757194


### Visualize data

In [12]:
# canvas holding a single set of labelled cartesian axes
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.cartesian(
    label="Pooled data",
    xlabel="Genetic dist.",
    ylabel="Reprod. incomp.",
)

# one semi-transparent marker per sampled pair, colored by its RI value
axes.scatterplot(
    SAMPLE['dist'],
    SAMPLE['RI_pooled'],
    size=8,
    opacity=0.25,
    color=SAMPLE['RI_pooled'].values,
    mstyle={"stroke": "black"},
);

# show tick marks on both axes
for axis in (axes.x, axes.y):
    axis.ticks.show = True

In [13]:
# canvas holding a single set of labelled cartesian axes
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.cartesian(
    label="Unpooled data",
    xlabel="Genetic dist.",
    ylabel="Reprod. incomp.",
)

# one semi-transparent marker per sampled pair, colored by its RI value
axes.scatterplot(
    SAMPLE['dist'],
    SAMPLE['RI_unpooled'],
    size=8,
    opacity=0.25,
    color=SAMPLE['RI_unpooled'].values,
    mstyle={"stroke": "black"},
);

# show tick marks on both axes
for axis in (axes.x, axes.y):
    axis.ticks.show = True

In [14]:
# canvas holding a single set of labelled cartesian axes
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.cartesian(
    label="Partpooled data",
    xlabel="Genetic dist.",
    ylabel="Reprod. incomp.",
)

# one semi-transparent marker per sampled pair, colored by its RI value
axes.scatterplot(
    SAMPLE['dist'],
    SAMPLE['RI_partpooled'],
    size=8,
    opacity=0.25,
    color=SAMPLE['RI_partpooled'].values,
    mstyle={"stroke": "black"},
);

# show tick marks on both axes
for axis in (axes.x, axes.y):
    axis.ticks.show = True

### Functions to plot results

In [16]:
def toytrace(trace, var_names, titles):
    """
    Plot the marginal posterior of each requested variable with toyplot.

    One cartesian axis is stacked per entry of `var_names`. Within an axis,
    one 100-bin histogram outline is drawn per element of that variable.

    Parameters
    ----------
    trace:
        Sampling result exposing `get_values(name)`; the first axis of the
        returned array is treated as the draws.
    var_names: sequence of str
        Names of the variables to plot, one axis each.
    titles: sequence of str
        Display labels, in the same order as `var_names`.

    Returns
    -------
    (canvas, axes): the toyplot canvas and the list of axes, one per variable.
    """
    nvars = len(var_names)

    # setup canvas: 200px of height per variable
    canvas = toyplot.Canvas(width=500, height=200 * nvars)

    # store axes
    axes = []

    # iter over params
    for pidx, param in enumerate(var_names):

        # get param posterior
        posterior = np.asarray(trace.get_values(param))

        # a scalar variable comes back as a 1-D array of draws; give it a
        # singleton element axis so the per-element loop below applies
        if posterior.ndim == 1:
            posterior = posterior[:, np.newaxis]

        # setup axes
        ax = canvas.cartesian(grid=(nvars, 1, pidx))
        ax.y.show = False
        ax.x.spine.style = {"stroke-width": 1.5}
        ax.x.ticks.labels.style = {"font-size": "12px"}
        ax.x.ticks.show = True
        ax.x.label.text = f"param='{titles[pidx]}'"

        # iterate over the elements of the param
        for idx in range(posterior.shape[1]):
            mags, bins = np.histogram(posterior[:, idx], bins=100)
            # plot each count at the center of its bin; using the right
            # bin edges would shift the whole curve by half a bin
            centers = (bins[:-1] + bins[1:]) / 2
            ax.plot(centers, mags, stroke_width=2, opacity=0.6)
        axes.append(ax)
    return canvas, axes

In [17]:
import scipy.stats as stats

def draw_velocity_dists(trace, baseline=0.15):
    """
    Draw one filled gaussian per column of the '𝜓_mean' posterior.

    Each curve uses the posterior mean of '𝜓_mean' as location and the
    posterior mean of '𝜓_std' as scale, and is evaluated on 100 points that
    span the 0.995 interval of that gaussian. Successive curves are lifted
    by `baseline` so that they stack instead of overlapping, and a dashed
    horizontal line marks the floor of each curve.

    Returns the canvas, the axes and the list of fill marks.
    """
    npoints = 100
    canvas = toyplot.Canvas(width=350, height=300)
    axes = canvas.cartesian(xlabel="Relative velocity of reproductive isolation")

    marks = []
    base = 0
    ncurves = trace['𝜓_mean'].shape[1]
    for i in range(ncurves):

        # gaussian parameters: posterior means of location and scale
        loc = trace['𝜓_mean'][:, i].mean()
        scale = trace['𝜓_std'][:, i].mean()

        # x-values covering the 0.995 interval of this gaussian
        lower, upper = stats.norm.interval(0.995, loc, scale)
        points = np.linspace(lower, upper, npoints)
        density = stats.norm.pdf(points, loc=loc, scale=scale)

        # filled curve sitting on the current floor
        marks.append(
            axes.fill(
                points,
                density,
                style={
                    "fill-opacity": 0.45,
                    "stroke": 'black',
                    "stroke-opacity": 1.0,
                    "stroke-width": 1,
                },
                baseline=np.repeat(base, npoints),
            )
        )
        axes.hlines(base, style={"stroke-dasharray": "5,5", 'stroke-width': 1})

        # raise the floor for the next curve
        base += baseline

    axes.y.show = False
    axes.x.ticks.locator = toyplot.locator.Extended(only_inside=True)
    axes.x.ticks.show = True
    return canvas, axes, marks

### Define models

In [22]:
def censored_pooled_regression(x, y, **kwargs):
    """
    Fit a fully pooled censored regression of `y` on `x`.

    The expected response is (𝛼 + 𝛽 * x) / (1 + 𝛽 * x) with normal errors of
    standard deviation 𝜎. Rows with 0 < y < 1 contribute a normal density;
    rows with y <= 0 contribute the normal log-CDF at y and rows with y >= 1
    the normal log-complementary-CDF at y (added as potentials).

    Parameters
    ----------
    x: pandas.Series
        Predictor; must share its index with `y`.
    y: pandas.Series
        Response, censored to the range [0, 1].
    **kwargs:
        Forwarded to `pm.sample`. The returned trace is sliced, so the
        arguments must yield a sliceable trace (the notebook passes
        return_inferencedata=False).

    Returns
    -------
    dict with keys 'model' (the pm.Model), 'trace' (the sliced trace) and
    'stats' (the `pm.summary` table).
    """
    # data pre-processing: split rows into lower-censored, upper-censored
    # and uncensored observations
    lower_censored = y[y <= 0].index
    _x_lc = x[lower_censored].values
    _y_lc = y[lower_censored].values

    upper_censored = y[y >= 1].index
    _x_uc = x[upper_censored].values
    _y_uc = y[upper_censored].values

    uncensored = (y > 0) & (y < 1)
    _x = x[uncensored].values
    _y = y[uncensored].values

    # define model
    with pm.Model() as model:

        # parameters and error
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        𝜎 = pm.HalfNormal('𝜎', 5.0, shape=1)

        # model prediction for the uncensored rows
        ri = (𝛼 + 𝛽 * _x) / (1 + 𝛽 * _x)

        # data likelihood (normal distributed errors)
        pm.Normal("y", mu=ri, sigma=𝜎, observed=_y)

        # density of censored data. Gate on the NUMBER of censored rows:
        # the sum of their index labels is 0 when only row 0 is censored,
        # which would silently drop that observation.
        if len(lower_censored):
            pm.Potential(
                "lower_censored",
                normal_lcdf((𝛼 + 𝛽 * _x_lc) / (1 + 𝛽 * _x_lc), 𝜎, _y_lc),
            )
        if len(upper_censored):
            pm.Potential(
                "upper_censored",
                normal_lccdf((𝛼 + 𝛽 * _x_uc) / (1 + 𝛽 * _x_uc), 𝜎, _y_uc),
            )

        # sample posterior and drop the first 1000 retained draws.
        # NOTE(review): pm.sample discards tuning draws by default, so this
        # slice removes post-tuning draws -- confirm that is intended.
        trace = pm.sample(**kwargs)[1000:]

        # summary table of the retained draws
        summary = pm.summary(trace)

    # organize results
    result_dict = {
        'model': model,
        'trace': trace,
        'stats': summary,
    }
    return result_dict

In [25]:
def censored_unpooled_noncentered_regression(x, y, idx0, idx1, **kwargs):
    """
    Fit a censored regression of `y` on `x` with one rate effect per species.

    The rate of a pair is 𝛽 + 𝜓[idx0] + 𝜓[idx1] and the expected response is
    (𝛼 + rate * x) / (1 + rate * x) with normal errors of standard deviation
    𝜎. The species effects use a non-centered form, 𝜓 = 𝜓_mean + 𝜓_std *
    𝜓_offset, with a single shared 𝜓_mean and 𝜓_std. Rows with y <= 0 or
    y >= 1 enter through the normal log-CDF / log-complementary-CDF.

    NOTE(review): 𝛽 and 𝜓_mean only appear in the likelihood through the sum
    𝛽 + 2 * 𝜓_mean, so the data cannot separate them -- a possible cause of
    poor mixing; confirm whether one of them should be fixed.

    Parameters
    ----------
    x, y: pandas.Series
        Predictor and response (censored to [0, 1]); must share one index.
    idx0, idx1: pandas.Series
        Integer species index of each pair member, on the same index as `y`.
    **kwargs:
        Forwarded to `pm.sample`. The returned trace is sliced, so the
        arguments must yield a sliceable trace (the notebook passes
        return_inferencedata=False).

    Returns
    -------
    dict with keys 'model', 'trace' and 'stats' (the `pm.summary` table).

    The number of species effects is read from the module-level TREE.ntips.
    """
    # data pre-processing: split rows into lower-censored, upper-censored
    # and uncensored observations
    lower_censored = y[y <= 0].index
    _x_lc = x[lower_censored].values
    _y_lc = y[lower_censored].values

    upper_censored = y[y >= 1].index
    _x_uc = x[upper_censored].values
    _y_uc = y[upper_censored].values

    uncensored = (y > 0) & (y < 1)
    _x = x[uncensored].values
    _y = y[uncensored].values

    # define model
    with pm.Model() as model:

        # species indexers for each subset. They are selected by label, in
        # the same way as x and y above, so the rows stay matched even when
        # the input index is not 0..N-1.
        sidx0 = pm.Data("spp_idx0", idx0[uncensored].values)
        sidx1 = pm.Data("spp_idx1", idx1[uncensored].values)
        sidx0_u = pm.Data("sidx0_u", idx0[upper_censored].values)
        sidx1_u = pm.Data("sidx1_u", idx1[upper_censored].values)
        sidx0_l = pm.Data("sidx0_l", idx0[lower_censored].values)
        sidx1_l = pm.Data("sidx1_l", idx1[lower_censored].values)

        # parameters and error
        𝜓_mean = pm.Normal('𝜓_mean', mu=0., sigma=5., shape=1)
        𝜓_std = pm.HalfNormal('𝜓_std', 5., shape=1)
        𝜓_offset = pm.Normal('𝜓_offset', mu=0, sigma=1., shape=TREE.ntips)
        𝜓 = pm.Deterministic('𝜓', 𝜓_mean + 𝜓_std * 𝜓_offset)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝜎 = pm.HalfNormal('𝜎', 5., shape=1)

        # pair-specific rates for the three subsets of rows
        rate = 𝛽 + 𝜓[sidx0] + 𝜓[sidx1]
        rate_l = 𝛽 + 𝜓[sidx0_l] + 𝜓[sidx1_l]
        rate_u = 𝛽 + 𝜓[sidx0_u] + 𝜓[sidx1_u]

        # model prediction for the uncensored rows
        ri = (𝛼 + rate * _x) / (1 + rate * _x)

        # data likelihood (normal distributed errors)
        pm.Normal("y", mu=ri, sigma=𝜎, observed=_y)

        # density of censored data. Gate on the NUMBER of censored rows:
        # the sum of their index labels is 0 when only row 0 is censored,
        # which would silently drop that observation.
        if len(lower_censored):
            pm.Potential(
                "lower_censored",
                normal_lcdf((𝛼 + rate_l * _x_lc) / (1 + rate_l * _x_lc), 𝜎, _y_lc),
            )

        if len(upper_censored):
            pm.Potential(
                "upper_censored",
                normal_lccdf((𝛼 + rate_u * _x_uc) / (1 + rate_u * _x_uc), 𝜎, _y_uc),
            )

        # sample posterior and drop the first 1000 retained draws.
        # NOTE(review): pm.sample discards tuning draws by default, so this
        # slice removes post-tuning draws -- confirm that is intended.
        trace = pm.sample(**kwargs)[1000:]

        # summary table of the retained draws
        summary = pm.summary(trace)

    # organize results
    result_dict = {
        'model': model,
        'trace': trace,
        'stats': summary,
    }
    return result_dict

In [26]:
def censored_partpooled_noncentered_regression(x, y, idx0, idx1, gidx, **kwargs):
    """
    Fit a censored regression of `y` on `x` with species effects that are
    partially pooled within groups (clades).

    The rate of a pair is 𝛽 + 𝜓[idx0] + 𝜓[idx1] and the expected response is
    (𝛼 + rate * x) / (1 + rate * x) with normal errors of standard deviation
    𝜎. Each species effect uses a non-centered form around the mean and std
    of its own group: 𝜓 = 𝜓_mean[gidx] + 𝜓_std[gidx] * 𝜓_offset. Rows with
    y <= 0 or y >= 1 enter through the normal log-CDF / log-complementary-CDF.

    NOTE(review): adding a constant to 𝛽 while subtracting half of it from
    every 𝜓_mean leaves the likelihood unchanged, so only the priors separate
    them -- a possible cause of poor mixing; confirm the parameterization.

    Parameters
    ----------
    x, y: pandas.Series
        Predictor and response (censored to [0, 1]); must share one index.
    idx0, idx1: pandas.Series
        Integer species index of each pair member, on the same index as `y`.
    gidx: array-like of int
        Group index of every species (one entry per species, groups numbered
        from 0). The number of species and of groups are both derived from it.
    **kwargs:
        Forwarded to `pm.sample`. The returned trace is sliced, so the
        arguments must yield a sliceable trace (the notebook passes
        return_inferencedata=False).

    Returns
    -------
    dict with keys 'model', 'trace' and 'stats' (the `pm.summary` table).
    """
    # data pre-processing: split rows into lower-censored, upper-censored
    # and uncensored observations
    lower_censored = y[y <= 0].index
    _x_lc = x[lower_censored].values
    _y_lc = y[lower_censored].values

    upper_censored = y[y >= 1].index
    _x_uc = x[upper_censored].values
    _y_uc = y[upper_censored].values

    uncensored = (y > 0) & (y < 1)
    _x = x[uncensored].values
    _y = y[uncensored].values

    # sizes come from the group index itself rather than from a hard-coded
    # group count or a module-level tree
    group_codes = np.asarray(gidx)
    n_species = len(group_codes)
    n_groups = int(group_codes.max()) + 1

    # define model
    with pm.Model() as model:

        # species indexers for each subset. They are selected by label, in
        # the same way as x and y above, so the rows stay matched even when
        # the input index is not 0..N-1.
        sidx0 = pm.Data("spp_idx0", idx0[uncensored].values)
        sidx1 = pm.Data("spp_idx1", idx1[uncensored].values)
        sidx0_u = pm.Data("sidx0_u", idx0[upper_censored].values)
        sidx1_u = pm.Data("sidx1_u", idx1[upper_censored].values)
        sidx0_l = pm.Data("sidx0_l", idx0[lower_censored].values)
        sidx1_l = pm.Data("sidx1_l", idx1[lower_censored].values)
        group_idx = pm.Data("gidx", group_codes)

        # parameters and error
        𝜓_mean = pm.Normal('𝜓_mean', mu=0., sigma=5., shape=n_groups)
        𝜓_std = pm.HalfNormal('𝜓_std', 5., shape=n_groups)
        𝜓_offset = pm.Normal('𝜓_offset', mu=0, sigma=1., shape=n_species)
        𝜓 = pm.Deterministic('𝜓', 𝜓_mean[group_idx] + 𝜓_std[group_idx] * 𝜓_offset)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝜎 = pm.HalfNormal('𝜎', 5., shape=1)

        # pair-specific rates for the three subsets of rows
        rate = 𝛽 + 𝜓[sidx0] + 𝜓[sidx1]
        rate_l = 𝛽 + 𝜓[sidx0_l] + 𝜓[sidx1_l]
        rate_u = 𝛽 + 𝜓[sidx0_u] + 𝜓[sidx1_u]

        # model prediction for the uncensored rows
        ri = (𝛼 + rate * _x) / (1 + rate * _x)

        # data likelihood (normal distributed errors)
        pm.Normal("y", mu=ri, sigma=𝜎, observed=_y)

        # density of censored data. Gate on the NUMBER of censored rows:
        # the sum of their index labels is 0 when only row 0 is censored,
        # which would silently drop that observation.
        if len(lower_censored):
            pm.Potential(
                "lower_censored",
                normal_lcdf((𝛼 + rate_l * _x_lc) / (1 + rate_l * _x_lc), 𝜎, _y_lc),
            )

        if len(upper_censored):
            pm.Potential(
                "upper_censored",
                normal_lccdf((𝛼 + rate_u * _x_uc) / (1 + rate_u * _x_uc), 𝜎, _y_uc),
            )

        # sample posterior and drop the first 1000 retained draws.
        # NOTE(review): pm.sample discards tuning draws by default, so this
        # slice removes post-tuning draws -- confirm that is intended.
        trace = pm.sample(**kwargs)[1000:]

        # summary table of the retained draws
        summary = pm.summary(trace)

    # organize results
    result_dict = {
        'model': model,
        'trace': trace,
        'stats': summary,
    }
    return result_dict

### Run models

In [27]:
# MCMC sampler kwargs
# Keyword arguments shared by every pm.sample call below. The model
# functions slice the returned trace, hence return_inferencedata=False.
sample_kwargs = {
    "tune": 4000,
    "draws": 4000,
    "target_accept": 0.95,
    "return_inferencedata": False,
    "progressbar": True,
}

In [28]:
# model input: predictor, response, the two species indexers and the
# per-species group index
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_pooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# pooled model: only the first two entries (x and y) are passed on
pooled_sub = censored_pooled_regression(*model_args[:2], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝜎, 𝛽, 𝛼]


Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 46 seconds.


In [29]:
# summary table of the pooled fit
pooled_sub['stats']

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
𝛼[0],0.054,0.01,0.035,0.071,0.0,0.0,4214.0,4193.0,4220.0,4470.0,1.0
𝛽[0],1.473,0.034,1.408,1.535,0.001,0.0,4222.0,4220.0,4231.0,4632.0,1.0
𝜎[0],0.052,0.001,0.05,0.054,0.0,0.0,5753.0,5708.0,5773.0,5008.0,1.0


In [30]:
# model input: predictor, response, the two species indexers and the
# per-species group index
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_unpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# unpooled model: the first four entries (x, y, idx0, idx1) are passed on
unpooled_sub = censored_unpooled_noncentered_regression(*model_args[:4], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝜎, 𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 4405 seconds.
There were 3070 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 1153 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 360 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 3450 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effe

In [31]:
# summary table of the unpooled fit; check the r_hat and ess columns before
# interpreting any of the estimates
unpooled_sub['stats']

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
𝜓_mean[0],-0.795,3.310,-6.404,6.060,1.256,0.928,7.0,7.0,7.0,24.0,1.56
𝜓_offset[0],0.130,0.375,-0.587,0.490,0.185,0.141,4.0,4.0,5.0,24.0,2.21
𝜓_offset[1],0.090,0.709,-0.647,1.295,0.353,0.270,4.0,4.0,5.0,19.0,2.64
𝜓_offset[2],-0.082,0.518,-0.731,0.803,0.256,0.196,4.0,4.0,4.0,15.0,3.50
𝜓_offset[3],0.649,0.403,0.073,1.189,0.196,0.150,4.0,4.0,5.0,29.0,2.49
...,...,...,...,...,...,...,...,...,...,...,...
𝜓[76],-0.853,4.584,-8.095,7.054,2.031,1.527,5.0,5.0,5.0,14.0,2.07
𝜓[77],-1.651,5.833,-12.922,6.612,2.695,2.039,5.0,5.0,5.0,14.0,2.46
𝜓[78],2.986,3.788,-5.302,9.020,1.389,1.024,7.0,7.0,8.0,48.0,1.49
𝜓[79],-0.668,4.251,-7.778,6.278,1.840,1.380,5.0,5.0,6.0,16.0,1.85


In [32]:
# posterior histograms of the shared mean, the offsets and the species effects
toytrace(unpooled_sub['trace'], ['𝜓_mean', '𝜓_offset', '𝜓'], ['psi-mean', 'psi-offset', 'psi-spp']);

In [33]:
# Compare the posterior mean of every species effect (x-axis) with the value
# that was used to simulate the unpooled data (y-axis).
c, a, m = toyplot.scatterplot(
    np.mean(unpooled_sub['trace']['𝜓'], axis=0),  # estimated
    SPECIES_DATA['psi'],                           # true
    width=400,
    height=250,
    xlabel="ESTIMATED species velocity",
    ylabel="TRUE species velocity",
);

In [34]:
# model input: predictor, response, the two species indexers and the
# per-species group index
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_partpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# partially pooled model: all five entries are passed on
partpooled_sub = censored_partpooled_noncentered_regression(*model_args, **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝜎, 𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 3070 seconds.
There were 3998 divergences after tuning. Increase `target_accept` or reparameterize.
There were 2603 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 3982 divergences after tuning. Increase `target_accept` or reparameterize.
There were 3984 divergences after tuning. Increase `target_accept` or reparameterize.
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.


In [35]:
# posterior histograms of the group means, the offsets and the species effects
toytrace(partpooled_sub['trace'], ['𝜓_mean', '𝜓_offset', '𝜓'], ['psi-mean', 'psi-offset', 'psi-spp']);

In [36]:
# Compare the posterior mean of every species effect (x-axis) with the
# clade-structured value used to simulate the data (y-axis); markers are
# colored by the group index of the species.
c, a, m = toyplot.scatterplot(
    np.mean(partpooled_sub['trace']['𝜓'], axis=0),  # estimated
    SPECIES_DATA['psi_x'],                           # true
    width=400,
    height=250,
    xlabel="ESTIMATED species velocity",
    ylabel="TRUE species velocity",
    color=[toyplot.color.Palette()[i] for i in SPECIES_DATA.gidx],
);

In [45]:
# one gaussian per group, stacked with a vertical spacing of 0.1
draw_velocity_dists(partpooled_sub['trace'], baseline = 0.1);

### Assess model fit

In [38]:
def rmse(predictions, targets):
    """
    Return the root-mean-square error between `predictions` and `targets`.

    Both arguments must support elementwise subtraction and the result must
    offer `.mean()` (e.g. numpy arrays or pandas Series of equal length).
    """
    squared_errors = (predictions - targets) ** 2
    return np.sqrt(squared_errors.mean())

In [39]:
def aicm(mean, variance):
    """
    Return twice `mean` minus twice `variance`.

    NOTE(review): presumably the AICM model-selection score, which would
    expect the mean and variance of posterior log-likelihoods -- confirm.
    """
    doubled_mean = 2 * mean
    doubled_variance = 2 * variance
    return doubled_mean - doubled_variance

In [40]:
# RMSE between the posterior-mean species effects and the simulated 'psi'
rmse(unpooled_sub['trace']['𝜓'].mean(axis=0), SPECIES_DATA['psi'])

2.410852838638037

In [41]:
# RMSE between the posterior-mean species effects and the simulated 'psi_x'
rmse(partpooled_sub['trace']['𝜓'].mean(axis=0), SPECIES_DATA['psi_x'])

2.537470515573881

In [42]:
# NOTE(review): aicm() receives the mean and variance taken over all 𝜓 draws
# and species here, not over per-draw log-likelihoods -- confirm that this
# is the intended input before comparing models with it.
aicm(unpooled_sub['trace']['𝜓'].mean(), unpooled_sub['trace']['𝜓'].var())

-68.48522152338049

In [43]:
# NOTE(review): aicm() receives the mean and variance taken over all 𝜓 draws
# and species here, not over per-draw log-likelihoods -- confirm that this
# is the intended input before comparing models with it.
aicm(partpooled_sub['trace']['𝜓'].mean(), partpooled_sub['trace']['𝜓'].var())

-41.54120542064919