In [1]:
import itertools
import pandas as pd
import numpy as np
import toytree
import toyplot
import arviz as az
import pymc3 as pm

### Tree and dataframe setup

In [2]:
# generate a random tree
NSPECIES = 80
TREE = toytree.rtree.bdtree(
    ntips=NSPECIES,
    seed=666,
).mod.node_scale_root_height(1.0)

# node idxs that delimit several distinct clades on this tree
CLADES = [152, 153, 154, 155]

# draw and color the four major clades
TREE.draw(
    layout='d', 
    width=500,
    tip_labels=False,
    edge_colors=TREE.get_edge_values_mapped({
        j: toytree.colors[i] for i, j in enumerate(CLADES)
    }),
    scalebar=True,
);

In [3]:
# make group index (gidx)
crown_dict = {i: TREE.get_tip_labels(i) for i in CLADES}
gidx = np.zeros(TREE.ntips, dtype=int)
for tidx, tip in enumerate(TREE.get_tip_labels()):
    for cidx, clade in enumerate(crown_dict):
        if tip in crown_dict[clade]:
            gidx[tidx] = cidx
gidx

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [4]:
# True param values
𝛼_mean = 0.05
𝛼_std = 0.02
𝛽_mean = 3.0
𝛽_std = 0.2
𝜓_mean = 0.0
𝜓_std = 0.33

# 4 different clade effects on rate of RI (used for partial-pooling data)
𝜓_0_mean = 1.0
𝜓_0_std = 0.1
𝜓_1_mean = 0.5
𝜓_1_std = 0.05
𝜓_2_mean = -0.5
𝜓_2_std = 0.05
𝜓_3_mean = -1.0
𝜓_3_std = 0.1

In [5]:
# species dataframe
SPECIES_DATA = pd.DataFrame({
    "gidx": gidx,
    "b": np.random.normal(𝛽_mean, 𝛽_std, TREE.ntips),
    "psi": np.random.normal(𝜓_mean, 𝜓_std, TREE.ntips),
    "psi_x": np.concatenate([
        np.random.normal(𝜓_0_mean, 𝜓_0_std, len(gidx[gidx == 0])),
        np.random.normal(𝜓_1_mean, 𝜓_1_std, len(gidx[gidx == 1])),
        np.random.normal(𝜓_2_mean, 𝜓_2_std, len(gidx[gidx == 2])),
        np.random.normal(𝜓_3_mean, 𝜓_3_std, len(gidx[gidx == 3])),
    ]),
})
SPECIES_DATA.head()

Unnamed: 0,gidx,b,psi,psi_x
0,0,2.570468,-0.236524,0.997529
1,0,3.292698,-0.624017,0.863576
2,0,2.936227,-0.612795,0.956139
3,0,2.746916,-0.234236,1.115301
4,0,2.767121,0.03127,0.981499


### Generate crossing data

In [6]:
def get_dist(tree, idx0, idx1):
    "returns the genetic distance between two nodes on a tree"
    dist = tree.treenode.get_distance(
        tree.idx_dict[idx0], 
        tree.idx_dict[idx1],
    )
    return dist

# get all combinations of two sampled taxa
a, b = zip(*itertools.combinations(range(NSPECIES), 2))

# organize into DF and get genetic distance between pairs
DATA = pd.DataFrame({
    "sidx0": a,
    "sidx1": b,
    "dist": [(get_dist(TREE, i, j) / 2) for (i, j) in zip(a, b)],
})

DATA['b'] = np.random.normal(𝛽_mean, 𝛽_std, DATA.shape[0])
DATA['velo'] = (
    DATA['b']
    + SPECIES_DATA['psi'][DATA.sidx0].values
    + SPECIES_DATA['psi'][DATA.sidx1].values
)
DATA['velo_x'] = (
    DATA['b']
    + SPECIES_DATA['psi_x'][DATA.sidx0].values
    + SPECIES_DATA['psi_x'][DATA.sidx1].values
)
DATA['intercept'] = np.random.normal(𝛼_mean, 𝛼_std, DATA.shape[0])
# DATA['error'] = np.random.normal(0.0, 𝜎_std, DATA.shape[0])

# get logits
DATA['logit_b'] = (
    1 / (1 + np.exp(-(DATA.b * DATA.dist ** 2 + DATA.intercept * DATA.dist)))
)
DATA['logit'] = (
    1 / (1 + np.exp(-(DATA.velo * DATA.dist ** 2 + DATA.intercept * DATA.dist)))
)
DATA['logit_x'] = (
    1 / (1 + np.exp(-(DATA.velo_x * DATA.dist ** 2 + DATA.intercept * DATA.dist)))
)

# get RI estimates
DATA['RI_pooled'] = np.random.binomial(n=1, p=DATA.logit_b / DATA.logit_b.max())
DATA['RI_unpooled'] = np.random.binomial(n=1, p=DATA.logit / DATA.logit.max())
DATA['RI_partpooled'] = np.random.binomial(n=1, p=DATA.logit_x / DATA.logit_x.max())

DATA.head()

Unnamed: 0,sidx0,sidx1,dist,b,velo,velo_x,intercept,logit_b,logit,logit_x,RI_pooled,RI_unpooled,RI_partpooled
0,0,1,0.073376,2.881579,2.021039,4.742684,0.023752,0.504314,0.503156,0.506819,0,0,0
1,0,2,0.089748,2.638508,1.789189,4.592176,0.074466,0.506983,0.505273,0.510916,1,1,0
2,0,3,0.12977,3.113867,2.643107,5.226697,0.055129,0.514894,0.512913,0.523775,0,1,0
3,0,4,0.144542,3.059197,2.853944,5.038226,0.059521,0.518121,0.517051,0.528435,0,0,1
4,0,5,0.144542,2.620887,2.866354,4.699434,0.059066,0.515818,0.517099,0.526655,0,0,0


In [7]:
NSAMPLES = 2000
SAMPLE = DATA.sample(NSAMPLES).copy().reset_index(drop=True)
SAMPLE.head()

Unnamed: 0,sidx0,sidx1,dist,b,velo,velo_x,intercept,logit_b,logit,logit_x,RI_pooled,RI_unpooled,RI_partpooled
0,60,63,0.578689,2.92144,2.625589,1.913308,0.044766,0.731892,0.71201,0.66075,1,1,1
1,46,74,0.59272,3.28733,3.246154,1.886728,0.043472,0.765067,0.762457,0.665657,0,0,0
2,23,30,0.438896,2.615066,2.010742,4.803178,0.057368,0.629231,0.601689,0.721198,1,0,1
3,61,71,0.59272,3.160457,2.911351,1.77862,0.020785,0.754479,0.737909,0.654116,1,1,1
4,23,34,0.590047,3.069562,2.832857,4.990309,0.039344,0.748739,0.73292,0.853287,1,1,1


### Visualize data

In [8]:
def logit_plot(dist, logit, RI):
    canvas = toyplot.Canvas(width=500, height=250)
    ax0 = canvas.cartesian(
        label="pooled data (function)",
        xlabel="Genetic dist.",
        ylabel="Logit function",
        grid=(1, 2, 0),
    )
    ax1 = canvas.cartesian(
        label="pooled data (observation)",
        xlabel="Genetic dist.",
        ylabel="RI",
        grid=(1, 2, 1),
    )

    # points are jittered on x-axis for visibility
    ax0.scatterplot(
        dist,
        logit,
        size=5,
        opacity=0.33,
        color=toyplot.color.Palette()[0],
    );
    ax1.scatterplot(
        dist,
        RI,
        size=10,
        opacity=0.2,
        marker="|",
        mstyle={
            "stroke": toyplot.color.Palette()[1],
            "stroke-width": 3,
        },
    );
    return canvas, (ax0, ax1)

In [9]:
logit_plot(SAMPLE.dist, SAMPLE.logit_b, SAMPLE.RI_pooled);

In [10]:
logit_plot(SAMPLE.dist, SAMPLE.logit, SAMPLE.RI_unpooled);

In [11]:
logit_plot(SAMPLE.dist, SAMPLE.logit_x, SAMPLE.RI_partpooled);

### Define models

In [12]:
def pooled_logistic(x, y, **kwargs):
    
    # define model
    with pm.Model() as model:  

        # parameters and error
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        
        # link function
        effect = (𝛽 * x ** 2) + 𝛼 * x
        logit = pm.Deterministic("logit", pm.invlogit(effect))
        
        # data likelihood
        y = pm.Bernoulli("y", p=logit, observed=y)
        
        # sample posterior, skip burnin
        trace = pm.sample(**kwargs)[1000:]
    
        # show summary table
        stats = pm.summary(trace)
        
    # organize results
    result_dict = {
        'model': model, 
        'trace': trace,
        'stats': stats,
    }
    return result_dict

In [13]:
def unpooled_logistic(x, y, idx0, idx1, **kwargs):
    
    # define model
    with pm.Model() as model:
        
        # indexers
        sidx0 = pm.Data("spp_idx0", idx0.values)
        sidx1 = pm.Data("spp_idx1", idx1.values)

        # parameters and error
        𝜓_mean = pm.Normal('𝜓_mean', mu=0., sigma=5., shape=1)
        𝜓_std = pm.HalfNormal('𝜓_std', 5., shape=1)
        𝜓_offset = pm.Normal('𝜓_offset', mu=0, sigma=1., shape=TREE.ntips)
        𝜓 = pm.Deterministic('𝜓', 𝜓_mean + 𝜓_std * 𝜓_offset)
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        
        # link function
        effect = ((𝛽 + 𝜓[sidx0] + 𝜓[sidx1]) * x ** 2) + 𝛼 * x
        logit = pm.Deterministic("logit", pm.invlogit(effect))
        
        # data likelihood
        y = pm.Bernoulli("y", p=logit, observed=y)
        
        # sample posterior, skip burnin
        trace = pm.sample(**kwargs)[1000:]
    
        # show summary table
        stats = pm.summary(trace)
        
    # organize results
    result_dict = {
        'model': model, 
        'trace': trace,
        'stats': stats,
    }
    return result_dict

In [14]:
def partpooled_logistic(x, y, idx0, idx1, gidx, **kwargs):
    
    # define model
    with pm.Model() as model:
        
        # indexers
        sidx0 = pm.Data("spp_idx0", idx0)
        sidx1 = pm.Data("spp_idx1", idx1)
        gidx = pm.Data("gidx", gidx)

        # parameters and error
        𝜓_mean = pm.Normal('𝜓_mean', mu=0., sigma=5., shape=4)
        𝜓_std = pm.HalfNormal('𝜓_std', 5., shape=4)
        𝜓_offset = pm.Normal('𝜓_offset', mu=0, sigma=1., shape=TREE.ntips)
        𝜓 = pm.Deterministic('𝜓', 𝜓_mean[gidx] + 𝜓_std[gidx] * 𝜓_offset)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        
        # linear model prediction
        effect = ((𝛽 + 𝜓[sidx0] + 𝜓[sidx1]) * x ** 2) + 𝛼 * x
        logit = pm.Deterministic("logit", pm.invlogit(effect))
        
        # data likelihood (normal distributed errors)
        y = pm.Bernoulli("y", p=logit, observed=y)

        # sample posterior, skip burnin
        trace = pm.sample(**kwargs)[1000:]

        # show summary table
        stats = pm.summary(trace)
        
    # organize results
    result_dict = {
        'model': model, 
        'trace': trace,
        'stats': stats,
    }
    return result_dict

### Functions to plot results

In [15]:
def toytrace(trace, var_names, titles):
    """
    Plot posterior trace with toyplot
    """
    nvars = len(var_names)
    
    # setup canvase
    canvas = toyplot.Canvas(width=500, height=200 * nvars)
    
    # store axes
    axes = []
    
    # iter over params
    for pidx, param in enumerate(var_names):
        
        # get param posterior
        posterior = trace.get_values(param)
        
        # setup axes 
        ax = canvas.cartesian(grid=(nvars, 1, pidx))
        ax.y.show = False
        ax.x.spine.style = {"stroke-width": 1.5}
        ax.x.ticks.labels.style = {"font-size": "12px"}
        ax.x.ticks.show = True
        ax.x.label.text = f"param='{titles[pidx]}'"        
        
        # iterate over shape of param
        for idx in range(posterior.shape[1]):
            mags, bins = np.histogram(posterior[:, idx], bins=100)
            ax.plot(bins[1:], mags, stroke_width=2, opacity=0.6)
        axes.append(ax)
    return canvas, axes

In [16]:
import scipy.stats as stats

def draw_velocity_dists(trace, baseline=0.15):
    """
    Draw the clade velocities as gaussians
    """
    canvas = toyplot.Canvas(width=350, height=300)
    axes = canvas.cartesian(xlabel="Relative velocity of reproductive isolation")
    marks = []
    base = 0
    for i in range(trace['𝜓_mean'].shape[1]):
        
        loc = trace['𝜓_mean'][:, i].mean()
        scale = trace['𝜓_std'][:, i].mean()
        interval = stats.norm.interval(0.995, loc, scale)
        points = np.linspace(interval[0], interval[1], 100)
        mark = axes.fill(
            points, 
            stats.norm.pdf(points, loc=loc, scale=scale), 
            style={
                "fill-opacity": 0.45,
                "stroke": 'black',
                "stroke-opacity": 1.0,
                "stroke-width": 1,
            },
            baseline=np.repeat(base, 100),
        )
        marks.append(mark)
        axes.hlines(base, style={"stroke-dasharray": "5,5", 'stroke-width': 1})
        base += baseline
        
    axes.y.show = False
    axes.x.ticks.locator = toyplot.locator.Extended(only_inside=True)
    axes.x.ticks.show = True
    return canvas, axes, marks

In [17]:
# MCMC sampler kwargs
sample_kwargs = dict(
    tune=10000,
    draws=10000,
    target_accept=0.99,
    return_inferencedata=False,
    progressbar=True,
)

### Run three datasets under pooled model

In [18]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_pooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# pooled model
pooled_model_pooled_data = pooled_logistic(*model_args[:2], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛽, 𝛼]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 416 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [44]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_unpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# pooled model
pooled_model_unpooled_data = pooled_logistic(*model_args[:2], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛽, 𝛼]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 352 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [None]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_partpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# pooled model
pooled_model_partpooled_data = pooled_logistic(*model_args[:2], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛽, 𝛼]


### Run three datasets under unpooled model

In [None]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_pooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# unpooled model
unpooled_model_pooled_data = unpooled_logistic(*model_args[:4], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛽, 𝛼, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 14372 seconds.


In [31]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_unpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# unpooled model
unpooled_model_unpooled_data  = unpooled_logistic(*model_args[:4], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛽, 𝛼, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 18129 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [33]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_partpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# unpooled model
unpooled_model_partpooled_data = unpooled_logistic(*model_args[:4], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛽, 𝛼, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 14133 seconds.
The number of effective samples is smaller than 25% for some parameters.


### Run three datasets under partpooled model

In [36]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_pooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# unpooled model
partpooled_model_pooled_data = partpooled_logistic(*model_args, **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 6540 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [38]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_unpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# unpooled model
partpooled_model_unpooled_data = partpooled_logistic(*model_args, **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 8877 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [19]:
# model input
model_args = [
    SAMPLE.dist,
    SAMPLE.RI_partpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

# unpooled model
partpooled_model_partpooled_data = partpooled_logistic(*model_args, **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 10_000 tune and 10_000 draw iterations (40_000 + 40_000 draws total) took 7561 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [20]:
partpooled_model_partpooled_data['stats']

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
𝜓_mean[0],1.577,2.248,-2.620,5.783,0.029,0.021,5821.0,5821.0,5821.0,11576.0,1.0
𝜓_mean[1],0.590,2.254,-3.522,4.943,0.029,0.021,5900.0,5900.0,5900.0,11948.0,1.0
𝜓_mean[2],0.119,2.255,-4.205,4.257,0.029,0.021,5893.0,5893.0,5892.0,12005.0,1.0
𝜓_mean[3],-0.648,2.253,-4.897,3.542,0.029,0.021,5875.0,5875.0,5875.0,11874.0,1.0
𝜓_offset[0],0.172,0.966,-1.702,1.938,0.005,0.005,37948.0,18701.0,37957.0,27832.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
logit[1995],0.737,0.029,0.683,0.792,0.000,0.000,35488.0,34896.0,36272.0,29696.0,1.0
logit[1996],0.566,0.019,0.531,0.601,0.000,0.000,42619.0,42619.0,42615.0,30446.0,1.0
logit[1997],0.635,0.056,0.529,0.742,0.000,0.000,40312.0,40312.0,40542.0,30218.0,1.0
logit[1998],0.672,0.058,0.567,0.787,0.000,0.000,33788.0,33287.0,33950.0,29849.0,1.0


In [21]:
toytrace(partpooled_model_partpooled_data['trace'], ['𝜓_mean', '𝜓_offset', '𝜓'], ['psi-mean', 'psi-offset', 'psi-spp']);

In [22]:
# show plot of TRUE vs. ESTIMATED rates
c, a, m = toyplot.scatterplot(
    partpooled_model_partpooled_data['trace']['𝜓'].mean(axis=0),         # estimated
    SPECIES_DATA['psi_x'],                             # true
    width=400,
    height=250,
    xlabel="ESTIMATED species velocity",
    ylabel="TRUE species velocity",
    color=[toyplot.color.Palette()[i] for i in SPECIES_DATA.gidx],
);

In [53]:
stats.linregress(partpooled_model_partpooled_data['trace']['𝜓'].mean(axis=0), SPECIES_DATA['psi_x'])

LinregressResult(slope=0.8996203153771799, intercept=-0.4165166943064579, rvalue=0.9550366624726686, pvalue=6.172818681506917e-43, stderr=0.03162267701105342)

In [23]:
draw_velocity_dists(partpooled_model_partpooled_data['trace'], baseline = 1.2);

### Assess model fit

In [30]:
def rmse(predictions, targets):
    differences = predictions - targets                       #the DIFFERENCEs.
    differences_squared = differences ** 2                    #the SQUAREs of ^
    mean_of_differences_squared = differences_squared.mean()  #the MEAN of ^
    rmse_val = np.sqrt(mean_of_differences_squared)           #ROOT of ^
    return rmse_val

In [31]:
def aicm(mean, variance):
    return 2*mean - 2*variance

In [32]:
rmse(unpooled_sub['trace']['𝜓'].mean(axis=0), SPECIES_DATA['psi'])

0.9292188416833755

In [33]:
rmse(partpooled_sub['trace']['𝜓'].mean(axis=0), SPECIES_DATA['psi_x'])

0.6797919539022527

In [34]:
aicm(unpooled_sub['trace']['𝜓'].mean(), unpooled_sub['trace']['𝜓'].var())

-23.831186286890336

In [35]:
# Raftery et al. (2007) holds that this value is better.  R+M disagree?
aicm(partpooled_sub['trace']['𝜓'].mean(), partpooled_sub['trace']['𝜓'].var())

-11.997795212343398

In [45]:
az_pooled_model_pooled_data = az.from_pymc3(trace = pooled_model_pooled_data['trace'], 
                                            model = pooled_model_pooled_data['model'])
az_pooled_model_unpooled_data = az.from_pymc3(trace = pooled_model_unpooled_data['trace'], 
                                            model = pooled_model_unpooled_data['model'])
az_pooled_model_partpooled_data = az.from_pymc3(trace = pooled_model_partpooled_data['trace'], 
                                            model = pooled_model_partpooled_data['model'])
az_unpooled_model_pooled_data = az.from_pymc3(trace = unpooled_model_pooled_data['trace'], 
                                            model = unpooled_model_pooled_data['model'])
az_unpooled_model_unpooled_data = az.from_pymc3(trace = unpooled_model_unpooled_data['trace'], 
                                            model = unpooled_model_unpooled_data['model'])
az_unpooled_model_partpooled_data = az.from_pymc3(trace = unpooled_model_partpooled_data['trace'], 
                                            model = unpooled_model_partpooled_data['model'])
az_partpooled_model_pooled_data = az.from_pymc3(trace = partpooled_model_pooled_data['trace'], 
                                            model = partpooled_model_pooled_data['model'])
az_partpooled_model_unpooled_data = az.from_pymc3(trace = partpooled_model_unpooled_data['trace'], 
                                            model = partpooled_model_unpooled_data['model'])
az_partpooled_model_partpooled_data = az.from_pymc3(trace = partpooled_model_partpooled_data['trace'], 
                                            model = partpooled_model_partpooled_data['model'])

### Compare models with different datasets

In [46]:
az.compare({"pooled_model_pooled_data": az_pooled_model_pooled_data,
            "pooled_model_unpooled_data": az_pooled_model_unpooled_data, 
            "pooled_model_partpooled_data": az_pooled_model_partpooled_data})

The scale is now log by default. Use 'scale' argument or 'stats.ic_scale' rcParam if you rely on a specific value.
A higher log-score (or a lower deviance) indicates a model with better predictive accuracy.
  "\nThe scale is now log by default. Use 'scale' argument or "


Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
pooled_model_pooled_data,0,-702.378,2.06549,0.0,0.765127,25.3411,0.0,False,log
pooled_model_partpooled_data,1,-725.945,1.98954,23.5669,0.230179,24.9677,32.0085,False,log
pooled_model_unpooled_data,2,-773.488,2.05086,71.1099,0.00469398,24.6806,32.626,False,log


In [47]:
az.compare({"unpooled_model_pooled_data": az_unpooled_model_pooled_data,
            "unpooled_model_unpooled_data": az_unpooled_model_unpooled_data, 
            "unpooled_model_partpooled_data": az_unpooled_model_partpooled_data})

The scale is now log by default. Use 'scale' argument or 'stats.ic_scale' rcParam if you rely on a specific value.
A higher log-score (or a lower deviance) indicates a model with better predictive accuracy.
  "\nThe scale is now log by default. Use 'scale' argument or "


Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
unpooled_model_pooled_data,0,-701.455,20.3711,0.0,0.736811,24.3943,0.0,False,log
unpooled_model_partpooled_data,1,-722.43,24.5418,20.9746,0.259126,25.4337,32.1201,False,log
unpooled_model_unpooled_data,2,-773.176,15.8619,71.7212,0.00406259,25.4249,32.4747,False,log


In [48]:
az.compare({"partpooled_model_pooled_data": az_partpooled_model_pooled_data,
            "partpooled_model_unpooled_data": az_partpooled_model_unpooled_data, 
            "partpooled_model_partpooled_data": az_partpooled_model_partpooled_data})

The scale is now log by default. Use 'scale' argument or 'stats.ic_scale' rcParam if you rely on a specific value.
A higher log-score (or a lower deviance) indicates a model with better predictive accuracy.
  "\nThe scale is now log by default. Use 'scale' argument or "


Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
partpooled_model_pooled_data,0,-703.59,29.7137,0.0,0.542956,26.1819,0.0,False,log
partpooled_model_partpooled_data,1,-708.151,19.3844,4.56058,0.45692,25.2337,32.344,False,log
partpooled_model_unpooled_data,2,-776.279,26.2139,72.6886,0.000124287,25.1375,32.9638,False,log


### Compare datasets with different models

In [54]:
az.compare({"pooled_model_pooled_data": az_pooled_model_pooled_data,
            "unpooled_model_pooled_data": az_unpooled_model_pooled_data, 
            "partpooled_model_pooled_data": az_partpooled_model_pooled_data})

The scale is now log by default. Use 'scale' argument or 'stats.ic_scale' rcParam if you rely on a specific value.
A higher log-score (or a lower deviance) indicates a model with better predictive accuracy.
  "\nThe scale is now log by default. Use 'scale' argument or "


Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
unpooled_model_pooled_data,0,-701.455,20.3711,0.0,0.496582,24.8633,0.0,False,log
pooled_model_pooled_data,1,-702.378,2.06549,0.922622,0.337148,24.9655,2.11963,False,log
partpooled_model_pooled_data,2,-703.59,29.7137,2.13495,0.16627,25.2935,1.85016,False,log


In [55]:
az.compare({"pooled_model_unpooled_data": az_pooled_model_unpooled_data,
            "unpooled_model_unpooled_data": az_unpooled_model_unpooled_data, 
            "partpooled_model_unpooled_data": az_partpooled_model_unpooled_data})

The scale is now log by default. Use 'scale' argument or 'stats.ic_scale' rcParam if you rely on a specific value.
A higher log-score (or a lower deviance) indicates a model with better predictive accuracy.
  "\nThe scale is now log by default. Use 'scale' argument or "


Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
unpooled_model_unpooled_data,0,-773.176,15.8619,0.0,0.469666,25.6495,0.0,False,log
pooled_model_unpooled_data,1,-773.488,2.05086,0.311349,0.452417,25.5891,1.57715,False,log
partpooled_model_unpooled_data,2,-776.279,26.2139,3.10243,0.0779171,25.9551,1.84647,False,log


In [56]:
az.compare({"pooled_model_partpooled_data": az_pooled_model_partpooled_data,
            "unpooled_model_partpooled_data": az_unpooled_model_partpooled_data, 
            "partpooled_model_partpooled_data": az_partpooled_model_partpooled_data})

The scale is now log by default. Use 'scale' argument or 'stats.ic_scale' rcParam if you rely on a specific value.
A higher log-score (or a lower deviance) indicates a model with better predictive accuracy.
  "\nThe scale is now log by default. Use 'scale' argument or "


Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
partpooled_model_partpooled_data,0,-708.151,19.3844,0.0,0.988806,25.5329,0.0,False,log
unpooled_model_partpooled_data,1,-722.43,24.5418,14.2791,0.00509296,25.644,5.44499,False,log
pooled_model_partpooled_data,2,-725.945,1.98954,17.794,0.00610141,25.5736,7.08617,False,log


### other stuff

In [50]:
# Save trace.  To load in different notebook, model context and sample data is required.
pm.save_trace(partpooled_model_partpooled_data['trace'], directory = "/home/henry/oaks-thesis/trace/quadratic-pmpd")

'/home/henry/oaks-thesis/trace/quadratic-pmpd'

In [51]:
quadratic_args = [
    SAMPLE.dist,
    SAMPLE.RI_partpooled,
    SAMPLE.sidx0,
    SAMPLE.sidx1,
    gidx
]

In [52]:
%store quadratic_args

Stored 'quadratic_args' (list)
