TCC>TTC pulse timing
==

In [None]:
%matplotlib inline
from mushi import kSFS
from histories import eta
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import msprime
import stdpopsim

In [None]:
# plt.style.use('dark_background')

In [None]:
# Directory in which figures are saved as PDFs, e.g. your Downloads folder path.
# Leave as None to skip saving: the plotting cells only call plt.savefig when
# plot_dir is truthy.
plot_dir = None

EUR populations

In [None]:
# Use an ordered tuple rather than a set: the iteration order of a set of
# strings can change between interpreter sessions (hash randomization), which
# would shuffle the legend order and the per-population colors that the
# plotting cells assign via enumerate(pops).
pops = ('CEU', 'FIN', 'GBR', 'IBS', 'TSI')

load 3-SFS for each population, and plot the total SFS

In [None]:
# Build one kSFS object per population from its 3-SFS table, and overlay the
# total SFS of every population on a single small figure.
ksfs_dict = {}
plt.figure(figsize=(3, 3))
for pop in pops:
    sfs_path = f'example_data/{pop}/3-SFS.tsv'
    sfs_table = pd.read_csv(sfs_path, sep='\t', index_col=0)
    # the table values form the count matrix X; its column labels name the
    # mutation types
    pop_ksfs = kSFS(X=sfs_table.values, mutation_types=sfs_table.columns)
    ksfs_dict[pop] = pop_ksfs
    pop_ksfs.plot_total(label=pop)
plt.legend()
plt.show()

clip high frequencies due to ancestral state misidentification

In [None]:
clip_low = 0
clip_high = 10
# The number of haplotypes n (and hence the SFS length n - 1) varies between
# populations, so each population needs its own boolean mask vector.
freq_mask = {}
for pop in pops:
    n_haplotypes = ksfs_dict[pop].n
    sfs_index = np.arange(n_haplotypes - 1)
    # keep entries from index clip_low up to (but excluding) n - clip_high - 1
    freq_mask[pop] = (sfs_index >= clip_low) & (sfs_index < n_haplotypes - clip_high - 1)

time grid of epoch boundaries (measured in generations)

In [None]:
# 200 logarithmically spaced epoch boundaries running from 1 (= 10**0) to
# 200000 generations
change_points = np.logspace(start=0, stop=np.log10(200000), num=200)

masked genome size (excluding conserved sites, repeats, 1KG strict mask, and uncertain ancestral states)

In [None]:
# The file is read whole and parsed as a single integer.
with open('example_data/masked_size.tsv') as size_file:
    size_text = size_file.read()
masked_genome_size = int(size_text)

mutation rate per site per generation

In [None]:
# per-site, per-generation mutation rate assumed throughout this notebook
u = 1.3e-8

mutation rate per masked genome per generation

In [None]:
# scale the per-site rate by the masked genome size to get the total mutation
# rate that is passed to infer_history as mu0
mu0 = u * masked_genome_size

generation time for time calibration

In [None]:
# generation time handed to the history plots as t_gen to calibrate the time
# axis (presumably years per generation — TODO confirm)
t_gen = 29

regularization parameters and convergence criteria

In [None]:
# keyword arguments forwarded to infer_history when fitting eta(t)
regularization_eta = {
    'alpha_tv': 1e2,
    'alpha_spline': 5e3,
    'alpha_ridge': 1e-10,
}
# keyword arguments forwarded to infer_history when fitting mu(t)
regularization_mu = {
    'hard': True,
    'beta_rank': 0,
    'beta_tv': 7e1,
    'beta_ridge': 1e-10,
}
# optimizer stopping criteria shared by every fit
convergence = {
    'tol': 1e-10,
    'max_iter': 1000,
}

fit $\eta(t)$

In [None]:
# Fit the demographic history eta(t) for each population. The left panel
# shows the total SFS plot of each population, the right panel the inferred
# eta(t).
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
for pop in pops:
    print(pop)
    # clear solutions, in case rerunning this cell
    ksfs_dict[pop].clear_eta()
    ksfs_dict[pop].clear_mu()
    # infer_mu=False: only the demography is fit in this cell
    ksfs_dict[pop].infer_history(change_points, mu0, infer_mu=False,
                                 loss='prf', **regularization_eta,
                                 **convergence, mask=freq_mask[pop])
    # the plotting methods draw on the current axes, so select it first
    plt.sca(axes[0])
    ksfs_dict[pop].plot_total(label=pop)
    plt.sca(axes[1])
    ksfs_dict[pop].eta.plot(lw=3, label=pop, t_gen=t_gen)
# The x-limits and legend of the eta panel do not depend on the population,
# so set them once after all curves are drawn rather than on every iteration.
axes[1].set_xlim([1e3, 1e6])
axes[1].legend()
if plot_dir:
    plt.savefig(f'{plot_dir}/europulse.eta.pdf')
plt.show()

loop over a few values of the `beta_spline` parameter, which controls how much L2 smoothness is imposed on the first derivative, and fit $\mu(t)$

In [None]:
# Fit mu(t) for each population at several beta_spline values. Each row of
# the figure corresponds to one beta_spline value: the left column shows the
# TCC>TTC component of the k-SFS, the right column the inferred TCC>TTC
# mutation rate history.
beta_spline_array = (1e-20, 1e3, 1e4)
fig, axes = plt.subplots(len(beta_spline_array), 2,
                         sharex='col', figsize=(8, 3 * len(beta_spline_array)))
for idx, beta_spline in enumerate(beta_spline_array):
    print(f'beta_spline = {beta_spline}')
    for idx2, pop in enumerate(pops):
        print(pop)
        # clear solution, in case rerunning this cell
        ksfs_dict[pop].clear_mu()
        # infer_eta=False: presumably keeps the eta(t) already fit on this
        # object and only fits mu(t) — confirm against the mushi API
        ksfs_dict[pop].infer_history(change_points, mu0, beta_spline=beta_spline, infer_eta=False,
                                     loss='prf', **regularization_mu,
                                     **convergence, mask=freq_mask[pop])
        plt.sca(axes[idx, 0])
        # label curves (and draw the legend) in the first row only
        ksfs_dict[pop].plot('TCC>TTC', clr=True,
                            label=(pop if idx == 0 else None),
                            lw=3, alpha=0.5, c=f'C{idx2}')
        plt.xscale('log')
        if idx == 0:
            plt.legend()
        # only the bottom row keeps its x-axis label
        if idx < axes.shape[0] - 1:
            plt.xlabel(None)
        plt.sca(axes[idx, 1])
        # use the ASCII attribute `mu`, consistent with clear_mu() and the
        # other `.mu` accesses in this notebook (previously spelled `.μ`)
        ksfs_dict[pop].mu.plot(('TCC>TTC',), t_gen=t_gen, clr=True,
                               label=(pop if idx == 0 else None),
                               lw=3, alpha=0.5)
        plt.xscale('log')
        if idx < axes.shape[0] - 1:
            plt.xlabel(None)
        plt.xlim([1e3, 1e6])
plt.tight_layout()
if plot_dir:
    plt.savefig(f'{plot_dir}/europulse.mu.pdf', dpi=300)
plt.show()

Instead of fitting the demography, we can use the demography of Tennessen et al. from `stdpopsim`

In [None]:
# Fetch the stdpopsim "OutOfAfrica_2T12" human demographic model and its
# demography debugger.
species = stdpopsim.get_species("HomSap")
model = species.get_demographic_model("OutOfAfrica_2T12")
ddb = model.get_demography_debugger()
# evaluate the coalescence rate at time zero plus every epoch boundary
steps = np.concatenate(([0], change_points))
# num_samples=[0, 2]: two samples from the second population only (presumably
# the European population, given the plot label — TODO confirm); element [0]
# of the result holds the coalescence rates
coalescence_rates = ddb.coalescence_rate_trajectory(steps=steps,
                                                    num_samples=[0, 2],
                                                    double_step_validation=False)[0]
# eta is the reciprocal of the coalescence rate
eta_Tennessen = eta(change_points, 1 / coalescence_rates)
plt.figure(figsize=(4, 4))
eta_Tennessen.plot(t_gen=t_gen, label='EUR (Tennessen et al.)')
plt.xlim([1e3, 1e6])
plt.legend()
plt.show()

In [None]:
# Refit each population with the Tennessen et al. demography supplied as eta.
# Panels, left to right: total SFS plot, TCC>TTC component of the k-SFS,
# inferred TCC>TTC mutation rate history.
fig, axes = plt.subplots(1, 3, sharex='col', figsize=(12, 4))
for idx, pop in enumerate(pops):
    print(pop)
    pop_ksfs = ksfs_dict[pop]
    pop_color = f'C{idx}'
    pop_ksfs.infer_history(change_points, mu0, eta=eta_Tennessen, beta_spline=1e-20,
                           loss='prf', **regularization_mu, **convergence,
                           mask=freq_mask[pop])
    # the plotting methods draw on the current axes, so select each panel first
    plt.sca(axes[0])
    pop_ksfs.plot_total(c=pop_color, label=pop)
    plt.sca(axes[1])
    pop_ksfs.plot('TCC>TTC', clr=True, label=pop, lw=3, alpha=0.5, c=pop_color)
    plt.sca(axes[2])
    pop_ksfs.mu.plot(('TCC>TTC',), t_gen=t_gen, clr=True,
                     label=pop,
                     lw=3, alpha=0.5)
axes[0].legend()
axes[2].set_xlim([1e3, 1e6])
plt.tight_layout()
if plot_dir:
    plt.savefig(f'{plot_dir}/europulse.Tennessen.pdf', dpi=300)
plt.show()

The Tennessen et al. demography fits the total SFS quite poorly, and the timing of the TCC pulse seems to be incorrectly scaled as a result. The number of segregating variants observed does not match what's expected under this demography:

In [None]:
# Compare, per population, the observed number of segregating variants with
# the number predicted by the fitted model (L @ Z), both restricted to the
# unmasked frequency classes.
plt.figure(figsize=(3, 3))
for pop in pops:
    mask = freq_mask[pop]
    observed_S = ksfs_dict[pop].X[mask, :].sum()
    # use this population's own fitted mutation history rather than a
    # hard-coded 'CEU' one, so each prediction matches the population plotted
    predicted_S = (ksfs_dict[pop].L @ ksfs_dict[pop].mu.Z)[mask, :].sum()
    plt.plot(observed_S, predicted_S, 'o', label=pop)
# dashed identity line: points on it have predicted S equal to observed S
plt.plot([.8e7, 1.05e7], [.8e7, 1.05e7], '--k')
plt.xlabel('observed S')
plt.ylabel('predicted S')
plt.legend()
plt.show()

We conclude that an older site-wise mutation rate estimate was cryptically modifying the diffusion timescale in Harris and Pritchard (2017) via the assumed demography. Tennessen et al. likely used a phylogenetically-calibrated mutation rate `~2.35e-8` (although no rate is reported), rather than an estimate based on trio sequencing.