Mushi
==
_All that the rain promises and more..._

A notebook for testing `mushi`'s ability to invert data simulated under the forward model

API documentation can be viewed with
```python
help(mushi.η)
help(mushi.𝜇)
help(mushi.kSFS)
```

In [None]:
# %matplotlib inline 
%matplotlib notebook
import mushi
import histories
import numpy as np
from matplotlib import pyplot as plt
from scipy.special import expit
import time
import msprime
%cd stdpopsim
from stdpopsim import homo_sapiens
%cd ../

In [None]:
# plt.style.use('dark_background')

### Time grid

In [None]:
# Time grid: 300 logarithmically spaced points from 10**0 = 1 up to 3e4
t = np.logspace(start=0, stop=np.log10(3e4), num=300)

### Demographic history $\eta(t)$ from the European population in Tennessen et al.

In [None]:
# Tennessen et al. two-population out-of-Africa model as shipped with stdpopsim
model = homo_sapiens.TennessenTwoPopOutOfAfrica()

# The msprime debugger is used to read population sizes through time off the model
dd = msprime.DemographyDebugger(
    Ne=model.default_population_size,
    population_configurations=model.population_configurations,
    demographic_events=model.demographic_events,
    migration_matrix=model.migration_matrix,
)

# Evaluate the trajectory at time 0 followed by every grid point in t, keeping
# population column 1 (the European population, per the section heading).
# The factor of 2 presumably converts diploid to haploid sizes — confirm.
sample_times = np.concatenate(([0], t))
y = 2 * dd.population_size_trajectory(sample_times)[:, 1]

η_Tennessen = histories.η(t, y)

### Demographic history $\eta(t)$ from the European population in Browning et al.

In [None]:
# Browning et al. American admixture model as shipped with stdpopsim
model = homo_sapiens.BrowningAmerica()

# Collect the debugger arguments from the model, then build the debugger
debugger_args = dict(
    Ne=model.default_population_size,
    population_configurations=model.population_configurations,
    demographic_events=model.demographic_events,
    migration_matrix=model.migration_matrix,
)
dd = msprime.DemographyDebugger(**debugger_args)

# Sizes at time 0 plus every grid point in t, for population column 1
# (the European population, per the section heading).
# The factor of 2 presumably converts diploid to haploid sizes — confirm.
size_trajectory = dd.population_size_trajectory(np.concatenate(([0], t)))
y = 2 * size_trajectory[:, 1]

η_Browning = histories.η(t, y)

In [None]:
# Overlay the two demographic histories on one small figure
plt.figure(figsize=(3, 3))
for history, name in ((η_Tennessen, 'Tennessen'), (η_Browning, 'Browning')):
    history.plot(label=name)
plt.show()

### Mutation rate history $\mu(t)$
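A 96-dimensional history (one column per mutation type), with each mutation type a randomly scaled mixture of 2 latent signatures:
- constant
- pulse

Only the first 5 mutation types receive a nonzero pulse weight; the remaining ones are purely constant. We will make the mutation rate low, so that the $k$-SFS is noisy and reconstruction of each mutation type independently is difficult.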

In [None]:
# Evaluation grid: time 0 followed by every point of t
tt = np.concatenate(([0], t))

# Latent signature 1: constant through time
flat = np.ones_like(tt)
# ramp = expit(-.01 * (tt - 50))
# Latent signature 2: difference of two sigmoids centred at 100 and 1000
pulse = expit(.01 * (tt - 100)) - expit(.01 * (tt - 1000))

cols = 96
μ0 = 10
Z = np.zeros((len(t) + 1, cols))

np.random.seed(0)
pulse_idxs, flat_idxs = [], []
for col in range(cols):
    # Draw order matters for reproducibility: the scale is drawn for every column,
    # the pulse weight only for the first 5 columns.
    scale = np.random.lognormal(0, 0.2)
    if col < 5:
        pulse_weight = np.random.lognormal(-0.5, .5)
    else:
        pulse_weight = 0
    Z[:, col] = μ0 * (scale * (flat + pulse_weight * pulse))
    # Record which columns carry a pulse component and which are flat only
    (pulse_idxs if pulse_weight else flat_idxs).append(col)

# # add a tiny bit of random diffusion to each column to give scale to the pulse after standardization
# Z_noise = np.ones_like(Z)
# for row in range(1, Z.shape[0]):
#     Z_noise[row, :] = Z_noise[row - 1, :] + np.random.randn(Z.shape[1])
# Z += .01 * Z_noise

μ = histories.μ(t, Z)


In [None]:
# True mutation rate histories: flat-only columns faint (C0), pulse columns stronger (C1)
plt.figure(figsize=(6, 3))
for idxs, alpha, color in ((flat_idxs, 0.1, 'C0'), (pulse_idxs, 0.5, 'C1')):
    μ.plot(idxs, alpha=alpha, lw=2, c=color, normed=False)
plt.show()

In [None]:
# Heatmap of the true mutation spectrum history
# (col_cluster=False presumably keeps the columns in their original order — confirm)
μ.clustermap(col_cluster=False, figsize=(10, 5))
plt.show()

### Simulate a $k$-SFS under this history using Tennessen demography
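- We'll sample 198 haplotypes
- $\eta(t)$ misspecification can be simulated by simulating from Tennessen, but inferring with Browning (this is currently disabled in the cell below, which conditions on the same Tennessen history used for simulation)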

In [None]:
# Number of sampled haplotypes
n = 198

# Simulate the k-SFS under the Tennessen demography with a fixed seed
ksfs_Tennessen = mushi.kSFS(η_Tennessen, n=n)
ksfs_Tennessen.simulate(μ, seed=1)

# # misspecification
# ksfs = mushi.kSFS(η_Browning, X=ksfs_Tennessen.X)
# With the misspecification line above disabled, downstream cells use the Tennessen k-SFS directly
ksfs = ksfs_Tennessen

# Simulated k-SFS: flat-only columns faint (C0), pulse columns stronger (C1)
plt.figure(figsize=(6, 3))
for idxs, alpha, color in ((flat_idxs, 0.1, 'C0'), (pulse_idxs, 0.5, 'C1')):
    ksfs.plot(idxs, alpha=alpha, lw=2, c=color, normed=True)
plt.show()

In [None]:
# Heatmap of the simulated k-SFS
# (col_cluster=False presumably keeps the columns in their original order — confirm)
ksfs.clustermap(col_cluster=False, figsize=(10, 5))
plt.show()

### TMRCA CDF

In [None]:
fig, ax = plt.subplots(figsize=(3, 3))
# NOTE(review): the x-values come from η_Browning, whereas ksfs is currently built from
# η_Tennessen; both histories were constructed on the same grid t, so the change points
# presumably coincide — confirm, or take them from the history attached to ksfs.
ax.plot(η_Browning.change_points, ksfs.tmrca_cdf())
ax.set_xlabel('$t$')
ax.set_ylabel('TMRCA CDF')
ax.set_ylim([0, 1])
ax.set_xscale('symlog')
fig.tight_layout()
plt.show()

### Invert the $k$-SFS conditioned on $\eta(t)$ to get $\boldsymbol\mu(t)$
Accelerated proximal gradient descent

In [None]:
# Settings for the inference call, grouped by role
# loss function parameters
loss_settings = dict(fit='prf', exclude_singletons=False, bins=None)
# time derivative regularization parameters (α_tv alternative previously noted: .999)
tv_settings = dict(λ_tv=1e1, α_tv=0)
# spectral regularization parameters
spectral_settings = dict(λ_r=1e-1, α_r=.999, hard=True)
# convergence parameters
convergence_settings = dict(max_iter=1000, tol=1e-10, γ=0.8)

# Returns the inferred mutation rate history and the cost at each iteration
μ_inferred, f_trajectory = ksfs.infer_μ(**loss_settings,
                                        **tv_settings,
                                        **spectral_settings,
                                        **convergence_settings)

In [None]:
# Cost trajectory of the optimizer, iterations on a symlog axis
fig, ax = plt.subplots(figsize=(4, 2))
ax.plot(f_trajectory)
ax.set_xlabel('iterations')
ax.set_ylabel('cost')
ax.set_xscale('symlog')
fig.tight_layout()
plt.show()

The inferred histories for each mutation type superimposed on the 2 underlying signatures (constant and pulse)

In [None]:
# k-SFS together with the inferred μ: flat-only columns faint (C0), pulse columns stronger (C1)
plt.figure(figsize=(6, 3))
ksfs.plot(flat_idxs, μ=μ_inferred, alpha=0.1, lw=2, c='C0', normed=True)
ksfs.plot(pulse_idxs, μ=μ_inferred, alpha=0.5, lw=2, c='C1', normed=True)
# Save relative to the working directory; the previous absolute, user-specific
# Downloads path made this cell fail on any other machine.
plt.savefig('fit.pdf', transparent=True)
plt.show()

In [None]:
# True histories (dashed) against inferred histories (solid);
# flat-only columns faint (C0), pulse columns stronger (C1)
plt.figure(figsize=(6, 3))
μ.plot(flat_idxs, alpha=0.1, lw=2, c='C0', normed=False, ls='--')
μ.plot(pulse_idxs, alpha=0.5, lw=2, c='C1', normed=False, ls='--')
μ_inferred.plot(flat_idxs, alpha=0.1, lw=2, c='C0', normed=False)
μ_inferred.plot(pulse_idxs, alpha=0.5, lw=2, c='C1', normed=False)
# Save relative to the working directory; the previous absolute, user-specific
# Downloads path made this cell fail on any other machine.
plt.savefig('inferred.pdf', transparent=True)
plt.show()

Heatmap of the inferred mutation spectrum history

In [None]:
# Heatmap of the inferred mutation spectrum history
# (col_cluster=False presumably keeps the columns in their original order — confirm)
μ_inferred.clustermap(col_cluster=False, figsize=(10, 5))
plt.show()

Plot the $\chi^2$ goodness of fit for each $k$-SFS matrix element, and compute a $\chi^2$ goodness-of-fit test for the $k$-SFS matrix as a whole.

In [None]:
# Heatmap of the k-SFS evaluated against the inferred μ, drawn with a red colormap
ksfs.clustermap(μ_inferred, cmap='Reds', figsize=(10, 5))
plt.show()

Example columns of the $k$-SFS (mutation types 1–10, covering both pulse and constant signatures) with the fit under the inferred history

In [None]:
# One stacked panel per plotted column
n_panels = 10
plt.figure(figsize=(3, 2 * n_panels))
# NOTE(review): the 1-based subplot position is reused as the index passed to plot1,
# so indices 1..10 are shown and index 0 is never plotted — confirm this is intended.
for panel in range(1, n_panels + 1):
    plt.subplot(n_panels, 1, panel)
    ksfs.plot1(panel, μ=μ_inferred, prf_quantiles=True)
plt.tight_layout()
plt.show()

### Singular value spectrum of $Z$

In [None]:
# Singular value spectra of the true Z and of the inferred Z.
# np.linalg.svd returns min(rows, cols) singular values, so the bar positions are
# derived from the singular values themselves rather than from a column count of a
# different matrix (the original mixed μ_inferred.Z's shape with the true Z's spectrum).
σ_true = np.linalg.svd(Z, compute_uv=False)
σ_inferred = np.linalg.svd(μ_inferred.Z, compute_uv=False)

plt.figure(figsize=(3, 3))
plt.bar(range(len(σ_true)), σ_true, alpha=0.5, label='true')
plt.bar(range(len(σ_inferred)), σ_inferred, alpha=0.5, label='inferred')
plt.yscale('log')
plt.legend()
plt.tight_layout()
plt.show()