<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Mini-project-II" data-toc-modified-id="Mini-project-II-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Mini project II</a></span><ul class="toc-item"><li><span><a href="#Data-preparation-and-MSM-estimation/validation" data-toc-modified-id="Data-preparation-and-MSM-estimation/validation-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Data preparation and MSM estimation/validation</a></span></li></ul></li></ul></div>

# Mini project II

Content:
- Stationary distribution and free energies
- Eigenvectors
- Metastable states
- Mean first passage times
- Committors

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import mdshare
import pyemma
import deeptime as dt


def visualize_metastable(samples, cmap, selection='not element H'):
    """Build an nglview widget with one licorice component per metastable state.

    Parameters
    ----------
    samples: list of mdtraj.Trajectory objects
        One trajectory per metastable state, holding all frames sampled
        for that state.
    cmap: matplotlib.colors.ListedColormap
        Colour map that is evaluated at evenly spaced points in [0, 1],
        one point (and therefore one colour) per entry of `samples`.
    selection: str
        Atom selection string deciding which atoms are displayed. For
        details have a look here:
        http://mdtraj.org/latest/examples/atom-selection.html#Atom-Selection-Language

    Returns
    -------
    nglview.NGLWidget
        The widget containing every sample trajectory, each in its own colour.

    Notes
    -----
    Every trajectory is superposed onto ``samples[0]`` using the atoms
    matched by ``'resid 2 3 and mass > 2'`` before it is sliced down to
    `selection`.
    NOTE(review): mdtraj's ``superpose`` presumably aligns in place, so the
    trajectories passed in by the caller may be modified -- confirm.
    """
    import nglview
    from matplotlib.colors import to_hex

    view = nglview.NGLWidget()
    view.clear_representations()
    reference = samples[0]
    for traj in samples:
        alignment_atoms = traj.top.select('resid 2 3 and mass > 2')
        aligned = traj.superpose(reference, atom_indices=alignment_atoms)
        visible = aligned.atom_slice(aligned.top.select(selection))
        view.add_trajectory(visible).add_licorice()

    # Colouring is deliberately done in a second pass, after all components
    # exist; the original author found that doing it in the loop above did
    # not work.
    colour_positions = np.linspace(0, 1, num=len(samples))
    for component, position in enumerate(colour_positions):
        view.update_licorice(
            color=to_hex(cmap(position)),
            component=component,
            repr_index=component)
        view.remove_cartoon(component=component)
    return view


# Fetch the pentapeptide topology and the matching trajectory files through
# mdshare, using 'data' as the working directory.
pdb = mdshare.fetch(
    'pentapeptide-impl-solv.pdb',
    working_directory='data',
)
files = mdshare.fetch(
    'pentapeptide-*-500ns-impl-solv.xtc',
    working_directory='data',
)

## Data preparation and MSM estimation/validation

We load backbone torsions into memory, use a 4D TICA projection at lag time five steps, cluster with 75 $k$-means centers, and show the first 10 implied timescales with errorbars:

In [None]:
from timescales import implied_timescales_msm

In [None]:
# --- Featurization: backbone torsions as cos/sin pairs, loaded into memory.
features = pyemma.coordinates.featurizer(pdb)
features.add_backbone_torsions(cossin=True, periodic=False)
data = pyemma.coordinates.load(files, features=features)

# --- TICA: lag time of 5 steps, 4 output dimensions.
tica_estimator = dt.decomposition.TICA(lagtime=5, dim=4)
tica = tica_estimator.fit(data).fetch_model()
tica_output = list(map(tica.transform, data))
tica_concatenated = np.concatenate(tica_output)

# --- k-means: 75 centers, fitted on every 10th frame of the projection,
# then used to discretize every trajectory.
kmeans_estimator = dt.clustering.KMeans(75, max_iter=75, fixed_seed=1)
kmeans = kmeans_estimator.fit(tica_concatenated[::10]).fetch_model()
dtrajs = list(map(kmeans.transform, tica_output))

# --- Overview figure: IC histograms | free energy in IC1/IC2 | implied timescales.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
hist_ax, fes_ax, its_ax = axes

pyemma.plots.plot_feature_histograms(
    tica_concatenated,
    feature_labels=['IC1', 'IC2', 'IC3', 'IC4'],
    ylog=True,
    ax=hist_ax)

ic1_frames, ic2_frames = tica_concatenated[:, :2].T
pyemma.plots.plot_free_energy(ic1_frames, ic2_frames, ax=fes_ax, legacy=False)
# Overlay the cluster centers (first two ICs only) on the free energy surface.
ic1_centers, ic2_centers = kmeans.cluster_centers[:, :2].T
fes_ax.scatter(ic1_centers, ic2_centers, s=15, c='k')
fes_ax.set_xlabel('IC 1')
fes_ax.set_ylabel('IC 2')

# First 10 implied timescales for lag times 1..50 steps; the axis is
# rescaled with dt=0.1 and labelled in ns.
its_result = implied_timescales_msm(dtrajs, lagtimes=np.arange(1, 51), nits=10)
pyemma.plots.plot_implied_timescales(its_result, units='ns', dt=0.1, ax=its_ax)

fig.tight_layout()

Then, we estimate a Bayesian MSM at lag time five steps and do a CK test with five metastable states:

In [None]:
# Count transitions at a lag time of 5 steps ('effective' counting) and keep
# only the largest connected submodel.
count_estimator = dt.markov.TransitionCountEstimator(5, 'effective')
full_counts = count_estimator.fit(dtrajs).fetch_model()
counts = full_counts.submodel_largest()

# Bayesian MSM estimated from the count model.
msm_estimator = dt.markov.msm.BayesianMSM()
msm = msm_estimator.fit(counts).fetch_model()

# Chapman-Kolmogorov test on `nstates` metastable sets with mlags=6.
nstates = 5
validator = msm_estimator.chapman_kolmogorov_validator(nstates, mlags=6)
cktest = validator.fit(dtrajs).fetch_model()

# The trailing semicolon suppresses the notebook's echo of the return value.
pyemma.plots.plot_cktest(cktest, dt=0.1, units='ns');

Visualise the MSM stationary distribution and the reweighted free energy surface in the first two ICs:

In [None]:
# Per-frame coordinates in the first two ICs and per-frame discrete state.
ic_xy = np.concatenate(tica_output)[:, :2].T
frame_states = np.concatenate(dtrajs)

# NOTE(review): indexing the stationary distribution with raw cluster indices
# assumes the largest connected set kept every cluster state -- confirm.
frame_pi = msm.prior.stationary_distribution[frame_states]
frame_weights = np.concatenate(msm.prior.compute_trajectory_weights(dtrajs))

fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharex=True, sharey=True)

# Left panel: stationary probability of the state each frame belongs to.
pyemma.plots.plot_contour(
    *ic_xy,
    frame_pi,
    ax=axes[0],
    mask=True,
    cbar_label='stationary distribution')
axes[0].set_title('Stationary distribution', fontweight='bold')
axes[0].set_ylabel('IC 2')

# Right panel: free energy surface with frames reweighted by the MSM.
pyemma.plots.plot_free_energy(
    *ic_xy,
    weights=frame_weights,
    ax=axes[1],
    legacy=False)
axes[1].set_title('Reweighted free energy surface', fontweight='bold')

for ax in axes.flat:
    ax.set_xlabel('IC 1')
fig.tight_layout()

Show the first four nontrivial right eigenvectors projected into the first two ICs:

In [None]:
eigvec = msm.prior.eigenvectors_right()

# Sanity check: report whether the first right eigenvector is constant one.
first_eigvec = eigvec[:, 0]
is_constant_one = np.allclose(first_eigvec, 1, atol=1e-15)
print('The first eigenvector is one: {} (min={}, max={})'.format(
    is_constant_one, first_eigvec.min(), first_eigvec.max()))

# Per-frame coordinates in the first two ICs and per-frame discrete state.
ic_xy = np.concatenate(tica_output)[:, :2].T
frame_states = np.concatenate(dtrajs)

# One panel per eigenvector; column 0 is skipped, so panel i shows
# eigenvector column i + 1, labelled as the (i + 2)-th eigenvector.
fig, axes = plt.subplots(1, 4, figsize=(15, 3), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
    frame_values = eigvec[frame_states, i + 1]
    panel_label = '{}. right eigenvector'.format(i + 2)
    pyemma.plots.plot_contour(
        *ic_xy,
        frame_values,
        ax=ax,
        cmap='PiYG',
        cbar_label=panel_label,
        mask=True)
    ax.set_xlabel('IC 1')
axes[0].set_ylabel('IC 2')
fig.tight_layout()

Show the metastable state assignments projected into the first two ICs:

In [None]:
# Coarse-grain the MSM with PCCA into `nstates` metastable states. Using
# `nstates` (set alongside the CK test) instead of a second hard-coded 5 keeps
# the CK test, this decomposition and the state labels in later plots in sync.
pcca = msm.prior.pcca(nstates)

In [None]:
# Map every frame's discrete state to its PCCA metastable assignment.
frame_states = np.concatenate(dtrajs)
metastable_traj = pcca.assignments[frame_states]

# Per-frame coordinates in the first two ICs.
ic_xy = np.concatenate(tica_output)[:, :2].T

fig, ax = plt.subplots(figsize=(5, 4))
_, _, misc = pyemma.plots.plot_state_map(*ic_xy, metastable_traj, ax=ax)
ax.set_xlabel('IC 1')
ax.set_ylabel('IC 2')

# Label the colour bar with one-based state names S_1 ... S_nstates.
state_labels = [r'$\mathcal{S}_%d$' % label for label in range(1, nstates + 1)]
misc['cbar'].set_ticklabels(state_labels)
fig.tight_layout()

Sample 50 frames from the five metastable distributions and visualise using nglview:

In [None]:
# Collect, for every discrete state, the (trajectory, frame) indices where it
# occurs, then draw 50 frames per metastable distribution.
# NOTE(review): this assumes the state indexing of `dtrajs` matches the MSM's
# states (i.e. the largest connected set kept every cluster) -- confirm.
indices = dt.markov.sample.compute_index_states(dtrajs)
sample_indices = dt.markov.sample.indices_by_distribution(indices, pcca.metastable_distributions, 50)

# Extract the sampled frames as in-memory trajectories (outfile=None).
my_samples = [pyemma.coordinates.save_traj(files, idist, outfile=None, top=pdb)
              for idist in sample_indices]

# `matplotlib.cm.get_cmap` was deprecated in matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` accepts the same (name, lut) arguments and works on both old
# and new releases.
cmap = plt.get_cmap('viridis', nstates)
visualize_metastable(my_samples, cmap)

Compute the stationary probabilities and free energies for the five metastable states:

In [None]:
# Table of metastable-state probabilities and free energies G/kT = -ln(pi).
print('state\tπ\t\tG/kT')
for i, s in enumerate(pcca.sets):
    # Probability of a metastable state = summed stationary probability of
    # the discrete states it contains.
    p = msm.prior.stationary_distribution[s].sum()
    free_energy = -np.log(p)
    print('{}\t{:f}\t{:f}'.format(i + 1, p, free_energy))

Compute mean first passage times (MFPTs) between all five metastable states:

In [None]:
from itertools import product

from pandas import DataFrame

# deeptime documents `mfpt` as being in units of the input trajectory time
# step, while the table below is labelled in ns. The trajectory time step used
# throughout this notebook is 0.1 ns (see the `dt=0.1, units='ns'` plot calls),
# so the values are converted before display.
traj_timestep_ns = 0.1

# mfpt[i, j]: mean first passage time from metastable set i to set j,
# kept in trajectory steps exactly as returned by the MSM.
mfpt = np.zeros((nstates, nstates))
for i, j in product(range(nstates), repeat=2):
    mfpt[i, j] = msm.prior.mfpt(
        pcca.sets[i],
        pcca.sets[j])

mfpt_ns = mfpt * traj_timestep_ns

print('MFPT / ns:')
# Rows/columns are labelled with one-based state numbers.
DataFrame(np.round(mfpt_ns, decimals=2), index=range(1, nstates + 1), columns=range(1, nstates + 1))

Compute the coarse-grained flux from metastable state $\mathcal{S}_2$ to metastable state $\mathcal{S}_4$ (zero-based indices 1 and 3 in the code) and visualise the committor projected into the first two ICs:

In [None]:
# Zero-based indices into pcca.sets. The colour-bar label below adds one to
# each, so with the one-based state names this is the transition S_2 -> S_4.
start, final = 1, 3
A, B = pcca.sets[start], pcca.sets[final]
flux = msm.prior.reactive_flux(A, B)

# Coarse-grain the flux onto the PCCA metastable sets.
cg, cgflux = flux.coarse_grain(pcca.sets)

# Per-frame IC1/IC2 coordinates and per-frame forward committor value.
ic_xy = np.concatenate(tica_output)[:, :2].T
frame_committor = flux.forward_committor[np.concatenate(dtrajs)]
committor_label = r'committor $\mathcal{S}_%d \to \mathcal{S}_%d$' % (
    start + 1, final + 1)

fig, ax = plt.subplots(figsize=(5, 4))
pyemma.plots.plot_contour(
    *ic_xy,
    frame_committor,
    cmap='brg',
    ax=ax,
    mask=True,
    cbar_label=committor_label)
fig.tight_layout()