In [1]:
# Generic libraries
import numpy as np
import pandas as pd
import scipy as sp
import tqdm
import seaborn as sns
from itertools import product
import inspect
import multiprocessing
import time
import os
import glob
import ipympl

%matplotlib widget
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.ticker as mtick
from collections import namedtuple
from tabulate import tabulate
from IPython.display import Latex
from IPython.display import HTML
from IPython.core.pylabtools import figsize
from matplotlib import rc


In [2]:
# MDAnalysis
import MDAnalysis as mda
from MDAnalysis.analysis.rms import rmsd
from MDAnalysis.analysis.rms import RMSF
from MDAnalysis.analysis import diffusionmap, align, rms
from MDAnalysis.coordinates.base import Timestep
from MDAnalysis.analysis import contacts
from MDAnalysis.lib import distances
from MDAnalysis.analysis.base import analysis_class
from MDAnalysis.lib.distances import capped_distance, self_capped_distance
from MDAnalysis.lib.distances import distance_array, self_distance_array
import MDAnalysis.analysis.hydrogenbonds as hb

In [3]:
## autocorrelation estimate
def autocorrelation(x):
    """Estimate the normalised autocorrelation function of a 1-D signal.

    Parameters
    ----------
    x : array_like
        One-dimensional sequence of samples (ndarray, list, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        Array of length ``len(x)``; element ``k`` is the autocovariance at
        lag ``k`` divided by the signal variance and by the number of
        overlapping samples ``n - k``, so element 0 equals 1.

    Notes
    -----
    A constant signal has zero variance, so the result is NaN/inf in that
    case (numpy emits a RuntimeWarning); nothing is raised here.
    """
    # Coerce to an ndarray first: plain lists have no .var()/.mean(), and a
    # pandas Series computes .var() with ddof=1, which would make the lag-0
    # value differ from 1.
    x = np.asarray(x)
    n = len(x)
    variance = x.var()
    x = x - x.mean()
    # 'full' correlation has 2n-1 lags; the last n entries are lags 0..n-1.
    r = np.correlate(x, x, mode='full')[-n:]
    # n-k samples overlap at lag k, hence the decreasing normalisation.
    result = r / (variance * np.arange(n, 0, -1))
    return result

In [4]:
## Locations of the shared input structures and of the analysis outputs
from pathlib import Path

# external inputs: the 5Y36 PDB structure and the matching .gro file
simdir = Path('../data/00-external/')
PDB = simdir.joinpath('5Y36.pdb')
GRO = simdir.joinpath('5Y36_box.gro')

# Output locations. These stay plain strings with a trailing slash because
# the plotting cells build file names from them with os.path.join and "+".
output_name = "prod_global_"
plotpath = "../plots/"
datapath = "../data/analysis/"


In [5]:
## import full trajectory data
trajdir = '../data/04-prod/'
XTC  = trajdir + 'cas9_prod_pbc_all.xtc'

## universe creation: the .gro file is the topology, the .xtc the trajectory
u = mda.Universe(str(GRO), str(XTC))
# NOTE: the count below also includes atoms named MG, not only protein atoms.
print(u.select_atoms("protein or name MG").n_atoms, 'protein atoms')

# other info
box = u.dimensions
print('box dimensions:', box)
nframes = u.trajectory.n_frames
print(nframes, 'frames')
# Time between saved frames. MDAnalysis reports times in ps by default,
# which is why the axis is divided by 1000 below to obtain ns.
dt = u.trajectory.dt
# Build the axis from integer frame indices: np.arange with a float step can
# return one element more or less than nframes because of rounding, which
# would desynchronise the time axis from per-frame results.
time_ps = np.arange(nframes) * dt
# NOTE(review): `time` shadows the stdlib `time` module imported in the first
# cell. The name is kept only so cells that already use it keep working;
# prefer `time_ps` / `t_ns` in new code.
time = time_ps
t_ns = time_ps / 1000  # ps -> ns
print(t_ns[-1], 'ns')


22526 protein atoms
box dimensions: [183.38486  183.38486  183.38478   60.000008  60.000008  90.      ]
6401 frames
320.0 ns


 ctime or size or n_atoms did not match


## RMSD



In [None]:

stride = 1
reduced_index = list(range(0, nframes-1, stride))

# Build the selection once: the atom group is static and its .positions
# follow the current frame, so re-parsing the selection string on every
# frame is wasted work.
ca_atoms = u.select_atoms('protein and name CA')

# Reference = C-alpha coordinates of the first frame. Copy them so the
# reference cannot change while the trajectory advances.
u.trajectory[0]
ref_start = ca_atoms.positions.copy()

# RMSD of every selected frame against the reference, after centring and
# optimal superposition; rmsd_b[i] belongs to frame reduced_index[i].
rmsd_b = [rmsd(ca_atoms.positions, ref_start, center=True, superposition=True)
          for ts in u.trajectory[reduced_index]]


In [None]:
# plot of the C-alpha RMSD of the protein
fig, ax = plt.subplots()

# Take the time axis from the very frames the RMSD was computed on, so that
# rmsd_b[i] is drawn at the time of frame reduced_index[i] for any stride.
# (The previous slice t_ns[stride:-1:stride] was shifted by one frame with
# respect to rmsd_b, which starts at frame 0.)
# Assumes `reduced_index` is still the list from the cell that built rmsd_b.
ax.plot(t_ns[reduced_index], rmsd_b, lw=0.75)

ax.set_xlabel('Time [ns]')
ax.set_ylabel(r'RMSD [$\AA$]')
ax.set_title(r'RMSD of the C$\alpha$')
ax.set_ylim(bottom=2)
ax.grid()
# Save before showing: with non-interactive backends plt.show() releases the
# figure and a later savefig would write an empty page.
fig.savefig(os.path.join(plotpath, output_name+'rmsd_protein.pdf'))
plt.show()

In [None]:
stride = 1
reduced_index = list(range(1, nframes, stride))

# Build the selection once; its .positions follow the current frame, so the
# selection string does not need to be re-parsed for every frame.
p_atoms = u.select_atoms('(nucleic and name P)')

# Reference = phosphate coordinates of the first frame (copied so they stay
# fixed while the trajectory advances).
u.trajectory[0]
ref_start = p_atoms.positions.copy()

# RMSD of every selected frame against the reference, after centring and
# optimal superposition; rmsd_bn[i] belongs to frame reduced_index[i].
rmsd_bn = [rmsd(p_atoms.positions, ref_start, center=True, superposition=True)
           for ts in u.trajectory[reduced_index]]


In [None]:
# plot of the phosphate RMSD of the nucleic acids
fig, ax = plt.subplots()

# Use the times of the frames the RMSD was actually computed on, so that
# rmsd_bn[i] is drawn at the time of frame reduced_index[i]. The previous
# slice t_ns[stride:-1:stride] only lined up with the data for stride == 1.
# Assumes `reduced_index` is still the list from the cell that built rmsd_bn.
ax.plot(t_ns[reduced_index], rmsd_bn, lw=0.75)

ax.set_xlabel('Time [ns]')
ax.set_ylabel(r'RMSD [$\AA$]')
ax.set_title(r'RMSD of the phosphates')
ax.set_ylim(bottom=2)
ax.grid()
# Save before showing: with non-interactive backends plt.show() releases the
# figure and a later savefig would write an empty page.
fig.savefig(os.path.join(plotpath, output_name+'rmsd_nucleic.pdf'))
plt.show()

### RMSD of interesting protein domains

## RMSD 2D Map

In [None]:
# Restrict the selection to protein C-alphas, as the other cells do: a bare
# 'name CA' would also match any non-protein atom that happens to be named
# CA (e.g. a calcium ion).
# NOTE(review): AlignTraj appears to take the reference coordinates from the
# reference universe's current frame; pin it to the first frame so the result
# does not depend on where earlier cells left the trajectory.
u.trajectory[0]
aligner = align.AlignTraj(u, u, select='protein and name CA', in_memory=True).run()
# Pairwise RMSD between all frames; the result is an n_frames x n_frames
# array, so this is the expensive step of the section.
matrix = diffusionmap.DistanceMatrix(u, select='protein and name CA').run()

In [None]:
# Heat map of the frame-by-frame RMSD matrix
fig_map, ax_map = plt.subplots()
rmsd_image = ax_map.imshow(matrix.results.dist_matrix, cmap='viridis')
ax_map.set_xlabel('Frame')
ax_map.set_ylabel('Frame')
fig_map.colorbar(rmsd_image, ax=ax_map, label=r'RMSD ($\AA$)')
fig_map.savefig(os.path.join(plotpath, output_name+'rmsd_map.pdf'))
plt.show()

In [None]:
# Mark every pair of frames whose mutual RMSD exceeds 5 (same units as the
# distance matrix) and compute, per frame, the fraction of frames it is far from.
bool_arr = matrix.results.dist_matrix>5.0
high_rmsd_positions = np.where(bool_arr)[0]
# number of (row, column) entries above the threshold
print(len(high_rmsd_positions))
sum_rmsd = np.sum(bool_arr, axis=1) / nframes

# Open a fresh figure: plt.clf() would clear whichever figure is current,
# i.e. the RMSD map drawn by the previous cell.
fig, ax = plt.subplots()
ax.plot(t_ns[0:-1:1], sum_rmsd[0:-1], lw=0.75)
ax.set_xlabel('Time [ns]')
ax.set_ylabel(r'RMSD > 5')
ax.set_title(r'Points with high reciprocal RMSD')
ax.grid()
plt.show()


## RMSF


$$\mathrm{RMSF}_i = \left[\frac{1}{T}\sum_{j=1}^{T} \left|\mathbf{r}_i(t_j)-\langle\mathbf{r}_i\rangle\right|^2\right]^{1/2}$$


In [None]:
from MDAnalysis.analysis.rms import RMSF
from MDAnalysis.analysis import align
from MDAnalysis.coordinates.memory import MemoryReader
from MDAnalysis.analysis.base import AnalysisFromFunction

In [None]:
# Average C-alpha structure of the trajectory, used as the alignment target
average = align.AverageStructure(
    u, u,
    select='protein and name CA',
    ref_frame=1,
)
average.run()
ref = average.results.universe

# Fit every frame onto that average structure, keeping the result in memory
aligner = align.AlignTraj(
    u, ref,
    select='protein and name CA',
    in_memory=True,
)
aligner.run()

In [None]:
# Per-atom RMSF of the protein C-alphas over the whole trajectory
c_alphas = u.select_atoms('protein and name CA')
rmsf_analysis = rms.RMSF(c_alphas, verbose=True)
R = rmsf_analysis.run()

In [None]:
# RMSF of each C-alpha against its residue id
fig2, ax2 = plt.subplots()
ax2.plot(c_alphas.resids, R.results.rmsf, lw=0.8)
ax2.set_xlabel('Residue number')
# raw string: '\A' in a normal string literal is an invalid escape sequence
ax2.set_ylabel(r'RMSF ($\AA$)')
ax2.grid()
# Save before showing: with non-interactive backends plt.show() releases the
# figure and a later savefig would write an empty page.
fig2.savefig(os.path.join(plotpath, output_name+'rmsf_ca.pdf'))
plt.show()

In [None]:
# Indices (positions within the RMSF array, not residue ids) whose RMSF exceeds 4
rmsf_all = R.results.rmsf
bool_arr = rmsf_all > 4
high_rmsf_positions = np.where(bool_arr)[0]
print(high_rmsf_positions)

In [None]:
u.add_TopologyAttr('tempfactors') # add empty attribute for all atoms
protein = u.select_atoms('protein') # select protein atoms

# R was computed on c_alphas, so there is exactly one RMSF value per C-alpha.
assert len(c_alphas) == len(R.results.rmsf), "RMSF array does not match c_alphas"

# Assign each value to all atoms of the residue that owns the C-alpha.
# Going through the C-alpha's own residue keeps values and residues paired
# even if some protein residue has no CA atom; zipping over
# protein.residues would then shift every later value by one residue.
for ca_atom, r_value in zip(c_alphas, R.results.rmsf):
    ca_atom.residue.atoms.tempfactors = r_value

In [None]:
# Suppress warnings only while the PDB file is written. A bare
# warnings.filterwarnings('ignore') would silence every warning for the rest
# of the kernel session, hiding real problems in later cells.
import warnings

# save pdb with beta factor data
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    u.atoms.write(os.path.join(datapath, 'rmsf_tempfactors0-40.pdb')) #writes pdb with the beta factors to analysis

In [None]:
# Interactive 3-D view of the universe, coloured by the B-factor column.
# NOTE(review): presumably the B-factors hold the per-residue RMSF assigned
# to `tempfactors` in an earlier cell -- confirm that cell has been run.
import nglview as nv
view = nv.show_mdanalysis(u)
view.update_representation(color_scheme='bfactor')
# bare expression as the last statement so the notebook displays the widget
view

In [None]:
# RMSF of the C-alphas over four consecutive blocks of the trajectory.
# The frame window is passed to run() only; the constructor does not need it.
# The first block starts at frame 1 and the last one stops at nframes-1,
# exactly as before.
quarter_bounds = [
    (1, nframes//4),
    (nframes//4, nframes//2),
    (nframes//2, 3*nframes//4),
    (3*nframes//4, nframes-1),
]
rmsf1, rmsf2, rmsf3, rmsf4 = (
    RMSF(c_alphas).run(start=first, stop=last) for first, last in quarter_bounds
)

In [None]:
fig, ax = plt.subplots()

# Frame boundaries of the four blocks. They must mirror the start/stop values
# used when rmsf1..rmsf4 were computed. The legend is derived from them and
# from t_ns, so it always states the real time windows instead of
# hard-coded numbers that go stale when the trajectory length changes.
quarter_edges = [1, nframes//4, nframes//2, 3*nframes//4, nframes-1]
quarter_rmsf = [rmsf1, rmsf2, rmsf3, rmsf4]
for part, first, last in zip(quarter_rmsf, quarter_edges[:-1], quarter_edges[1:]):
    window = '{:.0f}-{:.0f} ns'.format(t_ns[first], t_ns[last])
    ax.plot(c_alphas.resnums, part.results.rmsf, lw=0.7, label=window)

ax.set_ylim(top=6.5, bottom=0)
ax.set_xlabel('Residue')
# raw string: '\A' in a normal string literal is an invalid escape sequence
ax.set_ylabel(r'RMSF ($\AA$)')
ax.set_title(r'RMSF of the $\alpha$-carbons, during different parts of the simulation')
ax.grid()
ax.legend(loc=0)
# Save before showing: with non-interactive backends plt.show() releases the
# figure and a later savefig would write an empty page.
fig.savefig(os.path.join(plotpath, output_name+'rmsf_part.pdf'))
plt.show()

In [None]:
# Indices (positions within the RMSF array) above 4 in the third block
rmsf_q3 = rmsf3.results.rmsf
bool_arr = rmsf_q3 > 4
high_rmsf_positions = np.where(bool_arr)[0]
print(high_rmsf_positions)

In [None]:
# Average structure computed on all protein atoms (not only the C-alphas)
average_prot = align.AverageStructure(
    u, u,
    select='protein',
    ref_frame=1,
)
average_prot.run()
ref_prot = average_prot.results.universe

In [None]:
# Save aligned trajectory
# Write next to the other analysis outputs: the previous hard-coded
# './data/analysis/' pointed inside the notebook directory, while every other
# path in this notebook lives under '../data/'.
aligner = align.AlignTraj(u, ref_prot,
                           select='protein',
                           filename=os.path.join(datapath, 'aligned_prot.dcd'),
                           in_memory=False).run()
# Also export the protein atoms of all frames of `u` as XTC.
# NOTE(review): confirm that `u` itself holds the protein-aligned coordinates
# at this point (in_memory=False above) and not an earlier alignment.
prot_al = u.select_atoms("protein")
prot_al.write(os.path.join(datapath,'aligned_prot.xtc'), frames='all')


## Radius of gyration

$$R_\mathrm{gyr} = \sqrt{\frac{1}{M}\sum_{i=1}^{N} m_i(\mathbf{r}_i - \mathbf{R})^2}$$

In [None]:
stride = 1
# NOTE: despite its name, `ca` holds the whole backbone selection
ca = u.select_atoms("backbone")

# radius of gyration of the selection for every frame except the last one
Rgyr = np.array([ca.radius_of_gyration()
                 for ts in u.trajectory[0:nframes-1:stride]])

# a tenth of the spread between the largest and smallest value
rel_diff = (Rgyr.max() - Rgyr.min())/10
print(len(ca))

In [None]:
# Radius of gyration as a function of simulation time
fig, ax = plt.subplots()
ax.plot(t_ns[0:nframes-1:stride], Rgyr, lw=0.8, label=r"$R_G$")
ax.set(xlabel="time (ns)", ylabel=r"radius of gyration $R_G$ ($\AA$)")
ax.grid()
rgyr_pdf = plotpath + output_name + "Rgyr_ca.pdf"
fig.savefig(rgyr_pdf)