# Back-calculate experimental data from structure

Here we back-calculate the experimental quantities from PDB structures and trajectories.
The experimental datasets are
- Dataset A: Exact eNOEs (Nichols et al. (2018b)), consisting of 62 bidirectional exact NOEs, 177 unidirectional eNOEs and 77 generic normalized eNOEs (gn-125 eNOE). This dataset alone was used to determine the structure of the UUCG tetraloop with PDB accession codes 6BY4 and 6BY5. In addition to the original dataset we added 1 new eNOE and 7 new gn-eNOEs:

```
NEW bidirectional eNOE
C8_H4';G10_H8  5.77 0.577

New gn-eNOE, used as lower-bound distances 
 C8_H5;G10_H1'  5.8 0.58 
 C8_H5;G10_H8 5.87 0.587  
 C8_H2';G10_H8 7.06 0.706
 C8_H3';G9_H8 5.78 0.578 
 C8_H2';G9_H8 5.92 0.592 
 C8_H1';G10_H4' 5.28 0.528 
 C8_H2';G9_H2' 4.62 0.462 
```

- Dataset B. Taken from the study “High-Resolution NMR Structure of a RNA model system: the 14mer cUUCGg tetraloop hairpin RNA” and corresponding PDB structure 2KOC. 
- Dataset C. 39 (RDC1) plus 14 (RDC2) residual dipolar couplings. These RDCs have been used in conjunction with MD simulations to obtain a dynamic ensemble of the UUCG tetraloop (Borkar et al. (2017)).

- Dataset D. 91 solvent paramagnetic resonance enhancement (sPRE) measurements from Hartlmüller et al. (2017).



# Dataset A

## bidirectional exact noes (eNOE). 
From the original list of 72 distances, 10 datapoints were removed, corresponding to distances between H41-H42, H21-H22, H5'-H5'' and H5-H6 protons in the same nucleotide. 6 URA H2' 7 URA H6 bi-dir 4.56 and 6 URA H2' 6 URA H6 bi-dir 3.83 are duplicated and were also removed. The experimental error was set to 10% of the eNOE. For all NOE data used here we use the standard $r^{-6}$ averaging, i.e. $\text{NOE}_{\text{CALC}} = (\sum_i w_i r_i^{-6})^{-1/6}$


In [2]:
import numpy as np
import glob

# Helper for atom names: hydrogens are spelled differently across files, so
# the alternative spellings listed here are mapped onto a single form.
alt = {
    "H2'": "1H2'",
    "H5''": "2H5'",
    "H5'": "1H5'",
    "HO2'": "2HO'",
    "H5\"": "2H5'",
}


def sub(ss):
    """Turn a ``RESIDUE_ATOM`` label into ``RESIDUE-ATOM``.

    The atom part is replaced by its entry in ``alt`` when one exists;
    otherwise it is kept unchanged. Only the first two "_"-separated
    fields of ``ss`` are used.
    """
    fields = ss.split("_")
    residue, atom = fields[0], fields[1]
    return "%s-%s" % (residue, alt.get(atom, atom))

# read an experimental datafile and return a list of labels and experimental values
def read_exp(f_exp):
    """Read atom-pair data from the experimental file ``f_exp``.

    Each data line is expected to look like ``ATOM1;ATOM2 value error``.
    Lines containing "#" anywhere, and blank lines, are skipped.

    The two atom names are sorted before being joined with "/", so that
    ``A;B`` and ``B;A`` produce the same label. When a label occurs more
    than once, a message is printed and only the first occurrence is kept.

    Returns:
        (labels, vals): ``labels`` is a list of "ATOM1/ATOM2" strings and
        ``vals`` a list of ``[value, error]`` float pairs, in file order.
    """
    labels = []
    vals = []
    # label -> position in labels/vals, for O(1) duplicate detection
    seen = {}

    with open(f_exp) as fh:
        for line in fh:
            fields = line.split()
            if "#" in line or not fields:
                continue
            names = fields[0].split(";")
            qq = "/".join(sorted([names[0], names[1]]))
            if qq in seen:
                print("# DUPLICATE. Skipping data.. %s %s %s"
                      % (qq, vals[seen[qq]], line.rstrip()))
                continue
            seen[qq] = len(labels)
            vals.append([float(fields[1]), float(fields[2])])
            labels.append(qq)

    return labels, vals


# find indices in topology corresponding to labels in experimental datafile
def get_idxs(labels,top):
    """Translate "ATOM1/ATOM2" labels into pairs of atom indices in ``top``.

    Atom names are taken from ``str(atom)`` for every atom in ``top.atoms``,
    split on "-" and normalised through ``alt``; labels are normalised with
    ``sub``. When a name occurs several times in the topology, the index of
    its first occurrence is used.

    Returns:
        A numpy array with one ``[index1, index2]`` row per label.
        If any atom of any label is missing, a warning is printed and the
        integer 0 is returned instead (callers must check for this).
    """
    # normalised "RESIDUE-ATOM" name -> index of first occurrence
    index_of = {}
    for i, atom in enumerate(top.atoms):
        fields = str(atom).split("-")
        aa = fields[1]
        name = "%s-%s" % (fields[0], alt.get(aa, aa))
        index_of.setdefault(name, i)

    pairs = []
    for el in labels:
        ss = el.split("/")
        at1 = sub(ss[0])
        at2 = sub(ss[1])
        if at1 not in index_of or at2 not in index_of:
            print("# Warning: Either %s or %s are missing" % (at1,at2))
            return 0
        pairs.append([index_of[at1], index_of[at2]])
    print("# Found %d pairs out of %d" % (len(pairs),len(labels)))
    return np.array(pairs)


def write(data,fname):
    """Write the 2-D array ``data`` to the text file ``fname``.

    One line is written per row: the row index formatted as ``%10d``,
    followed by the row values formatted as ``%10.4e`` and separated by
    single spaces. An existing file is overwritten.
    """
    # Format everything first so the file is only opened once the content
    # is known to be valid, then let the context manager close it.
    lines = [
        "%10d " % j
        + " ".join("%10.4e" % data[j, k] for k in range(data.shape[1]))
        + "\n"
        for j in range(data.shape[0])
    ]
    with open(fname, "w") as fh:
        fh.writelines(lines)

    
#######################

    
import mdtraj as md

# Back-calculate the bidirectional eNOE distances, first for the trajectory
# and then for each reference PDB structure.
top = "data/PDB/2koc_gmx.pdb"
traj = "data/traj_temp_f_0.xtc"
labels,vals = read_exp("data/exp/set_A/eNOE.exp.dat")

md_trj = md.load(traj,top=top)
# atom-index pairs matching the experimental labels in this topology
pairs = get_idxs(labels,md_trj.topology)
# convert to Angstrom (factor 10; mdtraj distances are in nanometres)
dists_md = 10*md.compute_distances(md_trj,pairs)
# write to file: one row per frame, one column per experimental label
write(dists_md,"data/calc/set_A/eNOE.calc.dat")

pdbs = ["2koc","6by5"]
# NOTE: dists_pdbs is initialised here but never filled in this cell
dists_pdbs = []
for  p in pdbs:
    # the PDB file provides both the coordinates and the topology
    top = "data/PDB/%s.pdb" % p 
    traj = "data/PDB/%s.pdb" % p 
    md_trj = md.load(traj,top=top)
    # indices are looked up again against this file's own topology
    pairs = get_idxs(labels,md_trj.topology)
    dists = 10.0*md.compute_distances(md_trj,pairs)
    write(dists,"data/calc/set_A/%s.eNOE.calc.dat" %p)
    

# Found 63 pairs out of 63




# Found 63 pairs out of 63
# Found 63 pairs out of 63


## unidirectional eNOE
From the original list of 189 distances, 6 intra-residue NOEs were removed as described above, and RGUA H1' 3 RCYT H6 uni-dir 5.01 was duplicated. The experimental error was set to 15%. After a preliminary refinement round we found it difficult to refine the simulation so as to match the following eNOEs: C13_H5;C13_H5'', G12_H5'';C13_H5, C5_H5;C5_H5'', A4_H62;U11_H3, A4_H61;U11_H3, C5_H4';C5_H5 and U7_H5'';G9_H1. Since these eNOEs have a high value of sigma fit, they were removed from the dataset. 


In [4]:

# Back-calculate the unidirectional eNOE distances, first for the trajectory
# and then for each reference PDB structure.
labels,vals = read_exp("data/exp/set_A/eNOE_unidir.exp.dat")


top = "data/PDB/2koc_gmx.pdb"
traj = "data/traj_temp_f_0.xtc"
#######################
md_trj = md.load(traj,top=top)
# atom-index pairs matching the experimental labels in this topology
pairs = get_idxs(labels,md_trj.topology)
# convert to Angstrom (factor 10; mdtraj distances are in nanometres)
dists_md = 10*md.compute_distances(md_trj,pairs)
# write to file: one row per frame, one column per experimental label
write(dists_md,"data/calc/set_A/eNOE_unidir.calc.dat")

pdbs = ["2koc","6by5"]
# NOTE: dists_pdbs is initialised here but never filled in this cell
dists_pdbs = []
for  p in pdbs:
    # the PDB file provides both the coordinates and the topology
    top = "data/PDB/%s.pdb" % p 
    traj = "data/PDB/%s.pdb" % p 
    md_trj = md.load(traj,top=top)
    # indices are looked up again against this file's own topology
    pairs = get_idxs(labels,md_trj.topology)
    dists = 10.0*md.compute_distances(md_trj,pairs)
    write(dists,"data/calc/set_A/%s.eNOE_unidir.calc.dat" %p)
    

# Found 177 pairs out of 177
# Found 177 pairs out of 177
# Found 177 pairs out of 177


## gn-eNOE
From 88 datapoints, 9 were removed and 5 RCYT H5" 5 RCYT H5, 13 RCYT H2' 13 RCYT H5 were duplicated, resulting in 77 upper-limit distances. The experimental error was set to 10%.

In [5]:

# Back-calculate the gn-eNOE distances, first for the trajectory and then
# for each reference PDB structure.
labels,vals = read_exp("data/exp/set_A/gn_eNOE.exp.dat")
top = "data/PDB/2koc_gmx.pdb"
traj = "data/traj_temp_f_0.xtc"
#######################
md_trj = md.load(traj,top=top)
# atom-index pairs matching the experimental labels in this topology
pairs = get_idxs(labels,md_trj.topology)
# convert to Angstrom (factor 10; mdtraj distances are in nanometres)
dists_md = 10*md.compute_distances(md_trj,pairs)
# write to file: one row per frame, one column per experimental label
write(dists_md,"data/calc/set_A/gn_eNOE.calc.dat")

pdbs = ["2koc","6by5"]
# NOTE: dists_pdbs is initialised here but never filled in this cell
dists_pdbs = []
for  p in pdbs:
    # the PDB file provides both the coordinates and the topology
    top = "data/PDB/%s.pdb" % p 
    traj = "data/PDB/%s.pdb" % p 
    md_trj = md.load(traj,top=top)
    # indices are looked up again against this file's own topology
    pairs = get_idxs(labels,md_trj.topology)
    dists = 10.0*md.compute_distances(md_trj,pairs)
    write(dists,"data/calc/set_A/%s.gn_eNOE.calc.dat" %p)

# Found 84 pairs out of 84
# Found 84 pairs out of 84
# Found 84 pairs out of 84


# Dataset B
Data taken from the study “High-Resolution NMR Structure of a RNA model system: the 14mer cUUCGg tetraloop hairpin RNA” and corresponding PDB structure 2KOC. 

## NOE
We sourced the 251 unambiguous NOE data from the .mr restraint file deposited on the PDB. Experimental error was calculated as $\sigma = (r_{max}-r_{min})/2$

In [6]:
# Back-calculate the dataset-B NOE distances, first for the trajectory and
# then for each reference PDB structure.
labels,vals = read_exp("data/exp/set_B/NOE.exp.dat")

#######################
top = "data/PDB/2koc_gmx.pdb"
traj = "data/traj_temp_f_0.xtc"
md_trj = md.load(traj,top=top)
# atom-index pairs matching the experimental labels in this topology
pairs = get_idxs(labels,md_trj.topology)
# convert to Angstrom (factor 10; mdtraj distances are in nanometres)
dists_md = 10*md.compute_distances(md_trj,pairs)
# write to file: one row per frame, one column per experimental label
write(dists_md,"data/calc/set_B/NOE.calc.dat")

pdbs = ["2koc","6by5"]
# NOTE: dists_pdbs is initialised here but never filled in this cell
dists_pdbs = []
for  p in pdbs:
    # the PDB file provides both the coordinates and the topology
    top = "data/PDB/%s.pdb" % p 
    traj = "data/PDB/%s.pdb" % p 
    md_trj = md.load(traj,top=top)
    # indices are looked up again against this file's own topology
    pairs = get_idxs(labels,md_trj.topology)
    dists = 10.0*md.compute_distances(md_trj,pairs)
    write(dists,"data/calc/set_B/%s.NOE.calc.dat" %p)

# Found 251 pairs out of 251
# Found 251 pairs out of 251
# Found 251 pairs out of 251


# Scalar couplings

Available 3J couplings relative to angles
- 1H5-P, 2H5-P, C4-P (angle $\beta$). Data for residue 1 are omitted because P is not present at the 5' end in simulations.
- H3-P, C4-P(+1) (angle $\epsilon$). Data for C2-P are not used because I could not find valid Karplus relationships.
- 2H5H4, 1H5H4 (angle $\gamma$). Data for C4-1H5/2H5 are not used because I could not find valid Karplus relationships.
- H1'-H2', H2'-H3', H3'-H4' for the sugar.

The error was set to 1.5 Hz for all data. In total, there are 96 scalar couplings. For calculating the scalar couplings from structure we used the Karplus relationships as defined in [baRNAba](https://github.com/srnas/barnaba).

In [4]:
# read experimental data
import barnaba as bb
from barnaba import definitions
# names of the couplings defined in baRNAba, in the order it iterates them
jj = list(definitions.couplings_idx)
exp = []
dd = []
with open("data/exp/set_B/J3.exp.dat") as fh:
    for line in fh:
        fields = line.split()
        # skip comment lines (any line containing "#") and blank lines
        if "#" in line or not fields:
            continue
        parts = fields[0].split("-")
        a = parts[0]
        try:
            a = int(a)
        except ValueError:
            # the residue field starts with a non-numeric character
            # (presumably the one-letter residue name): drop it
            a = int(a[1:])
        b = parts[1]
        dd.append("%d-%s" % (a,b))
        # remaining columns are the experimental numbers for this coupling
        exp.append([float(x) for x in fields[1:]])
exp = np.array(exp)

# labels for every (residue, coupling) combination; residues are numbered
# 1..14, residue-major, matching the "%d-%s" labels collected in dd
jjr = []
for  c in range(14):
    for j in jj:
        jjr.append("%d-%s" % (c+1,j))
# column of each experimental coupling within the flattened jjr ordering
idxs = [jjr.index(el) for el in dd]


In [5]:
# calculate scalar couplings for trajectory and write to file
top = "data/PDB/2koc_gmx.pdb"
# NOTE: native is assigned but not used in this cell
native = "data/PDB/2koc_gmx.pdb"
traj = "data/traj_temp_f_0.xtc"

couplings,res = bb.jcouplings(traj,topology=top) 
# Flatten to len(jjr) columns per row and keep only the columns that have an
# experimental counterpart (idxs).
# assumes couplings is (frames, residues, couplings) with 14 residues — TODO confirm
data = couplings[:,:,:].reshape(-1,len(jjr))[:,idxs]
write(data,"data/calc/set_B/J3.calc.dat")

# same calculation for the 2KOC PDB file (used as both trajectory and topology)
couplings_2koc,res_2koc = bb.jcouplings("data/PDB/2koc.pdb",topology="data/PDB/2koc.pdb")
data_2koc = couplings_2koc[:,:,:].reshape(-1,len(jjr))[:,idxs]
write(data_2koc,"data/calc/set_B/2koc.J3.calc.dat")

# same calculation for the 6BY5 PDB file
couplings_6by5,res_6by5 = bb.jcouplings("data/PDB/6by5.pdb",topology="data/PDB/6by5.pdb")
data_6by5 = couplings_6by5[:,:,:].reshape(-1,len(jjr))[:,idxs]
write(data_6by5,"data/calc/set_B/6by5.J3.calc.dat")

# Loading data/traj_temp_f_0.xtc 
# Loading data/PDB/2koc.pdb 
# Loading data/PDB/6by5.pdb 


# cross-correlated relaxation rates
Cross-correlated relaxation rates are calculated using the script available on [github](https://github.com/sbottaro/CCRR). Here we just format the data to make them compatible with the reweighting script.


In [21]:

import pandas as pd

# Reformat the pre-computed CCRR files so that their columns follow the order
# of the experimental file. For each calculated file: transpose it so that
# its header names become the index, left-merge onto the experimental labels,
# drop the "label" and "exp_val" columns, transpose back and write it out
# without a header line.

# read experimental data (raw string: "\s" is not a valid Python escape)
df_exp = pd.read_csv("data/exp/set_B/CCRR.exp.dat",sep=r"\s+",names=["label","exp_val"],comment="#")


# read 6BY5
df_calc_6by5 = pd.read_csv("data/calc/set_B/6by5_ccrr_calc.dat",sep=r"\s+").transpose()
# merge
df_calc_6by5_ok = df_exp.merge(df_calc_6by5,left_on="label",right_index=True,how="left")
# write to file
df_calc_6by5_ok = (df_calc_6by5_ok[list(df_calc_6by5_ok.columns[2:])]).transpose()
df_calc_6by5_ok.to_csv("data/calc/set_B/6by5.CCRR.calc.dat",sep=" ",float_format='%8.4e',header=False)


# read 2KOC
df_calc_2koc = pd.read_csv("data/calc/set_B/2koc_ccrr_calc.dat",sep=r"\s+").transpose()
# merge
df_calc_2koc_ok = df_exp.merge(df_calc_2koc,left_on="label",right_index=True,how="left")
# write to file
df_calc_2koc_ok = (df_calc_2koc_ok[list(df_calc_2koc_ok.columns[2:])]).transpose()
df_calc_2koc_ok.to_csv("data/calc/set_B/2koc.CCRR.calc.dat",sep=" ",float_format='%8.4e',header=False)


# read trajectory (traj_temp_f_0)
df_calc = pd.read_csv("data/calc/set_B/traj_temp_f_0_ccrr_calc.dat",sep=r"\s+").transpose()
# merge
df_calc_ok = df_exp.merge(df_calc,left_on="label",right_index=True,how="left")
# write to file
df_calc_ok = (df_calc_ok[list(df_calc_ok.columns[2:])]).transpose()
df_calc_ok.to_csv("data/calc/set_B/CCRR.calc.dat",sep=" ",float_format='%8.4e',header=False)

#df_calc_2koc_ok = df_exp.merge(df_calc_2koc,left_on="label",right_index=True,how="left")

#(df[["label","exp_val",0]]).to_csv("2koc_ccrr.dat",sep=" ",float_format='%.2f')

## RDC


32 RDC were taken from the .mr restraint file deposited on the PDB.
For each sample, RDCs are calculated using the software [pales](https://spin.niddk.nih.gov/bax/software/PALES/)

    `pales -pdb sample.pdb -inD exp/rdc_set_c_bme.dat -outD outfile -H -pf1 -wv 0.022`
    

The ensemble average is then rescaled globally by a factor 

$L = \sum_i \text{exp}_i \text{avg}_i/\sum_i \text{avg}_i \text{avg}_i$

Where the sum runs over the experimental data and $\text{avg}_i = \sum_j w_j F(x_j)$
is the pales prediction $F(x_j)$ averaged over the configurations $x_j$.
The error sigma was set to 1Hz. 

# Dataset C

RDCs taken from "Simultaneous NMR characterisation of multiple minima in the free energy landscape of an RNA UUCG tetraloop" are divided into two sub-datasets, RDC_1 (39 datapoints) and RDC_2 (14 datapoints). Back-calculation was performed as in set B. The experimental error was set to 1.4 Hz (RDC_1) and 2.2 Hz (RDC_2) as described in the original paper.



# Dataset D

Solvent PRE data were obtained from the authors of the study  "RNA structure refinement using NMR solvent accessibility data. Sci Rep. 2017; 7: 5393." The back-calculation of experimental measure from structure was performed using the program from Chun Tang's lab, available [here](http://www.tanglab.org/resources/programs) using a probe radius of 0.35 nm.

Since RDC and sPRE calculations both require PDB files, we back-calculate the RDCs of dataset B and dataset C and the sPRE in the code below (note: this takes a long time).


In [9]:
import os
from joblib import Parallel, delayed


# Inputs for the per-frame RDC (and, if re-enabled, sPRE) back-calculation.
traj = "data/traj_temp_f_0.xtc"
top = "data/PDB/2koc_gmx.pdb"
# hydrogen names; in this cell and the next they are only referenced by the
# commented-out sPRE loops
atoms = ["H1","H2","H3","H41","H42","H61","H62","H5","H21","H22","H3'","H4'","1H5'","2H5'","1H2'","H1'","H6","H8"]
# absolute path to the pales executable; machine-specific, adjust as needed
pales_cmd = "/home/sbottaro/Software/pales/linux/pales"

# whole trajectory loaded in memory; do(i) below indexes individual frames
md_trj = md.load(traj,top=top)


def do(i):
    """Save frame ``i`` of ``md_trj`` as a PDB file and run pales on it.

    The frame is written to ``data/PDB/sample_<i>.pdb`` (``i`` zero-padded
    to 8 digits). pales is then run three times through ``os.system``:
    once with the set-B RDC input, then with the RDC1 and RDC2 inputs of
    set C. Return codes are not checked and nothing is returned.
    """
    tag = "sample_%08d" % i
    pdb = "data/PDB/%s.pdb" % tag
    out_B = "data/calc/set_B/%s" % tag
    out_C1 = "data/calc/set_C/%s_1" % tag
    out_C2 = "data/calc/set_C/%s_2" % tag
    # only needed by the commented-out sPRE loop below
    out_D = "data/calc/set_D/%s" % tag
    md_trj[i].save(pdb)

    # (experimental input, output file) for each pales run, in execution order
    runs = (
        ("data/exp/set_B/RDC.pales.exp.dat", out_B),
        ("data/exp/set_C/RDC1_MD.pales.exp.dat", out_C1),
        ("data/exp/set_C/RDC2_MD.pales.exp.dat", out_C2),
    )
    commands = [
        "%s -pdb %s -inD %s -outD %s -H -pf1 -wv 0.022"
        % (pales_cmd, pdb, exp_file, out_file)
        for exp_file, out_file in runs
    ]
    for command in commands:
        os.system(command)
    #for at in atoms:
    #    ee = at.replace("'","p")
    #    cmdD = "data/script/spre.x %s 3.5 0 \"%s\" > %s_%s"   % (pdb,at,out_D,ee)
    #    os.system(cmdD)
    
#oo = Parallel(n_jobs=40)(delayed(do)(i) for i in range(len(md_trj)))


    

In [18]:
# now do the above again for PDB files. Note that H naming is different in RDC1 and RDC2

# Run pales on every PDB file in data/PDB whose name starts with a digit
# between 2 and 6, using the set-B, RDC1 and RDC2 experimental inputs.
lista = glob.glob("data/PDB/[2-6]*.pdb")
for pdb in lista:
    # file name without directory and without the ".pdb" extension
    ii = pdb.rsplit("/", 1)[-1].split(".pdb")[0]
    out_B = "data/calc/set_B/" + ii
    out_C1 = "data/calc/set_C/" + ii + "_1"
    out_C2 = "data/calc/set_C/" + ii + "_2"
    # only needed by the commented-out sPRE loop below
    out_D = "data/calc/set_D/" + ii

    cmdB = (pales_cmd + " -pdb " + pdb
            + " -inD data/exp/set_B/RDC.pales.exp.dat -outD " + out_B
            + " -H -pf1 -wv 0.022")
    cmdC1 = (pales_cmd + " -pdb " + pdb
             + " -inD data/exp/set_C/RDC1.pales.exp.dat -outD " + out_C1
             + " -H -pf1 -wv 0.022")
    cmdC2 = (pales_cmd + " -pdb " + pdb
             + " -inD data/exp/set_C/RDC2.pales.exp.dat -outD " + out_C2
             + " -H -pf1 -wv 0.022")
    for cmd in (cmdB, cmdC1, cmdC2):
        os.system(cmd)
    #for at in atoms:
    #    ee = at.replace("'","p")
    #    cmdD = "data/script/spre.x %s 3.5 0 \"%s\" > %s_%s"   % (pdb,at,out_D,ee)
    #    os.system(cmdD)


In [17]:
# bash code below to reformat the output
#
# Each loop builds one row per pales output file: a substring of the file
# name (offset 7, i.e. after a 7-character prefix such as "sample_"),
# followed by column 9 of the last N lines of the file, where N is 32 for
# RDC, 39 for RDC1 and 14 for RDC2. Rows are appended to the target file,
# so delete it before re-running.

# for el in sample*; do echo -n ${el:7:9} >> RDC.calc.dat; tail -n 32 $el | awk '{printf "  " $9 }END{printf "\n"}' >> RDC.calc.dat ;done
# for el in sample*_1; do echo -n ${el:7:8} >> RDC1.calc.dat; tail -n 39 $el | awk '{printf "  " $9 }END{printf "\n"}' >> RDC1.calc.dat ;done
# for el in sample*_2; do echo -n ${el:7:8} >> RDC2.calc.dat; tail -n 14 $el | awk '{printf "  " $9 }END{printf "\n"}' >> RDC2.calc.dat ;done




# NOTE(review): sample* is matched in the current directory while the output
# path is data/calc/set_B/ — confirm the intended working directory.
for el in sample*; do echo -n ${el:7:9} >> data/calc/set_B/RDC.calc.dat; tail -n 32 $el | awk '{printf "  " $9 }END{printf "\n"}' >> data/calc/set_B/RDC.calc.dat ;done
#
