# Compare simulations with experiments

Here we compare back-calculated and experimental data. For each individual dataset we calculate the RMSE, $\chi^2$, the Spearman rank correlation coefficient and the number of violations.

In [1]:
import sys
import numpy as np
import os
# add BME to the path (should be changed and depends where the BME script is located)
sys.path.append('/home/sbottaro/Software/python_lib/BME/')
import bme_reweight as bme
# Set the initial weights from metadynamics. The bias is passed along with kbt, so
# that the initial weights are set proportional to exp(bias/kbt).
# NOTE(review): 0.008314462 looks like the gas constant in kJ/(mol K) and 280 the
# temperature in K - confirm the units match those of the bias file.
kbt = 0.008314462*280
# Take the second whitespace-separated field of every line that does not contain "#".
# The context manager closes the file as soon as the column has been read.
with open("data/bias") as bias_file:
    bias = [float(line.split()[1]) for line in bias_file if "#" not in line]
# Available data. Each entry starts with a set label, followed by the names of
# the datasets that belong to that set.

#lista1 = [   ["set_A","eNOE","eNOE_unidir","gn_eNOE"] ]

lista2 = [
    ["set_A1", "eNOE", "eNOE_unidir"],
    ["set_B", "NOE", "J3", "RDC"],
    ["set_C", "RDC1", "RDC2"],
    ["set_D", "sPRE"],
]


Now we collect all the data from the different ensembles

In [9]:
import numpy as np
import os
import subprocess
from scipy import stats


def calc_stuff(filename):
    """Load the numeric columns of a whitespace-separated text file.

    Lines that contain "#" anywhere, as well as blank or whitespace-only lines,
    are skipped. For every remaining line the first field is discarded and all
    other fields are converted to float.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        One row per retained line, one column per numeric field (2-D when all
        retained lines have the same number of fields).

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a retained field cannot be converted to float.
    """
    # The context manager closes the file deterministically instead of leaving
    # the handle to the garbage collector.
    with open(filename) as handle:
        rows = [
            [float(x) for x in line.split()[1:]]
            for line in handle
            # Blank lines would otherwise produce empty rows and a ragged array.
            if line.strip() and "#" not in line
        ]
    return np.array(rows)

# Ensembles to evaluate. Entries whose name contains "set" are read from one file
# per replica; the remaining entries are read from a single file each.
ensembles = ["set_MD","set_A1","set_B","set_C","set_D","2koc","6by5"]
# theta value used in the file names of the "set" ensembles. The list is aligned by
# position with the first five entries of `ensembles`; the last two entries never
# look up a theta.
thetas=[20,20,50,60,500]
# Number of replica files (rep_0 ... rep_3) read for every "set" ensemble.
N_REPLICAS = 4

for entry in lista2:
    set_name = entry[0]
    for data_type in entry[1:]:
        for o,ens in enumerate(ensembles):
            d = []
            if "set" in ens:
                # "set_MD" has no directory of its own: its numbers are read from
                # the set_A1 files (a different column is selected further down).
                ens1 = "set_A1" if ens == "set_MD" else ens
                for k in range(N_REPLICAS):
                    ff = "data/comparison_n/%s/rw_%s_%s_theta_%d_rep_%d.stats.dat" % (ens1,set_name,data_type,thetas[o],k)
                    d.append(calc_stuff(ff))
            else:
                ff = "data/comparison/%s/%s_%s.stats.dat" % (set_name,ens,data_type)
                d.append(calc_stuff(ff))
            # d has shape (n_files, n_data_points, n_columns)
            d = np.array(d)

            # Files with 4 numeric columns keep the relevant values in columns 0-3,
            # all other files in columns 4-7. Within that group of four:
            #   +0 reference value, +1 its error, +2 column used for "set_MD",
            #   +3 column used for every other ensemble.
            # NOTE(review): presumably +2 is the back-calculated average before
            # reweighting and +3 the one after - confirm against the writer of
            # the .stats.dat files.
            first_col = 0 if d.shape[2] == 4 else 4
            calc_col = first_col + (2 if ens == "set_MD" else 3)
            ee = d[0,:,first_col]
            ees = d[0,:,first_col+1]
            bb = d[:,:,calc_col]

            # Replica average and spread of the back-calculated values. They are not
            # used by the table printed below; kept as module-level names in case
            # later cells rely on them.
            aa = np.average(bb,axis=0)
            ss = np.std(bb,axis=0,ddof=1)

            # Squared deviation per file (axis 0) and data point (axis 1)
            diff = (bb-ee)**2
            if data_type == "gn_eNOE":
                # For this dataset only positive deviations count: points where the
                # back-calculated value is below the reference contribute zero.
                diff[(bb-ee) < 0.0] = 0.0
            scaled = diff/ees**2
            rmse = np.sqrt(np.average(diff,axis=1))
            chi2 = np.average(scaled,axis=1)
            # Number of data points per file whose squared deviation exceeds the
            # squared error
            viol = np.sum(scaled > 1,axis=1)
            rho = [stats.spearmanr(bb[y,:],ee)[0] for y in range(bb.shape[0])]

            # Mean +/- sample standard deviation over files. Ensembles with a single
            # file have no spread, so their standard deviation prints as nan.
            print("%11s %7s" % (data_type,ens),end=" ")
            print("RMSE: %4.2f+/-%-4.2f " % (np.average(rmse), np.std(rmse,ddof=1)), end=" ")
            print("Chi2: %4.2f+/-%-4.2f " % (np.average(chi2), np.std(chi2,ddof=1)), end=" ")
            print("rho: %4.2f+/-%-4.2f " % (np.average(rho), np.std(rho,ddof=1)), end=" ")
            print("viol: %3d+/-%-3.0f " % (np.average(viol), np.std(viol,ddof=1)))

        # Blank line between datasets
        print("")

       eNOE  set_MD RMSE: 0.58+/-0.02  Chi2: 1.90+/-0.21  rho: 0.84+/-0.02  viol:  19+/-1   
       eNOE  set_A1 RMSE: 0.32+/-0.04  Chi2: 0.57+/-0.14  rho: 0.94+/-0.01  viol:  10+/-4   
       eNOE   set_B RMSE: 0.62+/-0.04  Chi2: 2.20+/-0.11  rho: 0.82+/-0.02  viol:  24+/-1   
       eNOE   set_C RMSE: 0.56+/-0.00  Chi2: 1.78+/-0.10  rho: 0.85+/-0.02  viol:  18+/-1   
       eNOE   set_D RMSE: 0.59+/-0.03  Chi2: 2.01+/-0.24  rho: 0.85+/-0.02  viol:  20+/-2   
       eNOE    2koc RMSE: 0.46+/-nan   Chi2: 1.00+/-nan   rho: 0.91+/-nan   viol:  14+/-nan 
       eNOE    6by5 RMSE: 0.14+/-nan   Chi2: 0.13+/-nan   rho: 0.99+/-nan   viol:   1+/-nan 

eNOE_unidir  set_MD RMSE: 0.47+/-0.02  Chi2: 0.59+/-0.04  rho: 0.87+/-0.01  viol:  32+/-0   
eNOE_unidir  set_A1 RMSE: 0.38+/-0.02  Chi2: 0.42+/-0.04  rho: 0.91+/-0.00  viol:  22+/-4   
eNOE_unidir   set_B RMSE: 0.48+/-0.02  Chi2: 0.62+/-0.03  rho: 0.88+/-0.01  viol:  36+/-2   
eNOE_unidir   set_C RMSE: 0.46+/-0.01  Chi2: 0.56+/-0.02  rho: 0.88+/

Now we plot the experimental data against the back-calculated averages. Large violations in MD are listed, sorted by the deviation (relative to the error)