# Estimating Volumes
What is a good resolution to accurately estimate the volume of the central group?

In [None]:
import sys
# add ..//scripts// to the module search path (presumably where the local
# packages imported further down live; verify against the repository layout)
sys.path.append('..//scripts//')

# allows for automatic reloading of imports and makes it unnecessary to restart the kernel
# whenever a function is changed
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib notebook

import matplotlib.pyplot as plt

import time
import csv

import pandas as pd
import numpy as np

from classes.Settings import Settings
from classes.Radii import Radii

from constants.paths import WORKDIR

from helpers.density_helpers import find_available_volume

from calc_avg_fragment import calc_avg_frag

In [None]:
# Bin sizes (resolutions) to test, ordered from coarse to fine:
# 1.0, 0.9, ..., 0.1, 0.09, ..., 0.02.
#
# The grids are built from integers and divided afterwards. np.arange with a
# fractional step computes start + i * step in floating point and yields values
# such as 0.30000000000000004, which would end up in the results file.
resolutions0 = np.arange(2, 10) / 100    # 0.02, 0.03, ..., 0.09
resolutions1 = np.arange(1, 11) / 10     # 0.1, 0.2, ..., 1.0

resolutions = [*resolutions0.tolist(), *resolutions1.tolist()]
resolutions.reverse()

# Added to the contact atom's vdW distance to form the `extra` margin that is
# passed to find_available_volume for the maximum volume.
tolerance = 0.5

# central group -> contact atoms whose vdW distance is looked up for the maximum volume
central_groups = {"H2O": ["H", "C"], "NO3": ["C"], "RC6H5": ["C"], "RC6F5": ["C"]}

# Here come the long calculations

In [None]:
# Extra margin added on top of the contact atom's vdW distance for the maximum volume.
tolerance = 0.5

# Set to True to redo the (slow) scan; this overwrites the existing results file.
rerun = False

if rerun:
    results_path = "../../results/binsize_for_volume.csv"

    # open file and write headers
    with open(results_path, 'w', newline='') as resultsFile:
        writer = csv.writer(resultsFile, delimiter=',')

        writer.writerow(['central', 'contact', 'atom', 'res', 'Vmax', 'V', 'comptime'])

    for res in resolutions:
        # Round once and use the same value everywhere. Previously Settings got
        # the rounded resolution while the volume calculation and the results
        # file got the raw float (e.g. 0.30000000000000004).
        resolution = round(res, 2)

        for cg, contact_atoms in central_groups.items():
            settings = Settings(WORKDIR, f"..\\data\\{cg}\\{cg}_CF_vdw.5\\{cg}_CF_vdw.5.csv")
            radii = Radii(settings.get_radii_csv_name())

            for atom in contact_atoms:

                # the timing covers loading the aligned data, averaging the
                # fragment and both volume calculations
                starttime = time.time()
                settings.set_resolution(resolution)

                df = pd.read_csv(settings.get_aligned_csv_filename(), header=0)
                avg_frag = calc_avg_frag(df, settings, radii)

                contact_group_radius = radii.get_vdw_distance_contact(atom)

                # Vmax: central group grown by tolerance + contact vdW distance
                volume_max = find_available_volume(avg_fragment=avg_frag, extra=(tolerance + contact_group_radius), total=True, resolution=resolution)
                # V: central group without any extra margin
                volume_central = find_available_volume(avg_fragment=avg_frag, extra=0, total=True, resolution=resolution)

                comptime = time.time() - starttime

                # save results; appending per row keeps finished rows on disk
                # if a later, slower resolution is interrupted
                with open(results_path, 'a', newline='') as resultsFile:
                    writer = csv.writer(resultsFile, delimiter=',')

                    writer.writerow([cg, 'CF', atom, resolution, volume_max, volume_central, comptime])

                print(cg, 'CF', atom, resolution, volume_max, volume_central, comptime)

## Plotting the results

In [None]:
# Load the results file written by the resolution scan
# (columns: central, contact, atom, res, Vmax, V, comptime).
df = pd.read_csv("../../results/binsize_for_volume.csv")

# reference values
# Volumes of the bare central groups; the V column is compared against these.
# NOTE(review): the origin of the reference numbers is not recorded in this
# notebook, and the unit is presumably cubic angstrom as on the plot axis - confirm.
real_water = 16.85
real_nitrate = 40.0872
real_aryl = 89.0991
real_pentafluoroaryl = 122.2819

# Reference values that the Vmax column (central group + tolerance + contact
# vdW distance) is compared against. Water has one value per contact atom
# (H and C); the other groups are only scanned with C.
real_water_H_max = 137.178
real_water_max = 232.971
real_nitrate_max = 365.532
real_aryl_max = 577.290
real_pentafluoroaryl_max = 728.153

# calculate the differences
# (reference for Vmax - V, used for the diff_perc column)
real_water_H_diff = real_water_H_max - real_water
real_water_C_diff = real_water_max - real_water
real_nitrate_diff = real_nitrate_max - real_nitrate
real_aryl_diff = real_aryl_max - real_aryl
real_pentafluoroaryl_diff = real_pentafluoroaryl_max - real_pentafluoroaryl

In [None]:
# show the results table as loaded from the CSV file
display(df)

In [None]:
# Binned volume of water against resolution: the bare molecule (V) and the
# molecule grown for a C or an H contact atom (Vmax).
fig, ax = plt.subplots(figsize=(8, 4))
ax.grid(True)
ax.set_title("Volume of $H_2O$ as central group, with different resolutions\n Contact group: CF")

is_water = df.central == "H2O"
h2o_df_max = df[is_water & (df.atom == "C")]
h2o_df_max_H = df[is_water & (df.atom == "H")]
# V is taken from the H rows, as before
h2o_df = df[is_water & (df.atom == "H")]

# (rows, column to plot, legend label, colour)
curves = (
    (h2o_df_max, "Vmax", "$H_2O$ + 0.5 + vdW 1.7", 'tab:blue'),      # h2o - c
    (h2o_df_max_H, "Vmax", "$H_2O$ + 0.5 + vdW 1.09", 'tab:green'),  # h2o - h
    (h2o_df, "V", "$H_2O$", 'tab:red'),                              # h2o
)

for rows, column, legend_text, colour in curves:
    ax.scatter(rows.res, rows[column], label=legend_text, color=colour)
    ax.plot(rows.res, rows[column], color=colour)

ax.set_xlabel("Resolution")
ax.set_ylabel(r'Volume ($\AA^3$)')
ax.legend(loc='upper right')

fig.savefig('../../results/plots/volumes.png', format='png')

In [None]:
# Percentage errors of the binned volumes with respect to the reference values:
#   percentage_error : Vmax      vs. the reference for the grown central group
#   per_err_central  : V         vs. the reference for the bare central group
#   diff_perc        : Vmax - V  vs. the difference of the two references
#
# Every error is an absolute percentage. The water rows used to be signed while
# all other groups were absolute, which mixed two conventions in one column.


def abs_percentage_error(measured, reference):
    """Return the absolute deviation of `measured` from `reference` in percent.

    Works element-wise when `measured` is a pandas Series.
    """
    return abs((measured - reference) / reference) * 100


is_water = df.central == "H2O"

# (row selector, reference for Vmax, reference for V, reference for Vmax - V)
references = [
    (is_water & (df.atom == "H"), real_water_H_max, real_water, real_water_H_diff),
    (is_water & (df.atom == "C"), real_water_max, real_water, real_water_C_diff),
    (df.central == "NO3", real_nitrate_max, real_nitrate, real_nitrate_diff),
    (df.central == "RC6H5", real_aryl_max, real_aryl, real_aryl_diff),
    (df.central == "RC6F5", real_pentafluoroaryl_max, real_pentafluoroaryl, real_pentafluoroaryl_diff),
]

# Rows that match no reference keep the placeholder value of 100. The columns
# are created as float so that writing the computed errors does not have to
# upcast an integer column.
df['percentage_error'] = 100.0
df['per_err_central'] = 100.0

################################# calculate difference

df['diff'] = df['Vmax'] - df['V']

for rows, ref_max, ref_central, ref_diff in references:
    df.loc[rows, 'percentage_error'] = abs_percentage_error(df.loc[rows, 'Vmax'], ref_max)
    df.loc[rows, 'per_err_central'] = abs_percentage_error(df.loc[rows, 'V'], ref_central)
    df.loc[rows, 'diff_perc'] = abs_percentage_error(df.loc[rows, 'diff'], ref_diff)

display(df)

In [None]:
# Left panel: percentage error of Vmax per central group.
# Right panel: computation time per central group on a logarithmic axis.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,4))
ax1.grid(True)
ax2.grid(True)
fig.suptitle("Volume central groups, contact group: CF")

h2o_df_max = df[(df.central == "H2O") & (df.atom == "C")]
h2o_df_max_H = df[(df.central == "H2O") & (df.atom == "H")]
no3 = df[(df.central == "NO3")]
rc6h5 = df[(df.central == "RC6H5")]
rc6f5 = df[(df.central == "RC6F5")]

# (rows, legend label, colour in the error panel, colour in the timing panel)
series = [
    (h2o_df_max, "$H_2O$-C", 'green', 'tab:green'),
    (h2o_df_max_H, "$H_2O$-H", 'blue', 'tab:blue'),
    (no3, "$NO_3^-$", 'red', 'tab:red'),
    (rc6h5, "$RC_6H_5$", 'orchid', 'tab:purple'),
    (rc6f5, "$RC_6F_5$", 'yellow', 'tab:orange'),
]

ax2.set_yscale('log')

for rows, name, error_colour, time_colour in series:
    # plot percental errors
    ax1.scatter(rows.res, rows.percentage_error, label=name, color=error_colour)
    ax1.plot(rows.res, rows.percentage_error, color=error_colour)

    # plot time; the label is derived from the same name as the error curve so
    # the two legends cannot disagree (the H2O/C timing curve used to be
    # labelled "$H_2O$-H time")
    ax2.scatter(rows.res, rows.comptime, color=time_colour, label=f"{name} time")
    ax2.plot(rows.res, rows.comptime, color=time_colour)

ax2.set_xlim(0, 0.1)
# vertical marker at resolution 0.03
ax2.vlines(0.03, 0, 100000, colors='black')

# horizontal markers at 3 % and 5 % error
ax1.hlines(3, 0, 1, colors='black')
ax1.hlines(5, 0, 1, colors='black')

ax1.set_title('Percentage error')
ax2.set_title('Computational time')
ax2.set_ylabel('Computational time (seconds)')
ax2.set_xlabel('resolution')

ax1.set_ylabel(r'Percentage')
ax1.set_xlabel("resolution")

ax1.legend()
ax2.legend()

fig.subplots_adjust(wspace=0.3)
fig.savefig('../../results/plots/volumes_error.png', format='png')

In [None]:
# Percentage error of the available volume (Vmax - V) per central group.
fig, ax = plt.subplots(figsize=(8,3.5))
fig.subplots_adjust(bottom=0.2)
ax.grid(True)
ax.set_title("Percentage error available volume\nTotal volume - central group volume, contact group CF")

h2o_df_max = df[(df.central == "H2O") & (df.atom == "C")]
h2o_df_max_H = df[(df.central == "H2O") & (df.atom == "H")]
no3 = df[(df.central == "NO3")]
rc6h5 = df[(df.central == "RC6H5")]
rc6f5 = df[(df.central == "RC6F5")]

# (rows, legend label, colour)
# Only the chemical formula is wrapped in $...$. With the whole label in math
# mode the spaces are dropped and "vdW" is set in italics, which made the
# NO3 / RC6H5 / RC6F5 entries render differently from the H2O ones.
series = [
    (h2o_df_max_H, "$H_2O$ + 0.5 + vdW 1.09 (H)", 'tab:blue'),
    (h2o_df_max, "$H_2O$ + 0.5 + vdW 1.7 (C)", 'tab:green'),
    (no3, "$NO_3^-$ + 0.5 + vdW 1.7 (C)", 'tab:red'),
    (rc6h5, "$RC_6H_5$ + 0.5 + vdW 1.7 (C)", 'tab:purple'),
    (rc6f5, "$RC_6F_5$ + 0.5 + vdW 1.7 (C)", 'tab:orange'),
]

# plot percental errors
for rows, legend_text, colour in series:
    ax.scatter(rows.res, rows.diff_perc, label=legend_text, color=colour)
    ax.plot(rows.res, rows.diff_perc, color=colour)

# dashed guides at 3 % and 5 % error
ax.hlines(3, 0, 1, colors='grey', linestyles='dashed')
ax.hlines(5, 0, 1, colors='grey', linestyles='dashed')

ax.set_ylim(0, 15)
ax.set_xlim(0, 0.5)

ax.set_ylabel('Percentage error')
ax.set_xlabel("Resolution")
ax.legend(loc='lower right')

fig.savefig('../../results/plots/volumes_error.svg', format='svg')

# Conclusion
We can use a resolution of 0.1 to estimate the available volume.

# Minor check for METHYL volume

In [None]:
# Volume of the average fragment stored for the "kmeans_ch3_test" model
# (no extra margin; find_available_volume runs with its default resolution).
# The path uses forward slashes, like the other result paths in this notebook,
# so it resolves on POSIX systems as well as on Windows.
avg_f = pd.read_csv("../../results/pairs/RCOMe/RCOMe_R2CO_vdw.5/RCOMe_R2CO_kmeans_ch3_test_avg_fragment.csv")
V_kmeans = find_available_volume(avg_fragment=avg_f, extra=0, total=True)
print(V_kmeans)

In [None]:
# Volume of the default average fragment file (referred to as the "20H model"
# in the comparison printout; no extra margin, default resolution).
# The path uses forward slashes, like the other result paths in this notebook,
# so it resolves on POSIX systems as well as on Windows.
avg_f = pd.read_csv("../../results/pairs/RCOMe/RCOMe_R2CO_vdw.5/RCOMe_R2CO_avg_fragment.csv")
V_20H = find_available_volume(avg_fragment=avg_f, extra=0, total=True)
print(V_20H)

In [None]:
# Volume of the average fragment stored for the "model_3H" variant
# (no extra margin; find_available_volume runs with its default resolution).
# The path uses forward slashes, like the other result paths in this notebook,
# so it resolves on POSIX systems as well as on Windows.
avg_f = pd.read_csv("../../results/pairs/RCOMe/RCOMe_R2CO_vdw.5/RCOMe_R2CO_avg_fragment_model_3H.csv")
V_3H = find_available_volume(avg_fragment=avg_f, extra=0, total=True)
print(V_3H)

In [None]:
def _percent_difference(volume, reference):
    """Return the absolute difference of `volume` from `reference` in percent."""
    return abs((volume - reference) / reference) * 100


# pairwise comparison of the three volumes computed above
print("Differences")
print(f"20H model vs 3H model {_percent_difference(V_20H, V_3H):.2f}%")
print(f"20H model vs kmeans model {_percent_difference(V_20H, V_kmeans):.2f}%")
print(f"3H model vs kmeans model {_percent_difference(V_3H, V_kmeans):.2f}%")