In [None]:
# automatically reload imported modules, which makes it unnecessary to restart the kernel
# whenever a function is changed
%load_ext autoreload
%autoreload 2

# render matplotlib figures as interactive plots inside the notebook
%matplotlib notebook

import os
import time
import csv
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import sys

# so we can import from the scripts folder, although it is not a subdirectory of this notebook's folder
sys.path.append('..//scripts//')

from helpers.density_helpers import count_points_per_square, prepare_df
from constants.paths import WORKDIR
from classes.Settings import Settings
from classes.Radii import Radii

from calc_avg_fragment import calc_avg_frag

In [None]:
def sample_df(amount, df, random_state=None):
    """Return `amount` randomly chosen rows of `df`, drawn without replacement.

    Parameters
    ----------
    amount : int
        Number of rows to draw. Only an absolute count is supported, not a
        fraction of the frame.
    df : pandas.DataFrame
        Frame to sample from; it is left unmodified.
    random_state : optional
        Forwarded unchanged to ``DataFrame.sample``. Pass a fixed seed to make
        the draw reproducible; the default (``None``) keeps the original
        non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, carrying their original index labels.

    Raises
    ------
    ValueError
        Raised by pandas when `amount` exceeds the number of rows in `df`.
    """
    sampled = df.sample(n=amount, random_state=random_state)

    # sanity check: the caller relies on getting exactly `amount` rows back
    assert len(sampled) == amount, "Sampling went wrong"

    return sampled

In [None]:
# Table with one 'volume' value per (central, contact, contact_rp) combination;
# compression() looks up the available volume in it.
volumes = pd.read_csv('../../results/volumes_free.csv')

def compression(central, contact, contact_rp, amounts, resolution, runs):
    """Compute the directionality for random subsets of the structures and log the results.

    For every run and every sample size in `amounts`, a random subset of the
    structures is drawn, a density grid is filled with the contact points of
    that subset and the bins whose normalised count reaches at least
    ``cluster_frac`` (10 %) of the fullest bin are taken as the cluster. The
    directionality is then ``datafrac / Vcluster * (Vavailable / 2)``. One row
    per (run, amount) is appended to ``../../results/compression_results.csv``;
    a header row is written first when that file does not exist yet or is
    empty, so that it can be read back with ``header=0``.

    Parameters
    ----------
    central : str
        Name of the central group; used in the data file path and in the
        volume lookup.
    contact : str
        Name of the contact group; used in the data file path and in the
        volume lookup.
    contact_rp : str
        Contact reference point. Passed to ``Settings``, used as the count
        column of the density frame and used in the volume lookup.
    amounts : iterable of int
        Sample sizes to evaluate.
    resolution : float
        Grid resolution. It is rounded to two decimals before it is handed to
        ``Settings``; the value as passed in is used for the bin volume and is
        the one written to the results file.
    runs : int
        Number of repetitions per sample size.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``volumes`` does not hold exactly one row for the requested
        central/contact/contact_rp combination.
    AssertionError
        If the sampled structures do not map to exactly `amount` rows of the
        coordinate frame.
    """
    cluster_frac = 0.10
    results_path = '../../results/compression_results.csv'

    # Header for a freshly created results file. The analysis cells read the
    # columns 'central', 'resolution', 'amount' and 'directionality' by name.
    result_columns = ["central", "contact", "contact_rp", "resolution", "cluster_frac", "run",
                      "amount", "datafrac", "Vcluster", "Vavailable", "directionality"]

    # The available volume does not depend on the sample, so look it up once
    # and fail before any expensive work when it is missing or ambiguous.
    volume_match = volumes.loc[(volumes.central == central) & (volumes.contact == contact) & (volumes.contact_rp == contact_rp), 'volume']
    if len(volume_match) != 1:
        raise ValueError(
            f"Expected exactly one volume for {central}/{contact}/{contact_rp}, found {len(volume_match)}"
        )
    Vavailable = volume_match.item()

    datafile = "..\\data\\" + central + "\\" + central + "_" + contact + "_vdw.5.cor"

    settings = Settings(WORKDIR, datafile)
    settings.set_contact_reference_point(contact_rp)
    settings.set_resolution(round(resolution, 2))

    df = pd.read_csv(settings.get_structure_csv_filename())

    print(f"Structures {len(df)}")
    coordinate_df = pd.read_hdf(settings.get_coordinate_df_filename(), settings.get_coordinate_df_key())

    display(coordinate_df)

    display(df)

    # keep only the structures whose index label occurs as a fragment_id in the coordinate frame
    df = df[df.index.isin(list(coordinate_df.fragment_id))]

    print(f"Coordinate df {len(coordinate_df)}")
    aligned_fragments_df = pd.read_csv(settings.get_aligned_csv_filename())

    for run in range(runs):
        for amount in amounts:
            print(f"Amount: {amount}, Run: {run}")

            # grab random structures
            sampled_df = sample_df(amount, df)
            structure_indices = sampled_df.index.to_list()

            # select the matching rows from the coordinate and aligned-fragment frames
            coordinate_sampled = coordinate_df[coordinate_df.fragment_id.isin(structure_indices)]

            assert len(coordinate_sampled) == amount, "Sampling went wrong" + str(len(coordinate_sampled)) + " " + str(amount)
            aligned_sampled = aligned_fragments_df[(aligned_fragments_df.fragment_id.isin(structure_indices)) & (aligned_fragments_df.label != "-")]

            # make radii object to get vdw radii
            # NOTE(review): rebuilt on every iteration as in the original; it could be
            # hoisted out of the loops if calc_avg_frag does not modify it - confirm.
            radii = Radii(settings.get_radii_csv_name())

            # calc new avg fragment; the return value is not used in this function
            # NOTE(review): presumably called for its side effects - confirm.
            calc_avg_frag(aligned_sampled, settings, radii)

            empty_density_df = prepare_df(df=coordinate_sampled, settings=settings)
            print(f"Amount in empty density df {empty_density_df[contact_rp].sum()}")
            density_df = count_points_per_square(df=empty_density_df, contact_points_df=coordinate_sampled, settings=settings)

            # fraction of all counted contact points that falls in each bin
            density_df['datafrac_normalized'] = density_df[contact_rp] / density_df[contact_rp].sum()

            print(f"Normalized datafrac: {density_df['datafrac_normalized'].sum()}")
            print(f"Amount in density df {density_df[contact_rp].sum()}")

            # a bin belongs to the cluster when it holds at least cluster_frac of the fullest bin
            threshold = density_df.datafrac_normalized.max() * cluster_frac
            in_cluster = density_df[density_df.datafrac_normalized >= threshold]

            datafrac = in_cluster.datafrac_normalized.sum()
            Vcluster = len(in_cluster) * resolution**3

            directionality = datafrac / Vcluster * (Vavailable/2)

            # write the header only once, when the file is new or still empty
            needs_header = not os.path.isfile(results_path) or os.path.getsize(results_path) == 0
            with open(results_path, 'a', newline="") as resultsfile:
                writer = csv.writer(resultsfile)
                if needs_header:
                    writer.writerow(result_columns)
                writer.writerow([central, contact, contact_rp, resolution, cluster_frac, run, amount, datafrac, Vcluster, Vavailable, directionality])

            print(central, contact, contact_rp, resolution, cluster_frac, run, amount, directionality)
            print('\n')

In [None]:
# The sampling runs below are only executed when this flag is True.
run_again = True

# first system
central1 = "H2O" # 559303 structures
contact1 = "XH"
contact_rp1 = "O" # 460377   (alternative reference point: "H")

# second system
central2 = "RC6H5" # 445710 structures
contact2 = "CCH3"
contact_rp2 = "H"

# sample sizes to evaluate: a few small ones, then 1000 ... 10000 in steps of 1000
# (larger sizes 15000, 20000, 30000, 40000, 50000, 100000, 150000, 200000,
#  250000, 300000, 350000 and 400000 are currently disabled)
amounts = [50, 100, 250, 500, 750] + list(range(1000, 10001, 1000))

if run_again:
    system1 = (central1, contact1, contact_rp1)
    system2 = (central2, contact2, contact_rp2)

    # (system, resolutions) in the order in which the runs are executed
    schedule = [
        (system1, (0.3, 0.4)),
        (system2, (0.25, 0.35, 0.45)),
        (system1, (0.25, 0.35, 0.45)),
    ]

    for (sys_central, sys_contact, sys_contact_rp), resolutions in schedule:
        for grid_resolution in resolutions:
            compression(sys_central, sys_contact, sys_contact_rp, amounts, resolution=grid_resolution, runs=50)

# Check CI intervals

In [None]:
# Reference directionality values. They serve as the baseline when the
# "percental" (percentage difference) columns are computed for the sampled runs.
# Naming: tru_<central>_<contact>_<reference point>_<resolution>, where the
# suffix 02/03/04/05 stands for a grid resolution of 0.2/0.3/0.4/0.5.
# NOTE(review): presumably obtained from the complete, unsampled dataset - the
# notebook does not show how; confirm and record the provenance.

# H2O central group
tru_h2o_xh_o_02 = 11.032713274285268
tru_h2o_xh_o_03 = 7.842835714757644
tru_h2o_xh_o_04 = 7.231629649434552
tru_h2o_xh_o_05 = 5.058302741738474

# RC6H5 central group
# NOTE(review): unlike the H2O values, this series is not monotonic in the
# resolution (the 0.3 value is the largest) - verify the numbers.
tru_rc6h5_cch3_h_02 = 2.174874888230874
tru_rc6h5_cch3_h_03 = 3.772238599939169
tru_rc6h5_cch3_h_04 = 1.1728635272529497
tru_rc6h5_cch3_h_05 = 1.6245673292050886

In [None]:
# Load the rows that compression() appended to the results file; the first
# line of the CSV is interpreted as the column header.
df = pd.read_csv('../../results/compression_results.csv', header=0)

# Show the loaded results for a visual check.
display(df)

In [None]:
def _select_runs(central, resolution):
    """Return a copy of the result rows for one central group at one grid resolution.

    NOTE(review): rows are selected on `central` and `resolution` only; the
    contact and the contact reference point are not part of the filter. The
    resolution is compared with exact float equality.
    """
    wanted = (df.central == central) & (df.resolution == resolution)
    return df[wanted].copy()


# RC6H5 runs, one frame per resolution
rc6h5_05 = _select_runs("RC6H5", 0.5)
rc6h5_04 = _select_runs("RC6H5", 0.4)
rc6h5_03 = _select_runs("RC6H5", 0.3)
rc6h5_02 = _select_runs("RC6H5", 0.2)

# H2O runs, one frame per resolution
h2o_05 = _select_runs("H2O", 0.5)
h2o_04 = _select_runs("H2O", 0.4)
h2o_03 = _select_runs("H2O", 0.3)
h2o_02 = _select_runs("H2O", 0.2)

In [None]:
def _directionality_stats(runs_df, true_value):
    """Summarise the directionality per sample size and compare it with a reference value.

    Parameters
    ----------
    runs_df : pandas.DataFrame
        Result rows with at least the columns 'amount' and 'directionality'.
    true_value : float
        Reference directionality that the per-amount average is compared with.

    Returns
    -------
    pandas.DataFrame
        One row per amount with the columns 'amount', 'count', 'std',
        'average' and 'percental', where 'percental' is the absolute
        difference between the average and `true_value`, as a percentage of
        `true_value`.
    """
    per_amount = runs_df.groupby("amount")["directionality"].agg(["count", "std", "mean"])
    summary = per_amount.reset_index().rename(columns={"mean": "average"})
    summary["percental"] = abs(summary.average - true_value) / true_value * 100
    return summary


# RC6H5, one summary per resolution
stats_rc6h5_02 = _directionality_stats(rc6h5_02, tru_rc6h5_cch3_h_02)
stats_rc6h5_03 = _directionality_stats(rc6h5_03, tru_rc6h5_cch3_h_03)
stats_rc6h5_04 = _directionality_stats(rc6h5_04, tru_rc6h5_cch3_h_04)
stats_rc6h5_05 = _directionality_stats(rc6h5_05, tru_rc6h5_cch3_h_05)

# H2O, one summary per resolution
stats_h2o_02 = _directionality_stats(h2o_02, tru_h2o_xh_o_02)
stats_h2o_03 = _directionality_stats(h2o_03, tru_h2o_xh_o_03)
stats_h2o_04 = _directionality_stats(h2o_04, tru_h2o_xh_o_04)
stats_h2o_05 = _directionality_stats(h2o_05, tru_h2o_xh_o_05)

In [None]:
# Percentage difference between the mean directionality and its reference
# value, plotted against the sample size for resolutions 0.5 and 0.2.
fig, ax = plt.subplots(figsize=(10,3))
fig.subplots_adjust(bottom=0.17)

ax.set_title("Compression algorithm")
ax.grid(True)
ax.set_xlabel("Amount of random datapoints")
ax.set_ylabel("Directionality - percental difference")

# (statistics, colour, legend label) in drawing order
percental_series = [
    (stats_rc6h5_05, 'rebeccapurple', "RC6H5-CCH3(H), res: 0.5"),
    (stats_h2o_05, 'blue', "H2O-OH(O), res: 0.5"),
    (stats_rc6h5_02, 'green', "RC6H5-CCH3(H), res: 0.2"),
    (stats_h2o_02, 'red', "H2O-OH(O), res: 0.2"),
]

# markers carry the legend entry, the line only connects them
for series_stats, series_color, series_label in percental_series:
    ax.scatter(series_stats.amount, series_stats.percental, color=series_color, label=series_label)
    ax.plot(series_stats.amount, series_stats.percental, color=series_color)

ax.legend()

plt.show()
# plt.savefig('../../results/CI_compression_H2O_XH_H.png')


In [None]:
# Percentage difference between the mean directionality and its reference
# value for resolutions 0.5 and 0.2, on a fixed tick grid with the x axis
# limited to 0 - 100000 data points.
fig, ax = plt.subplots(figsize=(8,3))
fig.subplots_adjust(bottom=0.17)

ax.set_title("Compression algorithm")
ax.set_xlabel("Amount of random datapoints")
ax.set_ylabel("Directionality - percental difference")

# (statistics, colour, legend label) in drawing order
percental_series = [
    (stats_rc6h5_05, 'rebeccapurple', "RC6H5-CCH3(H), res: 0.5"),
    (stats_h2o_05, 'blue', "H2O-OH(O), res: 0.5"),
    (stats_rc6h5_02, 'green', "RC6H5-CCH3(H), res: 0.2"),
    (stats_h2o_02, 'red', "H2O-OH(O), res: 0.2"),
]

# markers carry the legend entry, the line only connects them
for series_stats, series_color, series_label in percental_series:
    ax.scatter(series_stats.amount, series_stats.percental, color=series_color, label=series_label)
    ax.plot(series_stats.amount, series_stats.percental, color=series_color)

# explicit tick positions so that major and minor grid lines can be styled separately
major_ticks_x = np.arange(0, 500000, 10000)
minor_ticks_x = np.arange(0, 500000, 5000)

major_ticks_y = np.arange(0, 101, 10)
minor_ticks_y = np.arange(0, 101, 5)

ax.set_xticks(major_ticks_x)
ax.set_xticks(minor_ticks_x, minor=True)
ax.set_yticks(major_ticks_y)
ax.set_yticks(minor_ticks_y, minor=True)

# fainter grid for the minor ticks than for the major ticks
ax.grid(which='minor', alpha=0.2)
ax.grid(which='major', alpha=0.5)

ax.set_ylim(-2, 40)
ax.set_xlim(0, 100000)

ax.legend()

# vertical marker at 5000 data points
ax.vlines(5000, 0, 50, color="black")

# Save through the figure object and before showing it: on backends that
# release the figure in show(), saving afterwards writes an empty image.
# The file name is specific to this plot because later cells write
# '../../results/compression.png' and would overwrite it.
fig.savefig('../../results/compression_percental_overview.png')
plt.show()


In [None]:
# Percentage difference between the mean directionality and its reference
# value for all four RC6H5 resolutions and for H2O at 0.2 and 0.5, with the
# x axis limited to 0 - 10000 data points.
fig, ax = plt.subplots(figsize=(8,3))
fig.subplots_adjust(bottom=0.17)

ax.set_title("Compression algorithm")
ax.set_xlabel("Amount of random datapoints")
ax.set_ylabel("Directionality - percental difference")

# (statistics, colour, legend label) in drawing order
percental_series = [
    (stats_rc6h5_02, 'green', "RC6H5-CCH3(H), res: 0.2"),
    (stats_rc6h5_03, 'cyan', "RC6H5-CCH3(H), res: 0.3"),
    (stats_rc6h5_04, 'yellow', "RC6H5-CCH3(H), res: 0.4"),
    (stats_rc6h5_05, 'rebeccapurple', "RC6H5-CCH3(H), res: 0.5"),
    (stats_h2o_02, 'red', "H2O-OH(O), res: 0.2"),
    (stats_h2o_05, 'blue', "H2O-OH(O), res: 0.5"),
]

# markers carry the legend entry, the line only connects them
for series_stats, series_color, series_label in percental_series:
    ax.scatter(series_stats.amount, series_stats.percental, color=series_color, label=series_label)
    ax.plot(series_stats.amount, series_stats.percental, color=series_color)

# tick positions are left to matplotlib here; only the grid style is set,
# fainter for the minor ticks than for the major ticks
ax.grid(which='minor', alpha=0.2)
ax.grid(which='major', alpha=0.5)

ax.set_ylim(-2, 100)
ax.set_xlim(0, 10000)

ax.legend(loc='upper right')

# vertical marker at 5000 data points (drawn from y = 0 to y = 50)
ax.vlines(5000, 0, 50, color="black")

# Save through the figure object and before showing it: on backends that
# release the figure in show(), saving afterwards writes an empty image.
# The file name is specific to this plot because a later cell writes
# '../../results/compression.png' and would overwrite it.
fig.savefig('../../results/compression_percental.png')
plt.show()


In [None]:
# Display the per-amount statistics of the RC6H5 runs at resolution 0.4.
stats_rc6h5_04


In [None]:
# Mean directionality (absolute value, not the difference to the reference)
# against the sample size, with the x axis limited to 0 - 10000 data points.
fig, ax = plt.subplots(figsize=(8,3))
fig.subplots_adjust(bottom=0.17)

ax.set_title("Compression algorithm")
ax.set_xlabel("Amount of random datapoints")
ax.set_ylabel("Directionality - absolute score")

# (statistics, colour, legend label) in drawing order;
# H2O at resolutions 0.3 and 0.4 is not drawn
average_series = [
    (stats_rc6h5_05, 'rebeccapurple', "RC6H5-CCH3(H), res: 0.5"),
    (stats_rc6h5_04, 'yellow', "RC6H5-CCH3(H), res: 0.4"),
    (stats_rc6h5_03, 'cyan', "RC6H5-CCH3(H), res: 0.3"),
    (stats_rc6h5_02, 'green', "RC6H5-CCH3(H), res: 0.2"),
    (stats_h2o_05, 'blue', "H2O-OH(O), res: 0.5"),
    (stats_h2o_02, 'red', "H2O-OH(O), res: 0.2"),
]

# markers carry the legend entry, the line only connects them
for series_stats, series_color, series_label in average_series:
    ax.scatter(series_stats.amount, series_stats.average, color=series_color, label=series_label)
    ax.plot(series_stats.amount, series_stats.average, color=series_color)

# tick positions are left to matplotlib here; only the grid style is set,
# fainter for the minor ticks than for the major ticks
ax.grid(which='minor', alpha=0.2)
ax.grid(which='major', alpha=0.5)

ax.set_ylim(-2, 50)
ax.set_xlim(0, 10000)

ax.legend(loc='upper right')

# vertical marker at 5000 data points
ax.vlines(5000, 0, 50, color="black")

# Save through the figure object and before showing it: on backends that
# release the figure in show(), saving afterwards writes an empty image.
fig.savefig('../../results/compression.png')
plt.show()
