In [None]:
import sys

# Add the sibling `scripts` directory to the module search path. The path is
# relative, so it is resolved against the notebook's current working directory.
# NOTE(review): presumably this is where the `helpers`, `constants` and
# `classes` packages imported in the next cell live — confirm.
sys.path.append('..//scripts//')

In [None]:
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import os
import time
import numpy as np
import csv
import pandas as pd

from helpers.geometry_helpers import average_fragment

from helpers.density_helpers import count_points_per_square, prepare_df
from constants.paths import WORKDIR, RADII_CSV
from classes.Settings import Settings, Radii

In [None]:
# Table of available volumes. Later cells look rows up by the `central`,
# `contact` and `to_count` columns and read the `volume` column.
volumes_csv_path = '../../results/volumes_free.csv'
volumes_free = pd.read_csv(volumes_csv_path)

display(volumes_free)

In [None]:
def sample_df(amount, df, random_state=None):
    """Return `amount` randomly selected rows of `df`.

    Rows are drawn without replacement via `DataFrame.sample`; the input frame
    is not modified and the sampled rows keep their original index labels.

    Parameters
    ----------
    amount : int
        Number of rows to draw. Only an absolute row count is supported (not a
        fraction/percentage).
    df : pandas.DataFrame
        Frame to sample from.
    random_state : int, numpy Generator/RandomState or None, optional
        Seed or generator forwarded to `DataFrame.sample`. The default `None`
        keeps the original unseeded behaviour; pass a value to make a run
        reproducible.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the sampled rows.

    Raises
    ------
    ValueError
        Propagated from pandas, e.g. when `amount` exceeds the number of rows.
    """
    return df.sample(n=amount, random_state=random_state)

In [None]:
# --- Which contact pair is analysed ---------------------------------------
central_group = "H2O"
contact_group = "XH"
to_count = "O"

# --- Sampling experiment ---------------------------------------------------
# `runs` repetitions are performed for every sample size listed in `amounts`.
runs = 1000
amounts = np.arange(50000, 150000, 50000)

# --- Density / cluster parameters -----------------------------------------
# `resolution` enters the cluster volume as resolution**3 and `cluster_frac`
# scales the maximum normalised density to obtain the cluster threshold.
resolution = 0.50
cluster_frac = 0.25
volumes = volumes_free

# Input .cor file for this central/contact group combination.
datafile = f"..\\data\\{central_group}\\{central_group}_{contact_group}_vdw.5.cor"

# The Settings object supplies the file names (and HDF key) of the inputs below.
settings = Settings(WORKDIR, datafile)
settings.set_atom_to_count(to_count)
settings.set_resolution(round(resolution, 2))

# Load the three input tables used by the sampling loop.
df = pd.read_csv(settings.get_structure_csv_filename())
coordinate_df = pd.read_hdf(
    settings.get_coordinate_df_filename(),
    settings.get_coordinate_df_key(),
)
aligned_fragments_df = pd.read_csv(settings.get_aligned_csv_filename())

# HDF file caching the per-run density grids. The name is derived from the
# configured groups so that a cache built for one combination is never reused
# for another (with the current settings this is "compression_test_H2O_XH_O.hdf").
density_store = "compression_test_" + central_group + "_" + contact_group + "_" + to_count + ".hdf"

# The available volume does not depend on the sample, so look it up once.
# `.item()` raises ValueError unless exactly one row matches, so a missing or
# duplicated entry in the volumes table is reported before any work is done.
Vavailable = volumes.loc[
    (volumes.central == central_group)
    & (volumes.contact == contact_group)
    & (volumes.to_count == to_count),
    'volume',
].item()

for amount in amounts:

    for i, run in enumerate(range(runs)):
        # Draw a fresh random subset of structures for this run.
        sampled_df = sample_df(amount, df)

        # NOTE(review): assumes the row index of `df` matches `fragment_id`
        # in the coordinate/aligned tables — confirm.
        structure_indices = sampled_df.index.to_list()

        # select rows from dfs
        coordinate_sampled = coordinate_df[coordinate_df.fragment_id.isin(structure_indices)]
        aligned_sampled = aligned_fragments_df[
            (aligned_fragments_df.fragment_id.isin(structure_indices))
            & (aligned_fragments_df.label != "-")
        ]

        # make radii object to get vdw radii
        radii = Radii(RADII_CSV)

        # calc new avg fragment
        # NOTE(review): `fragment` is not used further down; the call is kept
        # in case average_fragment has side effects — confirm and drop if not.
        fragment = average_fragment(aligned_sampled, settings, radii)

        key = "resolution" + str(resolution).replace('.', '') + "compression" + str(amount) + "i" + str(i)

        # Probe the cache. Only the read itself is guarded, so errors raised
        # while computing a new density are never mistaken for a cache miss.
        try:
            pd.read_hdf(density_store, key=key)
        except (FileNotFoundError, KeyError):
            empty_density_df = prepare_df(df=coordinate_sampled, settings=settings)
            density_df = count_points_per_square(df=empty_density_df, contact_points_df=coordinate_sampled, settings=settings)
            # `key` must be passed by keyword: positional use is deprecated in pandas.
            density_df.to_hdf(density_store, key=key)
        else:
            # A cached density exists for this key: skip the run, so no
            # directionality row is written for it.
            # NOTE(review): presumably the run was already recorded in
            # results.csv during an earlier session — confirm.
            print("Density df already existed, loaded from file")
            continue

        # Fraction of all counted contacts that falls in each grid cell.
        density_df['datafrac_normalized'] = density_df[to_count] / density_df[to_count].sum()

        # Cells at or above `cluster_frac` of the densest cell form the cluster.
        threshold = density_df.datafrac_normalized.max() * cluster_frac
        in_cluster = density_df[density_df.datafrac_normalized >= threshold]

        datafrac = in_cluster.datafrac_normalized.sum()
        Vcluster = len(in_cluster) * resolution**3

        directionality = datafrac / Vcluster * Vavailable

        # Append immediately so finished runs survive an interrupted session.
        with open('results.csv', 'a', newline="") as resultsfile:
            writer = csv.writer(resultsfile)
            writer.writerow([amount, directionality])

        print(i, run, directionality)

# Check CI intervals

In [None]:
# The results file is written without a header row, so name the columns here.
df = pd.read_csv('results.csv', header=None)
df.columns = ["datapoints", "directionality"]

# For these two sample sizes keep only the first 1000 rows and drop the rest.
# NOTE(review): presumably these sizes were run more often than the others —
# confirm.
for capped_amount in (350000, 400000):
    surplus_rows = df.loc[df.datapoints == capped_amount].iloc[1000:].index
    df = df.drop(index=surplus_rows)

display(df)

In [None]:
# Per sample size: number of runs, standard deviation and mean of the
# directionality. The column is selected explicitly so the result does not
# depend on how many other columns `df` happens to contain.
stats = (
    df.groupby("datapoints")["directionality"]
    .agg(["count", "std", "mean"])
    .rename(columns={"mean": "average"})
    .reset_index()
)

# Normal-approximation confidence intervals for the mean:
#   average +/- z * std / sqrt(count)
# with z = 1.96 for the 95% band and z = 2.58 for the 99% band. Computed as
# column arithmetic instead of a row-by-row loop.
root_n = np.sqrt(stats["count"])
for level, z in (("ci95", 1.96), ("ci99", 2.58)):
    half_width = z * stats["std"] / root_n
    stats[level + "_hi"] = stats["average"] + half_width
    stats[level + "_lo"] = stats["average"] - half_width

stats

In [None]:
# Mean directionality per sample size with symmetric 99% CI error bars.
fig, ax = plt.subplots(figsize=(6,3))
fig.subplots_adjust(bottom=0.17)  # leave room for the x-axis label

ax.set_title("Compression algorithm 99% confidence intervals")
ax.grid(True)
ax.set_xlabel("Amount of random datapoints (*10000)")
ax.set_ylabel("Directionality")

# The interval is symmetric, so the distance to the lower bound is the half-width.
ax.errorbar(stats.datapoints, stats.average, fmt='o', yerr=(stats.average - stats.ci99_lo), color='rebeccapurple', capsize=5)

# Tick labels are in units of 10000 datapoints, matching the x-axis label.
labels = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
ax.set_xticks([x*10000 for x in labels])
ax.set_xticklabels(labels=labels)

# Save through the figure object *before* showing it: on some backends the
# figure is closed by show(), after which pyplot would save an empty canvas.
# NOTE(review): the file name ends in "_H" although the counted atom
# configured earlier in the notebook is "O" — confirm the intended name.
fig.savefig('../../results/CI_compression_H2O_XH_H.png')
plt.show()

In [None]:
# Mean directionality per sample size with shaded 95% and 99% confidence bands.
fig, ax = plt.subplots()

ax.set_title("Compression algorithm 99% confidence intervals")
ax.grid(True)
ax.set_xlabel("Amount of random datapoints (*10000)")
ax.set_ylabel("Directionality")

ax.scatter(stats.datapoints, stats.average, color="rebeccapurple")
ax.plot(stats.datapoints, stats.average, color="rebeccapurple")

# Each band is labelled with the interval it actually shows (the legend
# labels were previously swapped between the two bands).
ax.fill_between(stats.datapoints, stats.ci95_lo, stats.ci95_hi, color='purple', alpha=.2, label="95% CI")
ax.fill_between(stats.datapoints, stats.ci99_lo, stats.ci99_hi, color='yellow', alpha=.2, label="99% CI")

# Tick labels are in units of 10000 datapoints, matching the x-axis label.
labels = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
ax.set_xticks([x*10000 for x in labels])
ax.set_xticklabels(labels=labels)

ax.legend()

# Save through the figure object *before* showing it: on some backends the
# figure is closed by show(), after which pyplot would save an empty canvas.
# NOTE(review): the file name ends in "_H" although the counted atom
# configured earlier in the notebook is "O" — confirm the intended name.
fig.savefig('../../results/CI_compression_filled_H2O_XH_H.png')
plt.show()
