# Picking co-dependent variables
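This notebook investigates how the resolution and the threshold (cluster fraction) depend on each other: how the directionality changes with each of them, and how much of the data ends up in the cluster for a given threshold.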

In [None]:
# allows for automatic reloading of imports, which makes it unnecessary to restart the kernel
# whenever a function is changed
%load_ext autoreload
%autoreload 2

# show matplotlib plots as interactive figures inside the notebook
%matplotlib notebook

import sys
import csv
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# so we can import scripts from the scripts folder, although it is not a child directory
sys.path.append('..//scripts//')

from helpers.density_helpers import find_available_volume, prepare_df
from classes.Settings import Settings
from classes.Radii import Radii

from constants.paths import WORKDIR

## Below we define the contact pairs

In [None]:
# central groups that every contact below is combined with
central_groups = [
    "H2O",
    "ArCI",
    "REt",
    "RNO2",
    "RCOMe",
    "NO3",
    "RC6F5",
    "RC6H5",
]

# each contact is a (contact group, reference point) pair; a group may occur
# more than once with a different reference point (e.g. XH with H and with O)
contact_pairs = [
    ("ArCH", "H"),
    ("C2CH2", "H"),
    ("CCH3", "H"),
    ("CF", "F"),
    ("R2CO", "O"),
    ("RC6H5", "centroid"),
    ("RCN", "N"),
    ("XH", "H"),
    ("XH", "O"),
]

# parallel lists, kept because the rest of the notebook zips them together
contact_groups = [group for group, _ in contact_pairs]
contact_rps = [reference_point for _, reference_point in contact_pairs]

In [None]:
def calc_directionality(contact_rp, contact_group, central_group, cluster_frac, resolution, volumes):
    """Calculate the directionality of one contact pair for a given resolution and threshold.

    The pre-computed density DataFrame of the contact pair is read from the HDF file
    reported by the Settings object. The bins whose normalized density is at least
    ``cluster_frac`` times the maximum normalized density form the cluster.

    Parameters
    ----------
    contact_rp : str
        Contact reference point; also the name of the density column that is used.
    contact_group : str
        Name of the contact group (used in the datafile name and the volume lookup).
    central_group : str
        Name of the central group (used in the datafile name and the volume lookup).
    cluster_frac : float
        Fraction of the maximum normalized density that is used as threshold.
    resolution : float
        Bin edge length; every bin contributes ``resolution**3`` to the cluster volume.
    volumes : pandas.DataFrame
        Table with the columns ``central``, ``contact``, ``contact_rp`` and ``volume``.

    Returns
    -------
    float
        ``datafrac / Vcluster * (Vavailable / 2)``, where ``datafrac`` is the summed
        normalized density inside the cluster.

    Raises
    ------
    ValueError
        If ``volumes`` does not contain exactly one row for the contact pair.
    """

    # get the datafile and make a settings object
    # (the work directory is two levels up; the backslash is escaped explicitly
    # instead of relying on an invalid escape sequence)
    datafile = "..\\data\\" + central_group + "\\" + central_group + "_" + contact_group + "_vdw.5.cor"
    settings = Settings('..\\..', datafile)
    settings.set_contact_reference_point(contact_rp)
    settings.set_resolution(resolution)

    # grab the calculated density df and normalize it so that all bins sum to 1
    density_df = pd.read_hdf(settings.get_density_df_filename(), settings.get_density_df_key())
    density_df['datafrac_normalized'] = density_df[contact_rp] / density_df[contact_rp].sum()

    # calculate threshold and the bins that are in the cluster according to that threshold
    threshold = density_df.datafrac_normalized.max() * cluster_frac
    in_cluster = density_df[density_df.datafrac_normalized >= threshold]
    datafrac = in_cluster.datafrac_normalized.sum()
    Vcluster = len(in_cluster) * resolution**3

    # get the available volume of this contact pair; .item() requires exactly one match
    is_pair = (
        (volumes.central == central_group)
        & (volumes.contact == contact_group)
        & (volumes.contact_rp == contact_rp)
    )
    Vavailable = volumes.loc[is_pair, 'volume'].item()

    # density inside the cluster relative to half of the available volume
    directionality = datafrac / Vcluster * (Vavailable/2)

    return directionality


In [None]:
def points_in_cluster(contact_rp, contact_group, central_group, cluster_frac, resolution):
    """Calculate how much of the data lies in the cluster for a given cluster_frac.

    The cluster consists of the bins whose normalized density is at least
    ``cluster_frac`` times the maximum normalized density.

    Parameters
    ----------
    contact_rp : str
        Contact reference point; also the name of the density column that is used.
    contact_group : str
        Name of the contact group (used in the datafile name).
    central_group : str
        Name of the central group (used in the datafile name).
    cluster_frac : float
        Fraction of the maximum normalized density that is used as threshold.
    resolution : float
        Resolution that is passed on to the Settings object.

    Returns
    -------
    tuple
        ``(fraction, amount)``: the summed normalized density inside the cluster,
        followed by the summed un-normalized ``contact_rp`` column inside the cluster.
    """

    # get the datafile and make a settings object
    # (the work directory is two levels up; the backslash is escaped explicitly
    # instead of relying on an invalid escape sequence)
    datafile = "..\\data\\" + central_group + "\\" + central_group + "_" + contact_group + "_vdw.5.cor"
    settings = Settings('..\\..', datafile)
    settings.set_contact_reference_point(contact_rp)
    settings.set_resolution(resolution)

    # grab the calculated density df and normalize it so that all bins sum to 1
    density_df = pd.read_hdf(settings.get_density_df_filename(), settings.get_density_df_key())
    density_df['datafrac_normalized'] = density_df[contact_rp] / density_df[contact_rp].sum()

    # calculate the threshold and the bins that are in the cluster
    threshold = density_df.datafrac_normalized.max() * cluster_frac
    in_cluster = density_df[density_df.datafrac_normalized >= threshold]

    # fraction first, raw amount second
    return in_cluster.datafrac_normalized.sum(), in_cluster[contact_rp].sum()


In [None]:
# load the two volume tables; one of them is handed to calc_directionality as `volumes`
volumes_free, volumes_total = (
    pd.read_csv(volume_csv)
    for volume_csv in (
        '../../results/volumes_free.csv',
        '../../results/volumes_total.csv',
    )
)

# Directionality ~ Resolution

In [None]:
# make a folder specifically for this purpose: it creates a lot of plots
# exist_ok avoids the check-then-create race, and makedirs also creates a missing parent folder
os.makedirs('../../results/dependencies/', exist_ok=True)

In [None]:
# run it for different thresholds and resolutions
cluster_fracs = [0.10,0.25,0.40]
resolutions = np.arange(0.1, 1.55, 0.05)

# set to False to reuse an existing results file instead of recalculating everything
again = True

results_path = '../../results/directionality_dependency_resolution.csv'

if again or not os.path.exists(results_path):
    # (re)calculate: the file is opened once and receives the header plus one row per combination
    with open(results_path, 'w', newline="") as resultsfile:
        csvwriter = csv.writer(resultsfile)
        csvwriter.writerow(["central", "contact", "contact_rp", "frac", "res", "directionality"])

        # loop over the cluster fracs
        for frac in cluster_fracs:

            # calculate for each pair
            for central_group in central_groups:
                for contact_rp, contact_group in zip(contact_rps, contact_groups):

                    # for each resolution; np.arange accumulates float error, hence the rounding
                    for resolution in resolutions:
                        directionality = calc_directionality(contact_rp, contact_group, central_group, frac, round(resolution, 2), volumes_free)
                        csvwriter.writerow([central_group, contact_group, contact_rp, round(frac, 2), round(resolution, 2), directionality])
else:
    # results already exist and no recalculation was requested; the first row is the header
    df = pd.read_csv(results_path, header=0)

In [None]:
# display result
df = pd.read_csv('../../results/directionality_dependency_resolution.csv', header=0)
display(df)

### plot all the results

In [None]:
# one figure per (cluster frac, central group), with one line per contact pair
for frac in cluster_fracs:
    for central in central_groups:
        fig, ax = plt.subplots(figsize=(8,4))
        fig.subplots_adjust(bottom=0.2)
        ax.grid(True)
        ax.set_title(f"Dependency of directionality on resolution {central}")

        for contact_rp, contact in zip(contact_rps, contact_groups):
            partdf = df[(df.frac == frac) & (df.central == central) & (df.contact == contact) & (df.contact_rp == contact_rp)]

            # line and markers for the same data; only the markers carry the legend label
            ax.plot(partdf.res, partdf.directionality)
            ax.scatter(partdf.res, partdf.directionality, label=contact + "-" + contact_rp)

        ax.set_ylabel("Directionality")
        ax.set_xlabel("Resolution")

        # only the resolutions up to 0.8 are shown (see 'maxres08' in the filename)
        ax.set_xlim(0.09, 0.81)
        ax.legend(loc='upper right')

        # save before showing: some backends release the figure on show(), which would save an empty image
        fig.savefig(f"../../results/dependencies/directionality_resolution_{central}_frac_{frac :.2f}_maxres08.png")
        plt.show()

# Directionality ~ Threshold

In [None]:
# for different resolutions and cluster fracs
resolutions = [0.2, 0.25, 0.3]
cluster_fracs = np.arange(0.1, 1, 0.05)

# set to False to reuse an existing results file instead of recalculating everything
again = True

results_path = '../../results/directionality_dependency_threshold.csv'

if again or not os.path.exists(results_path):
    # (re)calculate: the file is opened once and receives the header plus one row per combination
    with open(results_path, 'w', newline="") as resultsfile:
        csvwriter = csv.writer(resultsfile)
        csvwriter.writerow(["central", "contact", "contact_rp", "res", "frac", "directionality"])

        # loop over the cluster fracs
        for frac in cluster_fracs:

            # calculate for each pair
            for central_group in central_groups:
                for contact_rp, contact_group in zip(contact_rps, contact_groups):

                    # for each resolution
                    for resolution in resolutions:
                        directionality = calc_directionality(contact_rp, contact_group, central_group, frac, round(resolution, 2), volumes_total)

                        # np.arange accumulates float error, so the frac is rounded before it is stored
                        csvwriter.writerow([central_group, contact_group, contact_rp, round(resolution,2), round(frac, 2), directionality])
else:
    # results already exist and no recalculation was requested; the first row is the header
    df = pd.read_csv(results_path, header=0)

In [None]:
# load the directionality ~ threshold results (first row is the header) and show them
threshold_results_csv = '../../results/directionality_dependency_threshold.csv'
df = pd.read_csv(threshold_results_csv, header=0)
display(df)

### plot all results

In [None]:
# one figure per (resolution, central group), with one line per contact pair
for resolution in resolutions:
    for central in central_groups:
        fig, ax = plt.subplots(figsize=(8,4))
        fig.subplots_adjust(bottom=0.2)

        ax.grid(True)
        ax.set_title(f"Dependency of directionality on threshold {central}")

        for contact_rp, contact in zip(contact_rps, contact_groups):
            # only the fractions up to 0.4 are plotted
            partdf = df[(df.frac <= 0.4) &(df.res == resolution) & (df.central == central) & (df.contact == contact) & (df.contact_rp == contact_rp)]

            # line and markers for the same data; only the markers carry the legend label
            ax.plot(partdf.frac, partdf.directionality)
            ax.scatter(partdf.frac, partdf.directionality, label=contact + "-" + contact_rp)

        ax.set_ylabel("Directionality")
        ax.set_xlabel("Fraction used to determine threshold")
        ax.legend(loc='upper right')

        # save before showing: some backends release the figure on show(), which would save an empty image
        fig.savefig(f"../../results/dependencies/directionality_threshold_{central}_res_{resolution :.2f}.png")
        plt.show()

# Datapoints ~ Clusterfrac

In [None]:
# for different resolutions and cluster fracs
resolutions = [0.1, 0.2, 0.25, 0.3, 0.5]
cluster_fracs = np.arange(0.1, 1, 0.05)

# set to False to reuse an existing results file instead of recalculating everything
again = True

results_path = '../../results/datapoints_dependency_threshold.csv'

if again or not os.path.exists(results_path):
    # (re)calculate: the file is opened once and receives the header plus one row per combination
    with open(results_path, 'w', newline="") as resultsfile:
        csvwriter = csv.writer(resultsfile)
        csvwriter.writerow(["central", "contact", "contact_rp", "res", "frac", "n", "n_frac"])

        # for each resolution
        for resolution in resolutions:

            # calculate for each pair
            for central_group in central_groups:
                for contact_rp, contact_group in zip(contact_rps, contact_groups):

                    # loop over the cluster fracs; the loop variable itself is used for the calculation
                    for frac in cluster_fracs:
                        # points_in_cluster returns the fraction first and the raw amount second
                        n_frac, n = points_in_cluster(contact_rp, contact_group, central_group, frac, round(resolution, 2))

                        # np.arange accumulates float error, so the frac is rounded before it is stored
                        csvwriter.writerow([central_group, contact_group, contact_rp, round(resolution, 2), round(frac, 2), n, n_frac])
else:
    # results already exist and no recalculation was requested; the first row is the header
    df = pd.read_csv(results_path, header=0)

In [None]:
# load the datapoints ~ threshold results (first row is the header) and show them
datapoints_results_csv = '../../results/datapoints_dependency_threshold.csv'
df = pd.read_csv(datapoints_results_csv, header=0)
display(df)

### plot all results

In [None]:
# one figure per (resolution, central group), with one line per contact pair
for resolution in resolutions:
    for central in central_groups:
        fig, ax = plt.subplots(figsize=(8,4))
        ax.grid(True)
        ax.set_title(f"Dependency of fraction datapoints on threshold {central}\nResolution: {resolution}")

        for contact_rp, contact in zip(contact_rps, contact_groups):
            partdf = df[(df.res == resolution) & (df.central == central) & (df.contact == contact) & (df.contact_rp == contact_rp)]

            # line and markers for the same data; only the markers carry the legend label
            ax.plot(partdf.frac, partdf.n_frac)
            ax.scatter(partdf.frac, partdf.n_frac, label=contact + "-" + contact_rp)

        fig.subplots_adjust(bottom=0.2)
        ax.set_ylim(0,1.05)
        ax.set_ylabel("Fraction of datapoints")
        ax.set_xlabel("Fraction used to determine threshold")
        ax.legend(loc='upper right')

        # save before showing: some backends release the figure on show(), which would save an empty image
        fig.savefig(f"../../results/dependencies/datapoints_threshold_{central}_res{resolution :.2f}.png")
        plt.show()

# Conclusion
These figures need to be studied carefully in order to pick the right resolution and threshold. See the thesis and/or paper for the reasoning.