In [None]:
import sys

# Add the scripts folder (relative to the current working directory) to the
# module search path; presumably the project-local modules imported later in
# the notebook are found there.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
scripts_dir = '..//scripts//'
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [None]:
%matplotlib notebook

# allows for automatic reloading of imports, which makes it unnecessary to restart
# the kernel whenever a function is changed
%load_ext autoreload
%autoreload 2

import os
import time
import numpy as np
import csv
import pandas as pd

from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from calc_avg_fragment import calc_avg_fragment, calc_avg_rmse
from constants.paths import CENTRAL_GROUPS_CSV
from classes.Settings import AlignmentSettings
from classes.Radii import Radii

from align_kabsch import align_all_fragments

# Central groups to analyse; each name is also used to build data and result paths.
central_groups = ["H2O", "ArCI", "NO3", "RC6F5", "RNO2", "RCOMe", "REt", "RC6H5"]

# Contact groups and, position by position, the atom (or "centroid") that is
# handed to settings.set_atom_to_count() for that contact group.
contact_groups = ["CF", "RCN", "R2CO", "XH", "CCH3", "C2CH2", "RC6H5", "ArCH"]
to_count =       ["F",   "N",    "O",   "H", "H",     "H", "centroid", "H"]

# The two lists are iterated together with zip(), which silently truncates to
# the shorter one, so fail loudly if they ever get out of sync.
assert len(contact_groups) == len(to_count), "contact_groups and to_count must have the same length"

# Use k-means

In [None]:
# RMSE table: one row per contact group, one column per central group.
# Each cell holds the value returned by calc_avg_rmse for the average fragment
# produced by calc_avg_fragment (presumably the k-means based average, going
# by the section heading -- confirm in calc_avg_fragment).
df_avg_f = pd.DataFrame(index=contact_groups, columns=central_groups)

for central_group in central_groups:
    # The result folder depends only on the central group, so create it once
    # per outer iteration. makedirs also creates missing parent folders and
    # does not raise when the folder already exists.
    # NOTE(review): the hard-coded backslashes make these paths Windows-only.
    os.makedirs(f"..\\..\\results\\pairs\\{central_group}", exist_ok=True)

    for to_count_contact, contact_group in zip(to_count, contact_groups):

        print('\n', central_group, contact_group, to_count_contact)

        datafile = "..\\data\\" + central_group + "\\" + central_group + "_" + contact_group + "_vdw.5.cor"

        # Same path as the coordinate file, with the last extension swapped for .csv
        labelfile = datafile.rsplit('.', 1)[0] + '.csv'

        settings = AlignmentSettings("..\\..", datafile, labelfile)
        settings.set_atom_to_count(to_count_contact)
        settings.set_central_group_csv(CENTRAL_GROUPS_CSV)
        settings.prepare_alignment()

        align_all_fragments(settings)

        avg_frag = calc_avg_fragment(settings)

        # RMSE test; rows whose label contains "aH" are left out
        # (presumably hydrogen atoms -- confirm against the label files).
        rmse_avg_f = calc_avg_rmse(avg_frag[~avg_frag.label.str.contains("aH")], settings)

        # contact_group is a unique index label, so address the cell directly
        df_avg_f.loc[contact_group, central_group] = rmse_avg_f

# Ignore k-means

In [None]:
# RMSE table for the plain per-label average of the aligned central groups:
# one row per contact group, one column per central group.
df_avg_f_before_kmeans = pd.DataFrame(index=contact_groups, columns=central_groups)

for central_group in central_groups:
    # The result folder depends only on the central group, so create it once
    # per outer iteration. makedirs also creates missing parent folders and
    # does not raise when the folder already exists.
    # NOTE(review): the hard-coded backslashes make these paths Windows-only.
    os.makedirs(f"..\\..\\results\\pairs\\{central_group}", exist_ok=True)

    for to_count_contact, contact_group in zip(to_count, contact_groups):

        print('\n', central_group, contact_group, to_count_contact)

        datafile = "..\\data\\" + central_group + "\\" + central_group + "_" + contact_group + "_vdw.5.cor"

        # Same path as the coordinate file, with the last extension swapped for .csv
        labelfile = datafile.rsplit('.', 1)[0] + '.csv'

        settings = AlignmentSettings("..\\..", datafile, labelfile)
        settings.set_atom_to_count(to_count_contact)
        settings.set_central_group_csv(CENTRAL_GROUPS_CSV)
        settings.prepare_alignment()

        # This cell does not align anything itself; it reads the aligned CSV
        # that settings points to, so that file must already exist.
        aligned_df = pd.read_csv(settings.get_aligned_csv_filename())

        # Keep only rows with a real label ("-" rows are dropped; presumably
        # those are the contact-fragment atoms -- confirm).
        central_group_df = aligned_df[aligned_df.label != "-"]

        # Sorting first fixes both the group order (groupby uses sort=False)
        # and which row provides the 'first' symbol of each label.
        central_group_df = central_group_df.sort_values(['fragment_id', 'label'])

        # Average fragment: mean x/y/z per label
        avg_frag = central_group_df.groupby('label', sort=False).agg({'symbol': 'first',
                                                                         'x': 'mean',
                                                                         'y': 'mean',
                                                                         'z': 'mean'}).reset_index()

        # Rows whose label contains "aH" are left out of the RMSE
        # (presumably hydrogen atoms -- confirm against the label files).
        rmse_avg_f = calc_avg_rmse(avg_frag[~avg_frag.label.str.contains("aH")], settings, central_group_df)

        # contact_group is a unique index label, so address the cell directly
        df_avg_f_before_kmeans.loc[contact_group, central_group] = rmse_avg_f

In [None]:
# Styler callback that sets the text colour of a DataFrame cell
# according to the size of its value.
def color_positive_green(val, warn_threshold=0.1, bad_threshold=0.45):
    """
    Return the CSS colour rule for one table cell.

    Despite its name, this function never returns green. The colour is
    chosen by comparing the value against two thresholds:

    * ``val > bad_threshold``   -> ``'color: red'``
    * ``val > warn_threshold``  -> ``'color: orange'``
    * anything else (including NaN, for which both comparisons are False)
      -> ``'color: black'``

    Parameters
    ----------
    val : scalar
        Cell value; must support ``>`` comparison with a float.
    warn_threshold : float, optional
        Values strictly above this are shown in orange (default 0.1).
    bad_threshold : float, optional
        Values strictly above this are shown in red (default 0.45).

    Returns
    -------
    str
        A CSS declaration of the form ``'color: <name>'``.
    """
    if val > bad_threshold:
        color = 'red'
    elif val > warn_threshold:
        color = 'orange'
    else:
        color = 'black'

    return 'color: %s' % color

# Format floats with three decimals wherever pandas applies its display float formatter.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
# Colour-code the RMSE table of the plain per-label averages and show it.
df_avg_f_before_kmeans_styled = df_avg_f_before_kmeans.style.applymap(
    color_positive_green
)
display(df_avg_f_before_kmeans_styled)

# LaTeX source of the same table (unstyled).
latex_before_kmeans = df_avg_f_before_kmeans.to_latex()
print(latex_before_kmeans)

In [None]:
# Colour-code the RMSE table stored in df_avg_f and show it.
df_avg_f_styled = df_avg_f.style.applymap(
    color_positive_green
)
display(df_avg_f_styled)

# LaTeX source of the same table (unstyled).
latex_avg_f = df_avg_f.to_latex()
print(latex_avg_f)

# Also compare to Kabsch

In [None]:
# Mean of the per-structure `rmse` column read from each pair's structure CSV
# (presumably the Kabsch alignment RMSE, going by the section heading):
# one row per contact group, one column per central group.
df_rmse_kabsch = pd.DataFrame(index=contact_groups, columns=central_groups)

for central_group in central_groups:
    # The result folder depends only on the central group, so create it once
    # per outer iteration. makedirs also creates missing parent folders and
    # does not raise when the folder already exists.
    # NOTE(review): the hard-coded backslashes make these paths Windows-only.
    os.makedirs(f"..\\..\\results\\pairs\\{central_group}", exist_ok=True)

    for to_count_contact, contact_group in zip(to_count, contact_groups):

        print('\n', central_group, contact_group, to_count_contact)

        datafile = "..\\data\\" + central_group + "\\" + central_group + "_" + contact_group + "_vdw.5.cor"

        # Same path as the coordinate file, with the last extension swapped for .csv
        labelfile = datafile.rsplit('.', 1)[0] + '.csv'

        settings = AlignmentSettings("..\\..", datafile, labelfile)
        settings.set_atom_to_count(to_count_contact)
        settings.set_central_group_csv(CENTRAL_GROUPS_CSV)
        settings.prepare_alignment()

        # This cell does not align anything itself; it reads the structure CSV
        # that settings points to, so that file must already exist.
        structures_df = pd.read_csv(settings.get_structure_csv_filename())
        rmse_kabsch = structures_df.rmse.mean()

        # contact_group is a unique index label, so address the cell directly
        df_rmse_kabsch.loc[contact_group, central_group] = rmse_kabsch

In [None]:
# Colour-code the RMSE table stored in df_rmse_kabsch and show it.
df_rmse_kabsch_styled = df_rmse_kabsch.style.applymap(
    color_positive_green
)
display(df_rmse_kabsch_styled)

# LaTeX source of the same table (unstyled).
latex_kabsch = df_rmse_kabsch.to_latex()
print(latex_kabsch)

# Conclusions
Kabsch is OK.

K-means is a good backup plan.