In [None]:
%matplotlib notebook

# allows for automatic reloading of imports and makes it unnecessary to restart the kernel
# whenever a function is changed
%load_ext autoreload
%autoreload 2


import os
import time
import numpy as np
import csv
import pandas as pd
import copy
from sklearn.cluster import KMeans

from tqdm import tqdm
import time
import math

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from classes.Settings import Settings
from helpers.geometry_helpers import make_coordinate_df
from calc_density_4 import count_points_per_square

from helpers.plot_functions import plot_density, plot_fragment_colored, plot_vdw_spheres
from helpers.density_helpers import prepare_df, find_available_volume
from helpers.geometry_helpers import (make_coordinate_df,
                                      get_vdw_distance_contact)


# Candidate central groups. NOTE: this list is replaced by the single-group
# assignment a few lines further down.
central_groups = ["H2O", "ArCI", "NO3", "RC6F5", "RNO2", "RCOMe", "RC6H5"] #
# Contact groups and, position by position, what is counted for each of them
# (the two lists are zipped together in the driver loop).
contact_groups = ["CF", "RCN", "R2CO", "XH", "CCH3", "C2CH2", "RC6H5", "ArCH"] #  
to_count =       ["F",   "N",    "O",   "H",   "H",     "H", "centroid", "H"]

# Restrict the run to one central group; overrides the full list defined above.
central_groups = ["RC6H5"]

# NOTE(review): not referenced by any cell visible here (the driver loop uses a
# fixed resolution) -- confirm whether it is still needed.
resolutions = np.arange(0.1, 1.1, 0.1)

In [None]:
def make_density_plot(avg_fragment, density_df, settings):
    """Draw the density of one central/contact pair in 3D and save it as a PNG.

    The fragment is drawn with ``plot_fragment_colored`` and the density with
    ``plot_density``; a colorbar is added for the mappable that
    ``plot_density`` returns. The image is written to
    ``results/directionality_tests/<central>/<central>_<contact>_1_resolution_<resolution>.png``
    (the directory must already exist). The figure is left open.

    Parameters
    ----------
    avg_fragment
        Passed unchanged to ``plot_fragment_colored``.
    density_df
        Passed unchanged to ``plot_density`` as ``df``.
    settings
        Object providing ``central_group_name``, ``contact_group_name`` and
        ``resolution``; it is also forwarded to ``plot_density``.
    """
    fig = plt.figure()
    ax: Axes3D = fig.add_subplot(111, projection='3d')

    ax = plot_fragment_colored(ax, avg_fragment)
    p, ax = plot_density(ax=ax, df=density_df, settings=settings)

    # The title is set once, after the helpers ran. The earlier
    # "<central>-<contact> resolution: ..." title was always overwritten by
    # this one, so it has been removed.
    ax.set_title("4D density plot\n Resolution: " + str(settings.resolution))

    fig.colorbar(p)

    # Save through the figure object built here instead of pyplot's implicit
    # "current figure", so the right figure is written even if another figure
    # became current in the meantime.
    fig.savefig('results/directionality_tests/' + settings.central_group_name + "/"
                + settings.central_group_name + "_" + settings.contact_group_name
                + "_1_resolution_" + str(settings.resolution) + '.png')

In [None]:
def calc_clusters(density_df, amount_of_clusters):
    """Run k-means on the centres of all bins flagged ``to_cluster``.

    Parameters
    ----------
    density_df : pandas.DataFrame
        Must contain the boolean column ``to_cluster`` and the coordinate
        columns ``x_center``, ``y_center`` and ``z_center``. The frame is
        modified in place.
    amount_of_clusters : int
        Number of clusters to fit (``n_clusters`` of ``KMeans``).

    Returns
    -------
    tuple
        ``(density_df, centers)``: the same frame object with the k-means
        label of every flagged bin written to the ``cluster`` column, and the
        fitted ``cluster_centers_``. When ``amount_of_clusters`` is not
        positive or no bin is flagged, nothing is fitted: the ``cluster``
        column is only created (all NaN) if it was missing and ``centers`` is
        an empty array of shape ``(0, 3)``.
    """
    selected = density_df['to_cluster']
    to_cluster_df = density_df[selected]

    # KMeans raises for n_clusters=0 and for an empty sample set. Both simply
    # mean "nothing to cluster" here, so report that instead of crashing.
    if amount_of_clusters <= 0 or to_cluster_df.empty:
        if "cluster" not in density_df.columns:
            density_df["cluster"] = np.nan
        return density_df, np.empty((0, 3))

    # one row per bin, columns are the x/y/z coordinates of the bin centre
    X = np.transpose(np.array([to_cluster_df.x_center, to_cluster_df.y_center, to_cluster_df.z_center]))

    # fixed random_state keeps the labelling reproducible between runs
    kmeans = KMeans(n_clusters=amount_of_clusters, random_state=1)
    kmeans.fit(X)

    density_df.loc[selected, "cluster"] = kmeans.labels_

    return density_df, kmeans.cluster_centers_

In [None]:
def find_bins_to_cluster(settings, fraction):
    """Load the density frame and flag the bins that take part in k-means.

    Parameters
    ----------
    settings
        Object providing ``get_density_df_filename()``,
        ``get_density_df_key()``, ``resolution`` and ``to_count_contact``
        (the name of the count column).
    fraction : float
        A bin is flagged when its count is strictly larger than
        ``fraction`` times the largest count in the frame.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with these columns added: ``x_center``,
        ``y_center``, ``z_center`` (bin start plus half a resolution step),
        ``<count>_normalized`` (count divided by the total count),
        ``cluster_color`` and ``cluster`` (both NaN) and the boolean
        ``to_cluster``.
    """
    hdf_file = settings.get_density_df_filename()
    hdf_key = settings.get_density_df_key()
    density_df = pd.read_hdf(hdf_file, hdf_key)
    print(hdf_file, hdf_key)

    # centre of a bin = its start coordinate plus half the bin width
    half_bin = 0.5 * settings.resolution
    for axis in ("x", "y", "z"):
        density_df[axis + "_center"] = density_df[axis + "start"] + half_bin

    count_col = settings.to_count_contact

    # express every bin as a share of all counted contacts
    counts = density_df[count_col]
    density_df.loc[:, count_col + "_normalized"] = counts / counts.sum()

    # start from a clean colour column when the cell is run again
    density_df["cluster_color"] = np.nan

    # only bins filled to more than `fraction` of the fullest bin are clustered
    max_bin = density_df[count_col].max()
    threshold = max_bin * fraction
    print("Threshold k-means:", threshold, "max_bin:", max_bin, "with fraction:", fraction)

    density_df["cluster"] = np.nan
    density_df["to_cluster"] = density_df[count_col] > threshold

    return density_df

In [None]:
def recluster(df, settings, recluster_frac):
    """Regrow every k-means cluster from its fullest bin (method 0).

    For each id in ``df.cluster`` the bin(s) holding that cluster's maximum
    count seed the column ``new_cluster``. The cluster then grows: any bin
    that is still unassigned, whose centre lies within one
    ``settings.resolution`` of a member on all three axes and whose count
    exceeds ``recluster_frac`` times the cluster maximum, is added. Every
    pass re-scans all current members and growth stops once a pass adds
    nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cluster``, ``x_center``, ``y_center``, ``z_center`` and the
        count column named by ``settings.to_count_contact``. Modified in place.
    settings
        Object providing ``to_count_contact`` and ``resolution``.
    recluster_frac : float
        Fraction of a cluster's fullest bin used as its growth threshold.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the column ``new_cluster`` (re)written.
    """
    count_col = settings.to_count_contact
    cluster_ids = df.cluster.dropna().unique()

    df["new_cluster"] = np.nan

    for cluster_id in cluster_ids:
        own_bins = df.cluster == cluster_id
        max_bin = df.loc[own_bins, count_col].max()
        fraction = max_bin * recluster_frac
        print("Cluster id: " + str(cluster_id), " max bin: " + str(max_bin) + " fraction for this bin: ", fraction)

        # the fullest bin(s) of the k-means cluster are the seed
        df.loc[own_bins & (df[count_col] == max_bin), 'new_cluster'] = cluster_id

        # look one bin further in every direction
        reach = settings.resolution

        # keep expanding for as long as a pass claims at least one new bin
        while True:
            members = df[df.new_cluster == cluster_id]
            old_length = len(members)

            for _, member in members.iterrows():
                claimable = (df[count_col] > fraction) & df.new_cluster.isna()
                for axis in ("x_center", "y_center", "z_center"):
                    claimable &= (df[axis] >= member[axis] - reach) & (df[axis] <= member[axis] + reach)
                df.loc[claimable, 'new_cluster'] = cluster_id

            new_length = len(df[df.new_cluster == cluster_id])
            print('Old Length, New Length', old_length, new_length)

            if new_length <= old_length:
                break

    return df


def _grow_clusters_from_peaks(df, settings, column, threshold_for):
    """Flood-fill ``column`` with cluster ids, starting at each cluster's fullest bin.

    Shared implementation of ``recluster1`` and ``recluster2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cluster``, ``x_center``, ``y_center``, ``z_center`` and the
        count column named by ``settings.to_count_contact``. Modified in place.
    settings
        Object providing ``to_count_contact`` and ``resolution``.
    column : str
        Name of the column that receives the new cluster ids (reset to NaN
        first).
    threshold_for : callable
        Maps the maximum count of a k-means cluster to the count a bin must
        exceed to be added to that cluster.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    amount = settings.to_count_contact
    # a bin is a neighbour when its centre is at most one bin away on every axis
    radius = settings.resolution

    cluster_ids = df.cluster.dropna().unique()
    df[column] = np.nan

    for cluster_id in cluster_ids:
        own_bins = df.cluster == cluster_id
        max_bin = df.loc[own_bins, amount].max()
        threshold = threshold_for(max_bin)
        print("Cluster id: " + str(cluster_id), " max bin: " + str(max_bin) + " fraction for this bin: ", threshold)

        # does not change while this cluster grows, so compute it once
        dense_enough = df[amount] > threshold

        # the fullest bin(s) of the k-means cluster are the seed
        frontier = df.index[own_bins & (df[amount] == max_bin)]
        df.loc[frontier, column] = cluster_id

        # Only bins claimed in the previous pass can have unvisited
        # neighbours, so each pass looks around that frontier only. The loop
        # ends as soon as a pass claims nothing.
        while len(frontier) > 0:
            old_length = int((df[column] == cluster_id).sum())

            claimed = []
            for _, row in df.loc[frontier, :].iterrows():
                neighbours = df.index[df[column].isna() & dense_enough &
                                      (df.x_center >= row.x_center - radius) & (df.x_center <= row.x_center + radius) &
                                      (df.y_center >= row.y_center - radius) & (df.y_center <= row.y_center + radius) &
                                      (df.z_center >= row.z_center - radius) & (df.z_center <= row.z_center + radius)]

                df.loc[neighbours, column] = cluster_id
                claimed.extend(neighbours)

            frontier = claimed

            new_length = int((df[column] == cluster_id).sum())
            print('Old Length, New Length', old_length, new_length)

    return df


def recluster1(df, settings, recluster_frac):
    """Regrow every k-means cluster from its fullest bin (method 1).

    Bins are added to the column ``new_cluster1`` when they are unassigned,
    adjacent to a bin claimed in the previous pass and hold more than
    ``recluster_frac`` times the maximum count of *that cluster*.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cluster``, ``x_center``, ``y_center``, ``z_center`` and the
        count column named by ``settings.to_count_contact``. Modified in place.
    settings
        Object providing ``to_count_contact`` and ``resolution``.
    recluster_frac : float
        Fraction of a cluster's own fullest bin used as its growth threshold.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the column ``new_cluster1`` (re)written.
    """
    return _grow_clusters_from_peaks(df, settings, "new_cluster1",
                                     lambda max_bin: max_bin * recluster_frac)


def recluster2(df, settings, recluster_frac):
    """Regrow every k-means cluster from its fullest bin (method 2).

    Same growth as ``recluster1`` but written to ``new_cluster2`` and with a
    single threshold for all clusters: ``recluster_frac`` times the largest
    count in the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cluster``, ``x_center``, ``y_center``, ``z_center`` and the
        count column named by ``settings.to_count_contact``. Modified in place.
    settings
        Object providing ``to_count_contact`` and ``resolution``.
    recluster_frac : float
        Fraction of the globally fullest bin used as the growth threshold.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the column ``new_cluster2`` (re)written.
    """
    global_threshold = df[settings.to_count_contact].max() * recluster_frac
    return _grow_clusters_from_peaks(df, settings, "new_cluster2",
                                     lambda _max_bin: global_threshold)

In [None]:
def calc_directionality(settings, k, kmeans_frac, recluster_frac):
    """Cluster the density bins of one central/contact pair and regrow the clusters.

    Steps: flag the bins fuller than ``kmeans_frac`` times the fullest bin
    (``find_bins_to_cluster``), run k-means with ``k`` clusters on them
    (``calc_clusters``), obtain the available volume around the average
    fragment (``find_available_volume``) and apply the three reclustering
    variants.

    Parameters
    ----------
    settings
        Object providing ``get_kabsch_aligned_csv_filename()`` and
        ``get_avg_frag_filename()``; it is also forwarded to the helpers.
    k : int
        Number of k-means clusters.
    kmeans_frac : float
        Threshold fraction used to select the bins for k-means.
    recluster_frac : float
        Threshold fraction used by the reclustering variants.

    Returns
    -------
    tuple
        ``(density_df, centroids, available_volume)``: the density frame with
        the columns ``cluster``, ``new_cluster``, ``new_cluster1`` and
        ``new_cluster2``; the k-means centres; and the value returned by
        ``find_available_volume``.
    """
    # work only with bins that are fuller than kmeans_frac * the fullest bin
    density_df = find_bins_to_cluster(settings=settings, fraction=kmeans_frac)

    # calc for each selected bin in what cluster it belongs
    density_df, centroids = calc_clusters(density_df, k)

    # The selection flag is only needed by calc_clusters. DataFrame.drop
    # returns a new frame, so the result has to be rebound; the previous bare
    # call left the column in place.
    density_df = density_df.drop(columns=["to_cluster"])

    # find the volume of the central group
    tolerance = 0.5

    df = pd.read_csv(settings.get_kabsch_aligned_csv_filename())
    avg_fragment = pd.read_csv(settings.get_avg_frag_filename())

    coordinate_df = make_coordinate_df(df, settings, avg_fragment)
    contact_radius = coordinate_df['longest_vdw'].mean()

    available_volume = find_available_volume(avg_fragment=avg_fragment, extra=(tolerance + contact_radius))

    print("\nRecluster method 0")
    density_df = recluster(density_df, settings, recluster_frac)

    print("\nRecluster method 1")
    density_df = recluster1(density_df, settings, recluster_frac)

    print("\nRecluster method 2")
    density_df = recluster2(density_df, settings, recluster_frac)

    return density_df, centroids, available_volume

In [None]:
def show_directionality(avg_fragment, df, column, settings, available_volume, kfrac, recluster_frac):
    """Plot the bins of every cluster in ``column`` and label each cluster with its directionality.

    Only bins with a normalized count above 0 are considered. For every
    cluster id found in ``column`` the directionality shown in the legend is
    ``(sum of normalized counts in the cluster / cluster volume) * available_volume``
    with ``cluster volume = number of bins * settings.resolution ** 3``.

    Parameters
    ----------
    avg_fragment
        Passed unchanged to ``plot_fragment_colored``.
    df : pandas.DataFrame
        Needs ``x_center``, ``y_center``, ``z_center``, ``column`` and
        ``<settings.to_count_contact>_normalized``. It is not modified.
    column : str
        Name of the column holding the cluster ids (NaN = no cluster).
    settings
        Object providing ``to_count_contact``, ``resolution``,
        ``central_group_name`` and ``contact_group_name``.
    available_volume : float
        Scale factor in the directionality formula above.
    kfrac, recluster_frac
        Not used by the active code; only the commented-out ``savefig`` call
        at the end refers to them.
    """
    colors = ["red", "green", "blue", "purple", "yellow", "pink", "orange", "grey"]
    normalized = settings.to_count_contact + "_normalized"

    # Filter first and work on a copy, so the caller's frame is not modified.
    df = df[df[normalized] > 0].copy()

    # Cluster ids beyond the palette wrap around instead of raising IndexError.
    df["cluster_color"] = [colors[int(i) % len(colors)] if pd.notna(i) else "grey"
                           for i in df[column]]

    fig = plt.figure(figsize=(8,8))
    ax: Axes3D = fig.add_subplot(111, projection='3d')

    # Select clustered bins by membership, not by colour: the palette itself
    # contains "grey", so a colour test would hide every bin of that cluster.
    rest = df[df[column].notna()]

    clusters = df[column].dropna().unique()

    for cluster_id in clusters:
        points = df[df[column] == cluster_id]
        firstpoint = points.iloc[0]
        volume = len(points) * settings.resolution**3
        fraction = points[normalized].sum()
        directionality = fraction/volume * available_volume

        # one labelled point per cluster, so the legend has one entry each
        ax.scatter(firstpoint.x_center, firstpoint.y_center, firstpoint.z_center,
                   label="Cluster: " + str(cluster_id) + " Directionality:" + str(round(directionality, 2)),
                   color=firstpoint.cluster_color)

    # NOTE(review): looks like a placeholder caption -- confirm whether the
    # literal text should stay on the figure.
    txt = 'test'

    fig.text(.5,.05,txt)

    ax.scatter(list(rest.x_center), list(rest.y_center), list(rest.z_center),
               color=list(rest.cluster_color))

    # plot the average fragment
    ax = plot_fragment_colored(ax, avg_fragment)

    ax.set_title("Clusters " + settings.central_group_name + "-" + settings.contact_group_name + ", resolution: " + str(settings.resolution) + "\n" + column)

    ax.set_xlim(-6, 6)
    ax.set_ylim(-6, 6)

    ax.set_xlabel("X coordinate")
    ax.set_ylabel("Y coordinate")

    ax.legend(fontsize='x-small')

    # look (almost) straight down the z axis
    elev = 89
    azim = -89
    ax.view_init(elev=elev, azim=azim)

    plt.show()

#     plt.savefig('results/directionality_tests/' + settings.central_group_name + "/" + settings.central_group_name +\
#                 "_" + settings.contact_group_name + "_k_" + str(len(clusters)) + "_resolution_" + str(settings.resolution) +\
#                 "_"  + column + "_kfrac_" + str(kfrac) + "_refrac_" + str(recluster_frac) + ".png")
#     plt.close()

In [None]:
# Number of k-means clusters to fit per pair: cluster_amounts[central][contact].
# The driver loop passes this value as `k` to calc_directionality.
# Two RC6H5 entries are 0; the driver loop never clusters those pairs.
cluster_amounts = {"ArCI": {"CF": 8, "RCN": 1, "R2CO": 1, "XH": 2, "CCH3": 1, "C2CH2": 1, "RC6H5": 1, "ArCH": 1},
                   "H2O": {"CF": 3, "RCN": 2, "R2CO": 2, "XH": 3, "CCH3": 1, "C2CH2": 1, "RC6H5": 3, "ArCH": 1},
                   "NO3": {"CF": 1, "RCN": 1, "R2CO": 1, "XH": 3, "CCH3": 1, "C2CH2": 1, "RC6H5": 1, "ArCH": 1},
                   "RC6F5": {"CF": 5, "RCN": 1, "R2CO": 7, "XH": 2, "CCH3": 1, "C2CH2": 1, "RC6H5": 3, "ArCH": 1},
                   "RC6H5": {"CF": 4, "RCN": 4, "R2CO": 1, "XH": 1, "CCH3": 1, "C2CH2": 1, "RC6H5": 0, "ArCH": 0},
                   "RNO2": {"CF": 1, "RCN": 1, "R2CO": 1, "XH": 3, "CCH3": 1, "C2CH2": 1, "RC6H5": 1, "ArCH": 1},
                   "RCOMe": {"CF": 1, "RCN": 1, "R2CO": 1, "XH": 1, "CCH3": 1, "C2CH2": 1, "RC6H5": 1, "ArCH": 1}}

In [None]:
# Driver: for every central/contact pair, save the density plot, cluster the
# bins and show the k-means clusters next to the three reclustering variants.

# makedirs(..., exist_ok=True) also creates a missing 'results' parent and is
# a no-op when the directory already exists.
os.makedirs('results/directionality_tests/', exist_ok=True)

kfrac = 0.25
recluster_frac = 0.25

for central in central_groups:
    os.makedirs('results/directionality_tests/' + central + "/", exist_ok=True)

    for contact, to_count_contact in zip(contact_groups, to_count):
        print(central, contact)

        # Pairs with 0 requested clusters are skipped. With the current table
        # these are exactly the pairs the former `break` on RC6H5/RC6H5
        # left out (RC6H5/RC6H5 and the RC6H5/ArCH pair that follows it).
        amount_of_clusters = cluster_amounts[central][contact]
        if amount_of_clusters == 0:
            continue

        # NOTE(review): backslash separators only work on Windows. Kept as is
        # because Settings may derive names from this exact string -- confirm
        # before switching to os.path.join.
        filename = ".\\results\\" + central + "\\" + central + "_" + contact + "_vdw.5" + "\\" + central + "_" + contact + "_vdw.5_aligned.csv"
        settings = Settings(filename)
        settings.set_atom_to_count(to_count_contact)
        settings.set_resolution(0.2)

        avg_fragment = pd.read_csv(settings.get_avg_frag_filename())
        density_df = pd.read_hdf(settings.get_density_df_filename(), settings.get_density_df_key())

        make_density_plot(avg_fragment, density_df, settings)

        density_df, centroids, V_available = calc_directionality(settings=settings,
                                      k=amount_of_clusters,
                                      kmeans_frac=kfrac,
                                      recluster_frac=recluster_frac)

        print("Available volume:", V_available)

        # k-means clusters first, then the three reclustering variants
        for cluster_column in ('cluster', 'new_cluster', 'new_cluster1', 'new_cluster2'):
            show_directionality(avg_fragment, density_df, cluster_column, settings, V_available, kfrac, recluster_frac)