In [None]:
%matplotlib notebook

# Allows automatic reloading of imported modules, which makes it unnecessary to restart the kernel
# whenever a function is changed
%load_ext autoreload
%autoreload 2


import os
import time
import numpy as np
import csv
import pandas as pd
import copy
from sklearn.cluster import KMeans

from tqdm import tqdm
import time
import math

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from classes.Settings import Settings
from helpers.geometry_helpers import make_coordinate_df
from calc_density_4 import count_points_per_square

from helpers.plot_functions import plot_density, plot_fragment_colored, plot_vdw_spheres
from helpers.density_helpers import prepare_df, find_available_volume
from helpers.geometry_helpers import (make_coordinate_df,
                                      get_vdw_distance_contact)


# Central groups iterated over by the driver loop further down in this notebook.
central_groups = [
    "RCOMe",
    "RNO2",
    "ArCI",
    "NO3",
    "RC6F5",
    "H2O",
    "RC6H5",
]

# Each pair is (contact group, part of the contact group that is counted).
# The pairs are unzipped into two parallel lists, so position i of
# `contact_groups` always belongs to position i of `to_count`.
contact_groups, to_count = map(list, zip(
    ("CF", "F"),
    ("RCN", "N"),
    ("R2CO", "O"),
    ("XH", "H"),
    ("XH", "O"),
    ("CCH3", "H"),
    ("C2CH2", "H"),
    ("RC6H5", "centroid"),
    ("ArCH", "H"),
))



In [None]:
def make_density_plot(avg_fragment, density_df, settings):
    """Draw the density of one central/contact pair in 3D and save it as a PNG.

    Parameters
    ----------
    avg_fragment
        Passed unchanged to ``plot_fragment_colored``.
    density_df
        Passed unchanged to ``plot_density``.
    settings
        Object providing ``central_group_name``, ``contact_group_name``,
        ``to_count_contact`` and ``resolution``.

    The figure is written to
    ``results/directionality_tests/<central>/<central>_<contact>_1_resolution_<resolution>.png``.
    """
    fig = plt.figure()
    ax: Axes3D = fig.add_subplot(111, projection='3d')

    # Read the counted part from `settings` instead of a notebook global, and
    # collapse whitespace instead of `str.replace("", " ")`, which inserts a
    # space between every single character.
    # NOTE(review): assumes settings.to_count_contact holds the value that the
    # former global `to_count_contact` carried — confirm against Settings.
    title_string = (f"{settings.central_group_name}-{settings.contact_group_name} "
                    f"resolution: {settings.resolution: .2f} "
                    f"Interacting part of contact group: {settings.to_count_contact}")
    ax.set_title(" ".join(title_string.split()))

    ax = plot_fragment_colored(ax, avg_fragment)

    p, ax = plot_density(ax=ax, df=density_df, settings=settings)

    # This is the title that ends up on the axes returned by plot_density.
    ax.set_title("4D density plot\n Resolution: " + str(settings.resolution))

    fig.colorbar(p)

    # Save through the figure object so the right figure is written even if
    # another figure has become the "current" one in the meantime.
    fig.savefig('results/directionality_tests/' + settings.central_group_name + "/"
                + settings.central_group_name + "_" + settings.contact_group_name
                + "_1_resolution_" + str(settings.resolution) + '.png')

In [None]:
def calc_clusters(density_df, amount_of_clusters):
    """Run K-means on the centres of the bins flagged in ``to_cluster``.

    The label of every flagged bin is written to the ``cluster`` column of
    ``density_df`` (the frame is modified in place).

    Returns
    -------
    tuple
        ``(density_df, cluster_centers)`` where ``cluster_centers`` is the
        ``cluster_centers_`` attribute of the fitted KMeans model.
    """
    selected = density_df['to_cluster']
    selected_bins = density_df[selected]

    # one row per bin, columns are the x, y and z centre
    centres = np.array([selected_bins.x_center,
                        selected_bins.y_center,
                        selected_bins.z_center]).T

    # fixed random_state keeps the clustering reproducible between runs
    model = KMeans(n_clusters=amount_of_clusters, random_state=1)
    model.fit(centres)

    density_df.loc[selected, "cluster"] = model.labels_

    return density_df, model.cluster_centers_

In [None]:
def find_bins_to_cluster(settings, density_df, fraction):
    """Flag the bins that are full enough to take part in the clustering.

    Adds or resets these columns on ``density_df`` (in place) and returns it:

    * ``x_center``, ``y_center``, ``z_center``: bin start plus half a bin width
    * ``<to_count_contact>_normalized``: bin count divided by the total count
    * ``cluster_color`` and ``cluster``: reset to NaN so the cell can be re-run
    * ``to_cluster``: True where the bin count exceeds ``fraction`` times the
      count of the fullest bin
    """
    counted = settings.to_count_contact
    half_width = 0.5 * settings.resolution

    # centre of every bin along each axis
    for axis in ("x", "y", "z"):
        density_df[axis + "_center"] = density_df[axis + "start"] + half_width

    counts = density_df[counted]

    # normalise so that all bins together sum to 1
    density_df.loc[:, counted + "_normalized"] = counts / counts.sum()

    # reset the cluster colour for when this cell is run again
    density_df["cluster_color"] = np.nan

    # bins above this threshold belong to a cluster, the others do not
    max_bin = counts.max()
    threshold = max_bin * fraction
    print("Threshold k-means:", threshold, "max_bin:", max_bin, "with fraction:", fraction)

    density_df["cluster"] = np.nan
    density_df["to_cluster"] = counts > threshold

    return density_df

In [None]:
def my_reduce(lst):
    """Concatenate a list of index objects into one duplicate-free index.

    * empty list: returns ``[]``
    * one element: returns that element untouched
    * otherwise: the elements are appended in order and duplicates are
      dropped, keeping the first occurrence
    """
    count = len(lst)
    if count == 0:
        return []
    if count == 1:
        return lst[0]

    merged, *others = lst
    for extra in others:
        merged = merged.append(extra)

    return merged.drop_duplicates(keep='first')

    
def _distance_to_row(df, row):
    """Euclidean distance from every bin centre in ``df`` to the centre of ``row``."""
    return np.sqrt((df.x_center - row.x_center)**2 +
                   (df.y_center - row.y_center)**2 +
                   (df.z_center - row.z_center)**2)


def _unassigned_bins_around(df, column, seed_indices, radius):
    """Collect the bins with NaN in ``column`` that lie within ``radius`` of any seed bin."""
    boundary_indices = [
        df.index[(df[column].isna()) & (_distance_to_row(df, row) <= radius)]
        for _, row in df.loc[seed_indices, :].iterrows()
    ]
    return my_reduce(boundary_indices)


def fill_holes(df, radius, column):
    """Assign unassigned bins that are surrounded by a cluster to that cluster.

    For every cluster id found in ``df.cluster``, the bins with NaN in
    ``column`` that lie within ``radius`` of a bin of that cluster are
    collected. Such a bin joins the cluster when at least 3 bins of the
    cluster lie within ``radius`` of it. The search is then repeated around
    the bins that were just added, until no candidate bins remain.

    Parameters
    ----------
    df
        Frame with ``x_center``, ``y_center``, ``z_center``, ``cluster`` and
        ``column``; it is modified in place.
    radius
        Maximum centre-to-centre distance for two bins to count as neighbours.
    column
        Name of the column holding the cluster assignment to fill.

    Returns
    -------
    The same frame ``df``.
    """
    # The original bound an unused local from the notebook-global `settings`,
    # which raised NameError whenever that global did not exist; it is removed.
    unique_clusters = df.cluster.dropna().unique()

    for cluster_id in unique_clusters:
        # all bins that are currently in the cluster
        cluster_indices = df.index[(df[column] == cluster_id)]

        # unassigned bins around the cluster bins
        boundary_index = _unassigned_bins_around(df, column, cluster_indices, radius)

        while len(boundary_index) > 0:
            new_cluster_bins = []

            # `df` is updated inside this loop on purpose: a bin filled earlier
            # in the pass counts as a neighbour for the bins checked after it.
            for i, emptyrow in df.loc[boundary_index, :].iterrows():
                index = df.index[((df[column] == cluster_id) &
                                  (_distance_to_row(df, emptyrow) <= radius))]

                # if 3 or more neighbours are in the cluster, the bin is a hole
                if len(index) >= 3:
                    df.loc[i, column] = cluster_id
                    new_cluster_bins.append(i)

            print('Added another:', len(new_cluster_bins))

            # look for new candidates around the bins that were just filled
            boundary_index = _unassigned_bins_around(df, column, new_cluster_bins, radius)
            print('Dit is de lijst van alex:', len(boundary_index))

    return df

In [None]:
def expand_clusters(df, new_indices, fraction, cluster_id, column_name, settings):
    """Grow one cluster outwards from its seed bins, one bin width at a time.

    Starting from the bins in ``new_indices``, every bin that

    * has NaN in ``column_name``,
    * lies within ``settings.resolution`` of a frontier bin along each of the
      x, y and z axes (a cube, not a sphere), and
    * has a ``settings.to_count_contact`` value strictly above ``fraction``

    is assigned ``cluster_id``. The newly assigned bins form the next
    frontier; the loop stops once a pass adds no bins.

    Parameters
    ----------
    df
        Frame with ``x_center``, ``y_center``, ``z_center``, the count column
        and ``column_name``; it is modified in place.
    new_indices
        List of index objects holding the seed bins.
    fraction
        Absolute count threshold (despite the name, it is compared directly
        with the bin counts).
    cluster_id
        Value written to ``column_name`` for every added bin.
    column_name
        Column holding the cluster assignment that is being grown.
    settings
        Object providing ``resolution`` and ``to_count_contact``.

    Returns
    -------
    The same frame ``df``.
    """
    # always look only 1 bin further
    radius = settings.resolution

    # The counts never change while expanding, so this mask is computed once.
    dense_enough = df[settings.to_count_contact] > fraction

    # while new bins that belong to the cluster are found, keep expanding
    while new_indices:
        added_indices = []

        for new_index in new_indices:
            for _, row in df.loc[new_index, :].iterrows():
                index = df.index[((df[column_name].isna()) &
                                  (df.x_center >= row.x_center - radius) & (df.x_center <= row.x_center + radius) &
                                  (df.y_center >= row.y_center - radius) & (df.y_center <= row.y_center + radius) &
                                  (df.z_center >= row.z_center - radius) & (df.z_center <= row.z_center + radius) &
                                  dense_enough)]

                # Only keep non-empty results, so that the frontier is empty
                # (and the loop ends) as soon as a pass adds nothing.
                if len(index) == 0:
                    continue

                df.loc[index, column_name] = cluster_id
                added_indices.append(index)

        # Bins are assigned immediately, so the collected indices never
        # overlap and their lengths add up to the number of bins added.
        # (The original printed the number of scanned rows instead.)
        print(f"Added {sum(len(index) for index in added_indices)} new bins")

        new_indices = added_indices

    return df

In [None]:
def recluster0(df, settings, recluster_frac):
    """Recluster, method 0: seed with the fullest bin(s), local threshold.

    For every K-means cluster the threshold is ``recluster_frac`` times the
    count of the fullest bin of that cluster. The cluster is seeded with the
    bin(s) holding that maximum and then grown with ``expand_clusters``.
    The result is stored in the ``new_cluster0`` column.
    """
    amount = settings.to_count_contact
    cluster_ids = df.cluster.dropna().unique()

    df["new_cluster0"] = np.nan

    for cluster_id in cluster_ids:
        in_cluster = df.cluster == cluster_id

        # local maximum of this K-means cluster
        max_bin = df.loc[in_cluster, amount].max()
        fraction = max_bin * recluster_frac
        print("Cluster id: " + str(cluster_id), " max bin: " + str(max_bin) + " fraction for this bin: ", fraction)

        # the starting point is only the fullest bin(s)
        seeds = df.index[in_cluster & (df[amount] == max_bin)]
        df.loc[seeds, "new_cluster0"] = cluster_id

        # grow the cluster from the seeds
        df = expand_clusters(df, [seeds], fraction, cluster_id, 'new_cluster0', settings)

    return df

In [None]:
def recluster1(df, settings, recluster_frac):
    """Recluster, method 1: seed with all bins above the local threshold.

    For every K-means cluster the threshold is ``recluster_frac`` times the
    count of the fullest bin of that cluster. The cluster is seeded with every
    bin of the K-means cluster at or above that threshold and then grown with
    ``expand_clusters``. The result is stored in the ``new_cluster1`` column.
    """
    amount = settings.to_count_contact
    cluster_ids = df.cluster.dropna().unique()

    df["new_cluster1"] = np.nan

    for cluster_id in cluster_ids:
        in_cluster = df.cluster == cluster_id

        # local maximum of this K-means cluster
        max_bin = df.loc[in_cluster, amount].max()
        fraction = max_bin * recluster_frac

        print("Cluster id: " + str(cluster_id), " max bin: " + str(max_bin) + " fraction for this bin: ", fraction)

        # K-means bins that are at least as full as the LOCAL threshold
        seeds = df.index[in_cluster & (df[amount] >= fraction)]
        df.loc[seeds, "new_cluster1"] = cluster_id

        # grow the cluster from the seeds
        df = expand_clusters(df, [seeds], fraction, cluster_id, 'new_cluster1', settings)

    return df

In [None]:
def recluster2(df, settings, recluster_frac):
    """Recluster, method 2: seed with the fullest bin(s), global threshold.

    The threshold is ``recluster_frac`` times the count of the fullest bin of
    the whole frame. Every K-means cluster is seeded with its own fullest
    bin(s) and then grown with ``expand_clusters``. The result is stored in
    the ``new_cluster2`` column.
    """
    amount = settings.to_count_contact
    cluster_ids = df.cluster.dropna().unique()

    # one threshold for all clusters, derived from the global maximum
    fraction = df[amount].max() * recluster_frac

    df["new_cluster2"] = np.nan

    for cluster_id in cluster_ids:
        in_cluster = df.cluster == cluster_id

        max_bin = df.loc[in_cluster, amount].max()
        print("Cluster id: " + str(cluster_id), " max bin: " + str(max_bin) + " fraction for this bin: ", fraction)

        # start only with the fullest bin(s) of this cluster
        seeds = df.index[in_cluster & (df[amount] == max_bin)]
        df.loc[seeds, "new_cluster2"] = cluster_id

        # grow the cluster from the seeds
        df = expand_clusters(df, [seeds], fraction, cluster_id, 'new_cluster2', settings)

    return df

In [None]:
def recluster3(df, settings, recluster_frac):
    """Recluster, method 3: seed with all bins above the global threshold.

    The threshold is ``recluster_frac`` times the count of the fullest bin of
    the whole frame. Every K-means cluster is seeded with its bins at or above
    that threshold and then grown with ``expand_clusters``. The result is
    stored in the ``new_cluster3`` column.
    """
    amount = settings.to_count_contact
    cluster_ids = df.cluster.dropna().unique()

    # one threshold for all clusters, derived from the global maximum
    fraction = df[amount].max() * recluster_frac

    df["new_cluster3"] = np.nan

    for cluster_id in cluster_ids:
        in_cluster = df.cluster == cluster_id

        max_bin = df.loc[in_cluster, amount].max()
        print("Cluster id: " + str(cluster_id), " max bin: " + str(max_bin) + " fraction for this bin: ", fraction)

        # K-means bins that are at least as full as the GLOBAL threshold
        seeds = df.index[in_cluster & (df[amount] >= fraction)]
        df.loc[seeds, "new_cluster3"] = cluster_id

        # grow the cluster from the seeds
        df = expand_clusters(df, [seeds], fraction, cluster_id, 'new_cluster3', settings)

    return df

In [None]:
def calc_directionality(settings, density_df, k, kmeans_frac, recluster_frac):
    """Cluster the density bins and compute the volume available to the contact group.

    Steps:

    1. flag the bins fuller than ``kmeans_frac`` times the fullest bin
       (``find_bins_to_cluster``),
    2. run K-means with ``k`` clusters on the flagged bins (``calc_clusters``),
    3. compute the available volume from the average fragment, padded by the
       contact group radius plus a tolerance of 0.5,
    4. recluster with method 3 (``recluster3``), which adds ``new_cluster3``.

    Parameters
    ----------
    settings
        Object providing the file names and the count column.
    density_df
        Frame with the per-bin counts.
    k
        Number of K-means clusters.
    kmeans_frac
        Fraction of the fullest bin used as threshold for K-means.
    recluster_frac
        Fraction passed to ``recluster3``.

    Returns
    -------
    tuple
        ``(density_df, centroids, available_volume)``.
    """
    # work only with bins that are fuller than kmeans_frac * the fullest bin
    density_df = find_bins_to_cluster(settings=settings, density_df=density_df, fraction=kmeans_frac)

    # calc for each flagged bin in what cluster it belongs
    density_df, centroids = calc_clusters(density_df, k)

    # DataFrame.drop returns a new frame; the original discarded that result,
    # so the helper column was never actually removed.
    density_df = density_df.drop(columns=["to_cluster"])

    # find the volume around the central group that the contact group can occupy
    tolerance = 0.5

    df = pd.read_csv(settings.get_kabsch_aligned_csv_filename())
    avg_fragment = pd.read_csv(settings.get_avg_frag_filename())

    # The return value is not used here.
    # NOTE(review): the call is kept in case make_coordinate_df has side
    # effects on `df` or `avg_fragment` — confirm, and remove it if it has none.
    make_coordinate_df(df, settings, avg_fragment)

    contact_group_radius = get_vdw_distance_contact(df, settings)
    available_volume = find_available_volume(avg_fragment=avg_fragment, extra=(tolerance + contact_group_radius))

    # recluster0, recluster1 and recluster2 are alternative methods; only
    # method 3 is used.
    print("\nRecluster method 3")
    density_df = recluster3(density_df, settings, recluster_frac)

    return density_df, centroids, available_volume

In [None]:
def show_directionality(avg_fragment, df, column, settings, available_volume, plt_name, kfrac):
    """Plot the clusters in ``column``, report their directionality and save the figure.

    For every cluster in ``column``:

    * volume = number of bins * ``settings.resolution`` ** 3
    * fraction = sum of the normalised counts of its bins
    * directionality = fraction / volume * ``available_volume``

    One row per cluster is appended to ``results/single_directionality.csv``
    and the figure is saved to ``plt_name``.

    Parameters
    ----------
    avg_fragment
        Passed unchanged to ``plot_fragment_colored``.
    df
        Frame with the bin centres, the normalised counts and ``column``.
        Its ``cluster_color`` column is overwritten in place.
    column
        Name of the column holding the cluster ids to show.
    settings
        Object providing the group names, ``to_count_contact`` and ``resolution``.
    available_volume
        Volume used to scale the directionality.
    plt_name
        Path the figure is saved to.
    kfrac
        Written to the results file as-is.
    """
    colors = ["red", "green", "blue", "purple", "yellow", "pink", "orange", "grey"]
    normalized_column = settings.to_count_contact + "_normalized"

    df["cluster_color"] = "grey"

    # The modulo keeps the lookup valid when there are more cluster ids than
    # colours (the original raised IndexError); ids below len(colors) map to
    # the same colour as before.
    has_cluster = df[column].notna()
    df.loc[has_cluster, "cluster_color"] = [colors[int(i) % len(colors)]
                                            for i in list(df.loc[has_cluster, column])]
    df = df[df[normalized_column] > 0]

    fig = plt.figure(figsize=(8,8))
    ax: Axes3D = fig.add_subplot(111, projection='3d')

    rest = df[df.cluster_color != "grey"]

    clusters = df[column].dropna().unique()
    print(clusters)
    k = len(clusters)

    txt = "Available volume: + " + str(available_volume) + "\nClusters: " + str(k) + "\n"
    result_rows = []
    for cluster_id in clusters:
        points = df[df[column] == cluster_id]
        firstpoint = points.iloc[0]
        volume = len(points) * settings.resolution**3
        fraction = points[normalized_column].sum()
        directionality = fraction/volume * available_volume

        result_rows.append([settings.central_group_name, settings.contact_group_name, settings.to_count_contact,
                            settings.resolution, kfrac, column, directionality])

        # a single point per cluster, plotted only to get a legend entry
        ax.scatter(firstpoint.x_center, firstpoint.y_center, firstpoint.z_center,
                   label="Cluster: " + str(cluster_id) + " Directionality:" + str(round(directionality, 2)),
                   color=firstpoint.cluster_color)

        txt += 'id: ' + str(cluster_id) + ' vol: ' + str(round(volume,2)) + ' frac: ' + str(round(fraction,2)) + "\n"

    # Open the results file once instead of once per cluster; nothing is
    # touched when there are no clusters, as before.
    if result_rows:
        with open('results/single_directionality.csv', 'a', newline='') as results:
            csv.writer(results).writerows(result_rows)

    fig.text(.05,.05,txt)

    ax.scatter(list(rest.x_center), list(rest.y_center), list(rest.z_center),
               color=list(rest.cluster_color))

    # plot the average fragment
    ax = plot_fragment_colored(ax, avg_fragment)

    ax.set_title("Clusters " + settings.central_group_name + "-" + settings.contact_group_name + ", resolution: " + str(settings.resolution) + "\n" + column)

    ax.set_xlim(-6, 6)
    ax.set_ylim(-6, 6)

    ax.set_xlabel("X coordinate")
    ax.set_ylabel("Y coordinate")

    ax.legend(fontsize='x-small')

    # look (almost) straight down the z axis
    elev = 89
    azim = -89
    ax.view_init(elev=elev, azim=azim)

    # Save before showing and through the figure object: with a blocking
    # backend the figure may already be closed after plt.show(), which
    # results in an empty image.
    fig.savefig(plt_name)

    plt.show()

In [None]:
# Number of K-means clusters (k); the driver loop passes it to calc_directionality.
# The original note says this value is meant for resolution 0.25.
# NOTE(review): the driver cell of this notebook sets resolution = 0.50 —
# confirm that the value is still appropriate.
cluster_amount= 1

In [None]:
def get_plot_name(settings, column, cluster_amount, filledholes=False, kfrac=None, recluster_frac=None):
    """Build the file name of a reclustering plot.

    Parameters
    ----------
    settings
        Object providing ``central_group_name``, ``contact_group_name``,
        ``to_count_contact`` and ``resolution``.
    column
        Name of the cluster column that is plotted.
    cluster_amount
        Number of K-means clusters (``k`` in the file name).
    filledholes
        When True, the name ends in ``_filledholes.png`` instead of ``.png``.
    kfrac, recluster_frac
        Fractions to encode in the name. When omitted, the notebook globals
        of the same name are used, which is what the original always did.

    Returns
    -------
    str
        The path, with every space removed.
    """
    if kfrac is None:
        kfrac = globals()["kfrac"]
    if recluster_frac is None:
        recluster_frac = globals()["recluster_frac"]

    suffix = "_filledholes.png" if filledholes else ".png"

    # There is no underscore between the column and "kfrac"; this is kept so
    # that the names match files written earlier.
    string = (
        f"results/single_cluster_directionality/{settings.central_group_name}/"
        f"{settings.central_group_name}_{settings.contact_group_name}_{settings.to_count_contact}"
        f"_k{cluster_amount}_resolution_{settings.resolution:.2f}"
        f"_{column}kfrac_{kfrac:.2f}_refrac_{recluster_frac:.2f}{suffix}"
    )

    # The original stripped all spaces (including any inside the names).
    return string.replace(" ", "")

def get_cluster_plot_name(settings, column, cluster_amount, kfrac=None):
    """Build the file name of a K-means cluster plot.

    Parameters
    ----------
    settings
        Object providing ``central_group_name``, ``contact_group_name``,
        ``to_count_contact`` and ``resolution``.
    column
        Name of the cluster column that is plotted.
    cluster_amount
        Number of K-means clusters (``k`` in the file name).
    kfrac
        Fraction to encode in the name. When omitted, the notebook global
        ``kfrac`` is used, which is what the original always did.

    Returns
    -------
    str
        The path, with every space removed.
    """
    if kfrac is None:
        kfrac = globals()["kfrac"]

    string = (
        f"results/single_cluster_directionality/{settings.central_group_name}/"
        f"{settings.central_group_name}_{settings.contact_group_name}_{settings.to_count_contact}"
        f"_k{cluster_amount}_resolution_{settings.resolution:.2f}"
        f"_{column}_kfrac{kfrac:.2f}.png"
    )

    # The original stripped all spaces (including any inside the names).
    return string.replace(" ", "")

In [None]:
# Driver: for every central/contact pair, cluster the density, plot the
# K-means clusters and plot the hole-filled result of recluster method 3.
central_groups = ["RCOMe", "RNO2", "ArCI", "NO3", "RC6F5", "H2O", "RC6H5"]
# contact_groups and to_count are parallel lists: entry i of to_count is the
# part of contact group i that is counted.
contact_groups = ["CF", "RCN", "R2CO", "XH", "XH", "CCH3", "C2CH2", "RC6H5", "ArCH"]
to_count =       ["F",   "N",    "O",   "H", "O",  "H",     "H", "centroid", "H"]

# makedirs also creates the parent 'results/' directory and does not fail when
# the directory already exists (os.mkdir raises if the parent is missing).
os.makedirs('results/single_cluster_directionality/', exist_ok=True)

# kfrac and recluster_frac are also read as globals by the plot-name helpers.
kfrac = 0.25
recluster_frac = 0.25

resolution = 0.50

for central in central_groups:
    # one output directory per central group, created once per central group
    os.makedirs('results/single_cluster_directionality/' + central + "/", exist_ok=True)

    for contact, to_count_contact in zip(contact_groups, to_count):
        print(central, contact)
        # NOTE(review): `break` skips RC6H5-RC6H5 and also every contact group
        # after it for this central group (currently ArCH) — confirm whether
        # `continue` was intended.
        if central == "RC6H5" and contact == "RC6H5":
            break

        # NOTE(review): the path uses Windows separators; it is kept
        # byte-identical because Settings may parse the file name — confirm
        # before switching to os.path.join.
        filename = ".\\results\\" + central + "\\" + central + "_" + contact + "_vdw.5" + "\\" + central + "_" + contact + "_vdw.5_aligned.csv"
        settings = Settings(filename)
        settings.set_atom_to_count(to_count_contact)
        settings.set_resolution(round(resolution, 2))

        avg_fragment = pd.read_csv(settings.get_avg_frag_filename())
        density_df = pd.read_hdf(settings.get_density_df_filename(), settings.get_density_df_key())
        print(settings.get_density_df_filename(), settings.get_density_df_key())

        # show the fullest bin(s) as a quick sanity check
        maximum = density_df[to_count_contact].max()
        print(density_df[density_df[to_count_contact] == maximum])

        density_df, centroids, V_available = calc_directionality(settings=settings,
                                                                  density_df=density_df,
                                                                  k=cluster_amount,
                                                                  kmeans_frac=kfrac,
                                                                  recluster_frac=recluster_frac)

        print("Available volume:", V_available)

        # K-means clusters
        plt_name = get_cluster_plot_name(settings, 'cluster', cluster_amount)
        show_directionality(avg_fragment, density_df, 'cluster', settings, V_available, plt_name, kfrac)

        # method 3 reclustering with the holes filled; work on a copy so that
        # density_df keeps the unfilled result
        density_df_filled_cluster = fill_holes(density_df.copy(deep=True), settings.resolution, column='new_cluster3')

        plt_name = get_plot_name(settings, 'new_cluster3', cluster_amount, filledholes=True)
        show_directionality(avg_fragment, density_df_filled_cluster, 'new_cluster3', settings, V_available, plt_name, kfrac)