In [1]:
%matplotlib notebook

# Automatically reload imported modules before running code, which makes it unnecessary
# to restart the kernel whenever a function is changed
%load_ext autoreload
%autoreload 2

import os
import time
import numpy as np
import csv
import pandas as pd

from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from classes.Settings import Settings

from helpers.alignment_helpers import (alignment_dict, calc_rmse, kabsch_align, perform_rotations,
                                       perform_translation, read_coord_file, read_raw_data)
from align_kabsch import prepare_data, do_kabsch_align

from helpers.geometry_helpers import make_coordinate_df, average_fragment, add_model_methyl
from helpers.density_helpers import prepare_df

# Complete list of central groups, kept for reference; the cells below only
# iterate over `central_groups`, which is narrowed to a subset further down.
all_central_groups = ["RCOMe", "RNO2", "ArCI", "NO3", "RC6F5", "H2O", "RC6H5"]

# (contact group, atom to count) pairs. Both lists used by the loops below are
# derived from this single table, so their entries always line up one-to-one.
# "XH" is listed twice: once counting H and once counting O.
contact_pairs = [
    ("CF", "F"),
    ("RCN", "N"),
    ("R2CO", "O"),
    ("XH", "H"),
    ("XH", "O"),
    ("CCH3", "H"),
    ("C2CH2", "H"),
    ("RC6H5", "centroid"),
    ("ArCH", "H"),
]
contact_groups = [contact for contact, _atom in contact_pairs]
to_count = [atom for _contact, atom in contact_pairs]

# Central groups processed in this run
central_groups = ["ArCI"]

# Count the structures

In [2]:
# Set to True to count the entries of every central/contact pair and append
# the counts to results/amounts_structures.csv.
count = False
counts = []

if count:
    counts_path = 'results/amounts_structures.csv'

    # The file is opened in append mode, so the header may only be written when
    # the file is new or still empty; otherwise every run would insert another
    # header row in the middle of the data.
    needs_header = not os.path.exists(counts_path) or os.path.getsize(counts_path) == 0

    with open(counts_path, 'a', newline='') as resultsfile:
        writer = csv.writer(resultsfile)
        if needs_header:
            writer.writerow(['central', 'contact', 'amount_cif', 'amount_structures'])

        for central_group in central_groups:
            for to_count_contact, contact_group in zip(to_count, contact_groups):
                # os.path.join yields the same ".\data\..." path on Windows and
                # a valid path on other platforms as well
                datafile = os.path.join(".", "data", central_group,
                                        central_group + "_" + contact_group + "_vdw.5.cor")

                # Every line containing "**" contributes the text in front of
                # the first "**" as an id
                with open(datafile, 'r') as cor_file:
                    ids = [line.split("**")[0] for line in cor_file if "**" in line]

                # Total number of ids and number of distinct ids
                print(central_group, contact_group, len(ids), len(set(ids)))
                counts.append(len(ids))

                writer.writerow([central_group, contact_group, len(ids), len(set(ids))])

# Data Gathering

In [3]:
# Set to True to redo the alignment, average-fragment and coordinate-df steps
# for every central/contact pair and time each step.
run_everything_again = True

# Set to True to append the measured coordinate-df times to
# results/redo_coordinate_df.csv. Off by default, so re-running this cell does
# not write anything (and does not append duplicate rows).
save_coordinate_df_times = False

if run_everything_again:
    # One [central, contact, to_count, coordinate_df_time] row per processed pair
    coordinate_df_times = []

    for central_group in central_groups:
        for to_count_contact, contact_group in zip(to_count, contact_groups):

            print(central_group, contact_group, to_count_contact)

            datafile = ".\\data\\" + central_group + "\\" + central_group + "_" + contact_group + "_vdw.5.cor"

            settings = Settings(datafile)
            settings.set_atom_to_count(to_count_contact)

            # Step 1: Kabsch alignment.
            # perf_counter is used for all timings because it is a monotonic,
            # high-resolution clock meant for measuring intervals.
            t0_alignment = time.perf_counter()
            do_kabsch_align(datafile, settings)
            t1_alignment = time.perf_counter()
            alignment_time = t1_alignment - t0_alignment

            # Step 2: average fragment, computed from the aligned fragments csv
            # (presumably written by do_kabsch_align - confirm)
            t0_avg_frag = time.perf_counter()

            aligned_fragments_df = pd.read_csv(settings.get_kabsch_aligned_csv_filename())
            avg_frag = average_fragment(aligned_fragments_df, settings)

            if central_group == "RCOMe":
                avg_frag = add_model_methyl(fragment=avg_frag, settings=settings)

            avg_frag.to_csv(settings.get_avg_frag_filename(), index=False)
            t1_avg_frag = time.perf_counter()
            avg_fragment_time = t1_avg_frag - t0_avg_frag

            # Step 3: coordinate df, built from the rows labelled "-" only
            t0_coordinate_df = time.perf_counter()
            df = aligned_fragments_df[aligned_fragments_df.label == "-"]

            coordinate_df = make_coordinate_df(df, settings, avg_frag)

            t1_coordinate_df = time.perf_counter()
            coordinate_df_time = t1_coordinate_df - t0_coordinate_df

            print(central_group, contact_group, to_count_contact, alignment_time, avg_fragment_time, coordinate_df_time)

            coordinate_df_times.append([central_group, contact_group, to_count_contact, coordinate_df_time])

    if save_coordinate_df_times and coordinate_df_times:
        times_path = 'results/redo_coordinate_df.csv'

        # Append mode: only write the header when the file is new or empty
        needs_header = not os.path.exists(times_path) or os.path.getsize(times_path) == 0

        with open(times_path, 'a', newline='') as resultsfile:
            writer = csv.writer(resultsfile)
            if needs_header:
                writer.writerow(['central', 'contact', 'to_count', 'coordinate_df'])
            writer.writerows(coordinate_df_times)

ArCI CF F
14 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-']
Pandas is reading csv...


  2%|█▋                                                                           | 155/6992 [00:00<00:04, 1538.32it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 6992/6992 [00:04<00:00, 1522.47it/s]


ArCI CF F 7.304219484329224 0.1765282154083252 0.07069087028503418
ArCI RCN N
14 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-']
Pandas is reading csv...


 22%|█████████████████                                                             | 145/664 [00:00<00:00, 1439.02it/s]

Done
Applying Kabsch Algorithm...


100%|██████████████████████████████████████████████████████████████████████████████| 664/664 [00:00<00:00, 1437.96it/s]


ArCI RCN N 0.7868542671203613 0.046915292739868164 0.009979009628295898
ArCI R2CO O
14 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-']
Pandas is reading csv...


  4%|███▏                                                                         | 142/3372 [00:00<00:02, 1409.01it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 3372/3372 [00:02<00:00, 1236.84it/s]


ArCI R2CO O 4.130589246749878 0.1356370449066162 0.016994714736938477
ArCI XH H
14 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-']
Pandas is reading csv...


  4%|██▉                                                                          | 134/3498 [00:00<00:02, 1330.29it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 3498/3498 [00:02<00:00, 1507.05it/s]


ArCI XH H 3.7618885040283203 0.10373687744140625 0.017939329147338867
ArCI XH O
14 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-']
Pandas is reading csv...


  4%|███▎                                                                         | 150/3498 [00:00<00:02, 1489.91it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 3498/3498 [00:02<00:00, 1491.89it/s]


Searching for nearest atom from central group...
Atoms in contact group: 2 atom to count:  O
Coordinate df is made, duration: 1.7864902019500732 s
ArCI XH O 3.7947628498077393 0.1047203540802002 1.7905211448669434
ArCI CCH3 H
17 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-', '-', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-', '-', '-', '-']
Pandas is reading csv...


  4%|███                                                                          | 134/3422 [00:00<00:02, 1329.86it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 3422/3422 [00:02<00:00, 1438.61it/s]


ArCI CCH3 H 3.982665777206421 0.12267231941223145 0.018948793411254883
ArCI C2CH2 H
17 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-', '-', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-', '-', '-', '-']
Pandas is reading csv...


 15%|███████████▎                                                                 | 304/2062 [00:00<00:01, 1512.86it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 2062/2062 [00:01<00:00, 1466.77it/s]


Searching for nearest atom from central group...
Atoms in contact group: 5 atom to count:  H
Coordinate df is made, duration: 0.22739076614379883 s
ArCI C2CH2 H 2.39520001411438 0.11868119239807129 0.23238039016723633
ArCI RC6H5 centroid
23 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-']
Pandas is reading csv...


  4%|██▊                                                                          | 109/3015 [00:00<00:02, 1082.06it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 3015/3015 [00:02<00:00, 1279.33it/s]


ArCI RC6H5 centroid 4.555152654647827 0.17154169082641602 0.01795053482055664
ArCI ArCH H
21 12
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'LAB8', 'LAB12', 'LAB11', 'LAB10', 'LAB9', '-', '-', '-', '-', '-', '-', '-', '-', '-']
{'center': 'LAB1', 'yaxis': 'LAB3', 'xyplane': 'LAB5', 'R': 'LAB8-LAB9-LAB10-LAB11-LAB12', 'LAB8': 'R1', 'LAB9': 'R2', 'LAB10': 'R3', 'LAB11': 'R4', 'LAB12': 'R5'}
['LAB1', 'LAB6', 'LAB4', 'LAB5', 'LAB2', 'LAB3', 'LAB7', 'R1', 'R5', 'R4', 'R3', 'R2', '-', '-', '-', '-', '-', '-', '-', '-', '-']
Pandas is reading csv...


  2%|█▏                                                                           | 125/7942 [00:00<00:06, 1240.88it/s]

Done
Applying Kabsch Algorithm...


100%|████████████████████████████████████████████████████████████████████████████| 7942/7942 [00:06<00:00, 1233.14it/s]


Searching for nearest atom from central group...
Atoms in contact group: 9 atom to count:  H
Coordinate df is made, duration: 0.9680471420288086 s
ArCI ArCH H 11.716521978378296 0.31229138374328613 0.9830121994018555


# Analyzing data

In [None]:
# Columns identifying a central/contact pair, and a pair plus its counted atom
pair_keys = ['central', 'contact']
timing_keys = pair_keys + ['to_count']

# Start from the timing table and attach the structure counts of every pair
df = pd.read_csv('results/pre_density_comp_time.csv')

df_count = pd.read_csv('results/amounts_structures.csv')
df = df.merge(df_count, how='left', on=pair_keys)

# Attach the re-measured coordinate-df times under their own column name
real_coordinate_time = pd.read_csv('results/redo_coordinate_df.csv')
real_coordinate_time.columns = timing_keys + ['coordinate_df_real']

df = df.merge(real_coordinate_time, how='left', on=timing_keys)

In [None]:
bar_width = 0.25  # the width of the bars

# (column in df, legend label) of the three series drawn side by side
step_series = [
    ('alignment', 'Alignment'),
    ('avg_fragment', 'avg_fragment'),
    ('coordinate_df', 'coordinate_df'),
]

for central_group in central_groups:
    print(central_group)

    # Select the rows of this central group once and reuse them for every series
    group_rows = df[df.central == central_group]
    positions = np.arange(len(group_rows))

    # make that plot
    fig, ax = plt.subplots()
    for offset, (column, legend_label) in enumerate(step_series):
        # Shift every series one bar width to the right of the previous one
        ax.bar(positions + offset * bar_width, group_rows[column], bar_width, label=legend_label)

    # Centre the tick of every contact group under its middle bar
    ax.set_xticks(positions + bar_width)
    ax.set_xticklabels(group_rows['contact'])

    ax.set_title('Prep comp times ' + central_group)
    ax.legend()

    fig.savefig("results/figures/Prep_times_" + central_group + ".svg", format="svg")
    plt.show()

In [None]:
# sort df
df["total"] = df["alignment"] + df["avg_fragment"] + df["coordinate_df"]
df = df.sort_values("total", ascending=False)

df.to_hdf('ready_pre_density_comp.hdf', 'key')

In [None]:
bar_width = 0.5  # the width of the bars

# One x position and one "central-contact" tick label per row of df
r1 = list(range(len(df)))
xtick_labels = list(df["central"] + "-" + df["contact"])

alignment_times = df['alignment'].values
avg_fragment_times = df['avg_fragment'].values
coordinate_df_times = df['coordinate_df'].values

fig, ax = plt.subplots(figsize=(9,5))

# make that plot: the three steps are stacked on top of each other per row
rects1 = ax.bar(r1, alignment_times, bar_width, color="tab:blue")
rects2 = ax.bar(r1, avg_fragment_times, bar_width, bottom=alignment_times, color="tab:orange")
rects3 = ax.bar(r1, coordinate_df_times, bar_width,
                bottom=alignment_times + avg_fragment_times, color="tab:green")

ax.set_xticks(r1)
ax.set_xticklabels(xtick_labels, rotation=90)

ax.set_title('Prep comp times')
# Leave room below the axes for the rotated tick labels
fig.subplots_adjust(bottom=0.3)

ax.set_xlabel("Pair")
ax.set_ylabel("Computational time (s)")

# Second y axis for the amount_structures column, drawn as a line over the bars
ax2 = ax.twinx()
ax2.set_ylabel("Amount")

line = ax2.plot(range(len(df)), df["amount_structures"], color="red", label="no unique fragments")

ax2.legend((rects1[0], rects2[0], rects3[0], line[0]),
           ('alignment', 'avg_fragment', 'coordinate_df', 'No. fragments'))

fig.savefig("results/figures/Prep_times_total.svg", format="svg")

plt.show()

In [None]:
jit_df = pd.read_csv('results/coordinate_df_jit.csv')

# Attach the timings already collected in df to every row of the jit table
merge_keys = ['contact', 'central', 'to_count']

combined = (
    jit_df
    .merge(df, how='left', on=merge_keys)
    # Total preparation time when the jit coordinate-df time is used
    .assign(total_jit=lambda frame: frame["alignment"] + frame["avg_fragment"] + frame["coordinate_df_jit"])
    # Ordered by the original (non-jit) total
    .sort_values("total", ascending=False)
    # Difference between the original and the jit coordinate-df time
    .assign(diff=lambda frame: frame["coordinate_df"] - frame["coordinate_df_jit"])
)

In [None]:
bar_width = 0.5  # the width of the bars

# One x position and one "central-contact" tick label per row of combined
r1 = list(range(len(combined)))
xtick_labels = list(combined["central"] + "-" + combined["contact"])

alignment_times = combined['alignment'].values
avg_fragment_times = combined['avg_fragment'].values
coordinate_df_jit_times = combined['coordinate_df_jit'].values

fig, ax = plt.subplots(figsize=(9,5))

# make that plot: the three steps are stacked on top of each other per row
rects1 = ax.bar(r1, alignment_times, bar_width, color="tab:blue")
rects2 = ax.bar(r1, avg_fragment_times, bar_width, bottom=alignment_times, color="tab:orange")
rects3 = ax.bar(r1, coordinate_df_jit_times, bar_width,
                bottom=alignment_times + avg_fragment_times, color="tab:green")

ax.set_xticks(r1)
ax.set_xticklabels(xtick_labels, rotation=90)

ax.set_title('Prep comp times')
# Leave room below the axes for the rotated tick labels
fig.subplots_adjust(bottom=0.3)

ax.set_xlabel("Pair")
ax.set_ylabel("Computational time (s)")

ax2 = ax.twinx()
ax2.set_ylabel("Amount")

# The line is taken from `combined`, the same frame the bars are drawn from,
# so every point sits above the bar of its own row even when `combined` and
# `df` differ in length or row order.
line = ax2.plot(r1, combined["amount_structures"], color="red", label="no unique fragments")

ax2.legend((rects1[0], rects2[0], rects3[0], line[0]),
           ('alignment', 'avg_fragment', 'coordinate_df_jit', 'No. fragments'))

# Saved under its own name so it does not overwrite Prep_times_total.svg,
# the figure with the non-jit coordinate_df times
fig.savefig("results/figures/Prep_times_total_jit.svg", format="svg")

plt.show()

# Check whether the atom labels are the same in the fragments of all the pairs

In [None]:
central_group = "RC6F5"


def _read_labelled_atoms(csvfile):
    """Return (labels, atoms) from the first two lines of csvfile, keeping only
    the columns whose label contains "LAB"."""
    with open(csvfile) as fp:
        labels = fp.readline().strip().split(',')
        atoms = fp.readline().strip().split(',')

    labelled = [(label, atom) for label, atom in zip(labels, atoms) if "LAB" in label]
    return [label for label, _ in labelled], [atom for _, atom in labelled]


def _read_first_fragment(datafile):
    """Return {atom name: [x, y, z]} for the lines between the first line of
    datafile and the next line containing "**FRAG**" (or the end of the file)."""
    coordinates = {}

    with open(datafile) as fp:
        fp.readline()  # the first line is skipped without being parsed

        for line in fp:
            if "**FRAG**" in line:
                break

            information = line.split()
            if not information:
                continue  # blank line, nothing to parse

            x, y, z = float(information[1]), float(information[2]), float(information[3])
            coordinates[information[0].strip("%")] = [x, y, z]

    return coordinates


def _atom_color(atom):
    """Return the marker colour for an atom name; the first matching letter wins."""
    for letter, color in (('R', 'green'), ('H', 'grey'), ('O', 'red'), ('F', 'orchid'), ('N', 'blue')):
        if letter in atom:
            return color
    return 'black'


def plot_fragment_with_labels(central_group, contact_group, data_root=os.path.join("..", "data")):
    """Plot the first fragment of a central/contact pair in 3D, annotating
    every labelled atom with its label.

    Reads <data_root>/<central_group>/<central_group>_<contact_group>_vdw.5.csv
    for the labels and atom names, and the matching .cor file for the
    coordinates. The atom names and the parsed coordinates are printed.

    Parameters
    ----------
    central_group : str
        Name of the central group, e.g. "RC6F5".
    contact_group : str
        Name of the contact group, e.g. "C2CH2".
    data_root : str, optional
        Directory containing one sub-directory per central group. Defaults to
        the "data" directory next to the current working directory's parent.

    Raises
    ------
    KeyError
        If a labelled atom from the csv file has no coordinates in the first
        fragment of the .cor file.
    """
    basename = central_group + "_" + contact_group + "_vdw.5"
    datafile = os.path.join(data_root, central_group, basename + ".cor")
    csvfile = os.path.join(data_root, central_group, basename + ".csv")

    labels, atoms = _read_labelled_atoms(csvfile)
    dictionary = _read_first_fragment(datafile)

    print(atoms)
    print(dictionary)

    fig = plt.figure()
    ax: Axes3D = fig.add_subplot(111, projection='3d')

    for label, atom in zip(labels, atoms):
        x, y, z = dictionary[atom]

        ax.scatter(x, y, z, color=_atom_color(atom))

        # Offset the text slightly so it does not sit on top of the marker
        ax.text(x+0.01, y+0.01, z+0.01, label)

    ax.set_xlabel('X axis')
    ax.set_ylabel('Y axis')
    ax.set_zlabel('Z axis')

    plt.title(central_group + "-" + contact_group)
    plt.show()

In [None]:
# Show the labelled first fragment of the RC6F5 / C2CH2 pair
plot_fragment_with_labels("RC6F5", "C2CH2")