In [5]:
%matplotlib notebook

# allows for automatic reloading of imports and makes it unnecessary to restart the kernel
# whenever a function is changed
%load_ext autoreload
%autoreload 2

import os
import time
import numpy as np
import csv
import pandas as pd

from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from classes.Settings import Settings

from helpers.alignment_helpers import (alignment_dict, calc_rmse, kabsch_align, perform_rotations,
                                       perform_translation, read_coord_file, read_raw_data)
from align_kabsch import prepare_data, do_kabsch_align

from helpers.geometry_helpers import make_coordinate_df, average_fragment, add_model_methyl
from helpers.density_helpers import prepare_df

# Full list of central groups; it is narrowed to a single group further down.
central_groups = ["RCOMe", "RNO2", "ArCI", "NO3", "RC6F5", "H2O", "RC6H5"]

# Every contact group together with the atom (or "centroid") that is counted for it.
# The two lists derived from these pairs are iterated in lockstep by the cells below.
contact_atom_pairs = [
    ("CF", "F"),
    ("RCN", "N"),
    ("R2CO", "O"),
    ("XH", "H"),
    ("CCH3", "H"),
    ("C2CH2", "H"),
    ("RC6H5", "centroid"),
    ("ArCH", "H"),
]
contact_groups = [contact for contact, _ in contact_atom_pairs]
to_count = [atom for _, atom in contact_atom_pairs]

# Restrict this run to one central group (overrides the full list above).
central_groups = ["RCOMe"]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Count the structures

In [6]:
# Set to True to (re)count the fragments per central/contact pair and append the
# result to results/amounts_structures.csv.
count = False
counts = []


def read_structure_ids(datafile):
    """Return the identifier of every marker line in a .cor file.

    A marker line is any line containing "**"; its identifier is the text in
    front of the first "**". Identifiers are returned in file order and may
    repeat, so ``len(ids)`` counts marker lines and ``len(set(ids))`` counts
    distinct identifiers.

    Parameters
    ----------
    datafile : str
        Path of the .cor file to scan.

    Returns
    -------
    list of str
    """
    with open(datafile, 'r') as corfile:
        return [line.split("**")[0] for line in corfile if "**" in line]


if count:
    results_path = 'results/amounts_structures.csv'

    # The file is opened in append mode so several runs can accumulate rows; the
    # header must therefore only be written when the file is new or still empty,
    # otherwise every run would add another header row in the middle of the data.
    needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0

    with open(results_path, 'a', newline='') as resultsfile:
        writer = csv.writer(resultsfile)
        if needs_header:
            writer.writerow(['central', 'contact', 'amount_cif', 'amount_structures'])

        for central_group in central_groups:
            for contact_group in contact_groups:
                # os.path.join keeps the same relative location but also works
                # on platforms where "\\" is not a path separator.
                datafile = os.path.join(".", "data", central_group,
                                        central_group + "_" + contact_group + "_vdw.5.cor")

                ids = read_structure_ids(datafile)

                print(central_group, contact_group, len(ids), len(set(ids)))
                counts.append(len(ids))

                writer.writerow([central_group, contact_group, len(ids), len(set(ids))])

# Data Gathering

In [9]:
run_everything_again = True

# Opt-in replacement for the csv writer that used to be commented out in this cell.
# When True, one row [central, contact, to_count, coordinate_df time] per pair is
# appended to `timings_csv`. The file is only opened for the moment a row is
# written instead of being held open (unused) for the whole run.
save_coordinate_df_timings = False
timings_csv = 'results/redo_coordinate_df.csv'

if run_everything_again:
    for central_group in central_groups:
        for to_count_contact, contact_group in zip(to_count, contact_groups):

            print(central_group, contact_group, to_count_contact)

            # NOTE(review): the Windows-style path is kept verbatim because it is passed
            # to Settings, whose path handling is not visible from this notebook.
            datafile = ".\\data\\" + central_group + "\\" + central_group + "_" + contact_group + "_vdw.5.cor"

            settings = Settings(datafile)
            settings.set_atom_to_count(to_count_contact)

            # Stage 1: Kabsch alignment of all fragments in the data file.
            # perf_counter is used for all stages because only elapsed time matters.
            stage_start = time.perf_counter()
            do_kabsch_align(datafile, settings)
            alignment_time = time.perf_counter() - stage_start

            # Stage 2: read the aligned fragments back and build the average fragment.
            stage_start = time.perf_counter()
            aligned_fragments_df = pd.read_csv(settings.get_kabsch_aligned_csv_filename())
            avg_frag = average_fragment(aligned_fragments_df, settings)

            if central_group == "RCOMe":
                avg_frag = add_model_methyl(fragment=avg_frag, settings=settings)

            avg_frag.to_csv(settings.get_avg_frag_filename(), index=False)
            avg_fragment_time = time.perf_counter() - stage_start

            # Stage 3: build the coordinate dataframe from the rows labelled "-"
            # (presumably the contact-group atoms; confirm against the alignment output).
            stage_start = time.perf_counter()
            df = aligned_fragments_df[aligned_fragments_df.label == "-"]
            coordinate_df = make_coordinate_df(df, settings, avg_frag)
            coordinate_df_time = time.perf_counter() - stage_start

            print(central_group, contact_group, to_count_contact, alignment_time, avg_fragment_time, coordinate_df_time)

            if save_coordinate_df_timings:
                # Append mode keeps rows of earlier runs; write the header only once.
                needs_header = not os.path.exists(timings_csv) or os.path.getsize(timings_csv) == 0
                with open(timings_csv, 'a', newline='') as resultsfile:
                    writer = csv.writer(resultsfile)
                    if needs_header:
                        writer.writerow(['central', 'contact', 'to_count', 'coordinate_df'])
                    writer.writerow([central_group, contact_group, to_count_contact, coordinate_df_time])

RCOMe CF F
Pandas is reading csv...


  0%|                                                                                         | 0/7354 [00:00<?, ?it/s]

Done
Applying Kabsch Algorithm...


100%|█████████████████████████████████████████████████████████████████████████████| 7354/7354 [00:33<00:00, 217.09it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 2 atom to count:  F
Coordinate df length: 7355
Coordinate df is made, duration: 4.483382225036621 s
RCOMe CF F 40.46534037590027 0.5308012962341309 4.551450490951538
RCOMe RCN N
Pandas is reading csv...


  1%|█                                                                              | 25/1873 [00:00<00:07, 247.18it/s]

Done
Applying Kabsch Algorithm...


100%|█████████████████████████████████████████████████████████████████████████████| 1873/1873 [00:08<00:00, 225.06it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 2 atom to count:  N
Coordinate df length: 1874
Coordinate df is made, duration: 0.11573076248168945 s
RCOMe RCN N 10.278057336807251 0.22608518600463867 0.12371277809143066
RCOMe R2CO O
Pandas is reading csv...
Done


  0%|                                                                              | 28/63129 [00:00<03:54, 268.90it/s]

Applying Kabsch Algorithm...


100%|███████████████████████████████████████████████████████████████████████████| 63129/63129 [05:10<00:00, 203.29it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 2 atom to count:  O
Coordinate df length: 63130
Coordinate df is made, duration: 3.587888717651367 s
RCOMe R2CO O 363.9013981819153 1.9477989673614502 3.6982920169830322
RCOMe XH H
Pandas is reading csv...
Done


  0%|                                                                              | 27/41429 [00:00<02:37, 262.75it/s]

Applying Kabsch Algorithm...


100%|███████████████████████████████████████████████████████████████████████████| 41429/41429 [03:07<00:00, 220.86it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 2 atom to count:  H
Coordinate df length: 41430
Coordinate df is made, duration: 2.2023210525512695 s
RCOMe XH H 220.5341396331787 1.150475025177002 2.2496469020843506
RCOMe CCH3 H
Pandas is reading csv...
Done


  0%|                                                                              | 19/36735 [00:00<03:21, 182.18it/s]

Applying Kabsch Algorithm...


100%|███████████████████████████████████████████████████████████████████████████| 36735/36735 [02:49<00:00, 216.68it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 5 atom to count:  H
Coordinate df length: 110208
Coordinate df is made, duration: 21.120024919509888 s
RCOMe CCH3 H 207.12408137321472 1.7157833576202393 21.209062337875366
RCOMe C2CH2 H
Pandas is reading csv...
Done


  0%|                                                                              | 19/21098 [00:00<01:53, 186.12it/s]

Applying Kabsch Algorithm...


100%|███████████████████████████████████████████████████████████████████████████| 21098/21098 [01:51<00:00, 189.32it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 5 atom to count:  H
Coordinate df length: 42198
Coordinate df is made, duration: 8.546244382858276 s
RCOMe C2CH2 H 138.15983200073242 1.0722732543945312 8.617626905441284
RCOMe RC6H5 centroid
Pandas is reading csv...
Done


  0%|                                                                              | 24/34069 [00:00<02:27, 231.07it/s]

Applying Kabsch Algorithm...


100%|███████████████████████████████████████████████████████████████████████████| 34069/34069 [03:14<00:00, 175.05it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 12 atom to count:  centroid
Coordinate df length: 34070
Coordinate df is made, duration: 23.669304370880127 s
RCOMe RC6H5 centroid 259.3950071334839 2.9902634620666504 23.85352063179016
RCOMe ArCH H
Pandas is reading csv...
Done


  0%|                                                                              | 18/54940 [00:00<05:15, 174.31it/s]

Applying Kabsch Algorithm...


100%|███████████████████████████████████████████████████████████████████████████| 54940/54940 [05:51<00:00, 156.50it/s]


Adding model CH3 group... Done
Searching for nearest atom from central group...
Atoms in contact group: 9 atom to count:  H
Coordinate df length: 164823
Coordinate df is made, duration: 26.36086654663086 s
RCOMe ArCH H 461.28501296043396 2.7009198665618896 26.585445880889893


# Analyzing data

In [None]:
# Per-pair preparation timings (this file is not produced anywhere in this notebook).
df = pd.read_csv('results/pre_density_comp_time.csv')

# Attach the fragment counts written by the counting cell; pairs without a count keep NaN.
pair_keys = ['central', 'contact']
df_count = pd.read_csv('results/amounts_structures.csv')
df = df.merge(df_count, how='left', on=pair_keys)

# Attach the re-measured coordinate_df timings under their own column name.
real_coordinate_time = pd.read_csv('results/redo_coordinate_df.csv')
real_coordinate_time.columns = ['central', 'contact', 'to_count', 'coordinate_df_real']
df = df.merge(real_coordinate_time, how='left', on=pair_keys + ['to_count'])

In [None]:
# One grouped bar chart per central group: three bars (one per preparation stage)
# next to each other for every contact group.
for central_group in central_groups:
    print(central_group)

    group_rows = df[df.central == central_group]
    # (column in df, legend label) for each stage, in drawing order
    stages = [('alignment', 'Alignment'),
              ('avg_fragment', 'avg_fragment'),
              ('coordinate_df', 'coordinate_df')]

    bar_width = 0.25  # the width of the bars
    base_positions = np.arange(len(group_rows))

    # make that plot
    fig, ax = plt.subplots()
    for stage_index, (column, legend_label) in enumerate(stages):
        # shift every stage one bar width to the right of the previous one
        ax.bar(base_positions + stage_index * bar_width, group_rows[column], bar_width, label=legend_label)

    # ticks sit under the middle bar of each group
    ax.set_xticks(base_positions + bar_width)
    ax.set_xticklabels(group_rows['contact'])

    ax.set_title('Prep comp times ' + central_group)
    ax.legend()

    fig.savefig("results/figures/Prep_times_" + central_group + ".svg", format="svg")
    plt.show()

In [None]:
# Add the total preparation time per pair and order the table from slowest to fastest.
# `assign` builds a new frame, so re-running the cell never depends on an earlier mutation.
df = (
    df.assign(total=df["alignment"] + df["avg_fragment"] + df["coordinate_df"])
      .sort_values("total", ascending=False)
)

# `key` is passed by keyword: recent pandas versions no longer accept it positionally.
df.to_hdf('ready_pre_density_comp.hdf', key='key')

In [None]:
# Stacked bar per central-contact pair (one segment per preparation stage), with the
# number of structures per pair drawn as a line on a second y axis.
bar_width = 0.5  # the width of the bars
stage_colors = (("alignment", "tab:blue"),
                ("avg_fragment", "tab:orange"),
                ("coordinate_df", "tab:green"))

fig, ax = plt.subplots(figsize=(9,5))

r1 = []
xtick_labels = []
stage_handles = {}

for position, (_, row) in enumerate(df.iterrows()):
    # make that plot: each stage is stacked on top of the stages drawn before it
    stack_height = 0
    for column, color in stage_colors:
        segment = ax.bar(position, row[column], bar_width, bottom=stack_height, color=color)
        stage_handles[column] = segment[0]  # remembered for the legend
        stack_height = stack_height + row[column]

    r1.append(position)
    xtick_labels.append(row["central"] + "-" + row['contact'])

ax.set_xticks(r1)
ax.set_xticklabels(xtick_labels, rotation=90)

ax.set_title('Prep comp times')
fig.subplots_adjust(bottom=0.3)

ax.set_xlabel("Pair")
ax.set_ylabel("Computational time (s)")

ax2 = ax.twinx()
ax2.set_ylabel("Amount")

line = ax2.plot(range(len(df)), df["amount_structures"], color="red", label="no unique fragments")

plt.legend((stage_handles["alignment"], stage_handles["avg_fragment"], stage_handles["coordinate_df"], line[0]),
           ('alignment', 'avg_fragment', 'coordinate_df', 'No. fragments'))

plt.savefig("results/figures/Prep_times_total.svg", format="svg")

plt.show()

In [None]:
# Timings of the JIT variant of the coordinate_df step, one row per pair.
jit_df = pd.read_csv('results/coordinate_df_jit.csv')

# Left-join the existing timing table onto the JIT timings, derive the JIT total and
# the gain of the JIT variant, and keep the same ordering (by non-JIT total) as `df`.
combined = (
    jit_df
    .merge(df, how='left', on=['contact', 'central', 'to_count'])
    .assign(
        total_jit=lambda frame: frame["alignment"] + frame["avg_fragment"] + frame["coordinate_df_jit"],
        diff=lambda frame: frame["coordinate_df"] - frame["coordinate_df_jit"],
    )
    .sort_values("total", ascending=False)
)

In [None]:
# Stacked bar per central-contact pair using the JIT timing of the coordinate_df
# stage, with the number of structures per pair drawn as a line on a second y axis.
bar_width = 0.5  # the width of the bars
stage_colors = (("alignment", "tab:blue"),
                ("avg_fragment", "tab:orange"),
                ("coordinate_df_jit", "tab:green"))

fig, ax = plt.subplots(figsize=(9,5))

r1 = []
xtick_labels = []
stage_handles = {}

for position, (_, row) in enumerate(combined.iterrows()):
    # make that plot: each stage is stacked on top of the stages drawn before it
    stack_height = 0
    for column, color in stage_colors:
        segment = ax.bar(position, row[column], bar_width, bottom=stack_height, color=color)
        stage_handles[column] = segment[0]  # remembered for the legend
        stack_height = stack_height + row[column]

    r1.append(position)
    xtick_labels.append(row["central"] + "-" + row['contact'])

ax.set_xticks(r1)
ax.set_xticklabels(xtick_labels, rotation=90)

ax.set_title('Prep comp times')
fig.subplots_adjust(bottom=0.3)

ax.set_xlabel("Pair")
ax.set_ylabel("Computational time (s)")

ax2 = ax.twinx()
ax2.set_ylabel("Amount")

# The counts are taken from `combined` (the frame the bars are drawn from) so that the
# line has one point per bar, in the same row order as the bars.
line = ax2.plot(range(len(combined)), combined["amount_structures"], color="red", label="no unique fragments")

plt.legend((stage_handles["alignment"], stage_handles["avg_fragment"], stage_handles["coordinate_df_jit"], line[0]),
           ('alignment', 'avg_fragment', 'coordinate_df_jit', 'No. fragments'))

# Saved under its own name; reusing "Prep_times_total.svg" would overwrite the non-JIT figure.
plt.savefig("results/figures/Prep_times_total_jit.svg", format="svg")

plt.show()

# Some code to check whether the labels are the same in the fragments of all the pairs

In [None]:
central_group = "RC6F5"


def read_labelled_atoms(csvfile):
    """Return the "LAB" column names of a fragment csv and the atom names below them.

    The first line of ``csvfile`` is read as the column names and the second line as
    the values of the first record. Only the positions whose column name contains
    "LAB" are kept.

    Parameters
    ----------
    csvfile : str
        Path of the comma separated file.

    Returns
    -------
    (labels, atoms) : tuple of two equally long lists of str
    """
    with open(csvfile) as handle:
        labels = handle.readline().strip().split(',')
        atoms = handle.readline().strip().split(',')

    kept = [(label, atom) for label, atom in zip(labels, atoms) if "LAB" in label]
    return [label for label, _ in kept], [atom for _, atom in kept]


def read_first_fragment(datafile):
    """Return ``{atom name: [x, y, z]}`` for the first fragment of a .cor file.

    The first line of the file is skipped. Every following line is parsed as
    ``name x y z`` ("%" characters are stripped from the ends of the name) until a
    line containing "**FRAG**" or the end of the file is reached. Blank lines are
    ignored.

    Parameters
    ----------
    datafile : str
        Path of the .cor file.

    Returns
    -------
    dict of str -> list of float
    """
    coordinates = {}
    with open(datafile) as handle:
        handle.readline()  # first line is not an atom line
        for line in handle:
            if "**FRAG**" in line:
                break  # the next fragment starts here
            fields = line.split()
            if not fields:
                continue
            coordinates[fields[0].strip("%")] = [float(fields[1]), float(fields[2]), float(fields[3])]
    return coordinates


def plot_fragment_with_labels(central_group, contact_group):
    """Show a 3D scatter plot of the first fragment of a pair, annotated with its labels.

    The atom names and their "LAB" column names come from the pair's .csv file, the
    coordinates from the first fragment in the pair's .cor file. Both the atom names
    and the coordinate dictionary are printed before plotting.

    Parameters
    ----------
    central_group : str
        Name of the central group; also the name of the data sub-directory.
    contact_group : str
        Name of the contact group.

    Raises
    ------
    KeyError
        If an atom name from the csv file does not occur in the first fragment.
    """
    # NOTE(review): this reads from "../data" while the other cells read from "./data";
    # confirm which working directory this cell is meant to be run from.
    data_dir = os.path.join("..", "data", central_group)
    basename = central_group + "_" + contact_group + "_vdw.5"
    datafile = os.path.join(data_dir, basename + ".cor")
    csvfile = os.path.join(data_dir, basename + ".csv")

    labels, atoms = read_labelled_atoms(csvfile)
    dictionary = read_first_fragment(datafile)

    print(atoms)
    print(dictionary)

    # The first symbol found in the atom name decides the colour; order matters
    # because the check is a plain substring test. Anything else is drawn in black.
    symbol_colors = (('R', 'green'), ('H', 'grey'), ('O', 'red'), ('F', 'orchid'), ('N', 'blue'))

    fig = plt.figure()
    ax: Axes3D = fig.add_subplot(111, projection='3d')

    for label, atom in zip(labels, atoms):
        x, y, z = dictionary[atom]
        color = next((shade for symbol, shade in symbol_colors if symbol in atom), 'black')

        ax.scatter(x, y, z, color=color)

        # small offset so the text does not sit exactly on the marker
        ax.text(x + 0.01, y + 0.01, z + 0.01, label)

    ax.set_xlabel('X axis')
    ax.set_ylabel('Y axis')
    ax.set_zlabel('Z axis')

    plt.title(central_group + "-" + contact_group)
    plt.show()

In [None]:
# Plot the labelled atoms of the first RC6F5-C2CH2 fragment for visual inspection.
plot_fragment_with_labels(central_group="RC6F5", contact_group="C2CH2")