# Data Processing

In [1]:
from pathlib import Path
import pandas as pd
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import json

from processing_utils import xyz_to_df, pairwise_distance, average_abs_dist, xyz_to_np, read_energy_gradient

HARTREE_TO_KCAL_MOL = 627.509

In [2]:
experiments_path = Path("experiments_no_opt")
processed_results_path = Path("processed_results")
molecules_path = Path("uniques_100_molecules_42_seed")

## Parse Results

### Parse Experiment Results from xtb Output Files

In [3]:
results_df = pd.DataFrame(
    columns=[
        "molecule",
        "parameter",
        "factor",
        "parameter value",
        "atomisation energy (Hartrees)",
        "atomisation energy (kcal/mol)",
        "total energy (Hartrees)",
        "total energy (kcal/mol)",
        # "pairwise distance",
        # "atoms",
        "max atomic force",
    ]
)

for parameter_path in tqdm(
    experiments_path.glob("*"), total=len(list(experiments_path.glob("*")))
):
    for run_path in parameter_path.glob("*"):
        parameter_factor, parameter_value = run_path.name.split("_")
        num_skipped = 0
        for molecule in run_path.glob("*"):
            if not molecule.is_dir():
                continue

            # Atomisation energy
            with open(molecule / "stdout.txt") as f:
                output_file = f.readlines()
            total_energy, atomisation_energy = None, None
            for line in output_file:
                if total_energy is not None and atomisation_energy is not None:
                    break
                elif "total energy" in line:
                    total_energy = float(re.sub(r"[^\d.]", "", line))
                elif "atomisation energy" in line:
                    atomisation_energy = float(re.sub(r"[^\d.]", "", line))

            # Geometry
            pairwise_distances_arr = None
            if (molecule / "xtbopt.xyz").exists():
                geometry_arr, atoms = xyz_to_np(molecule / "xtbopt.xyz")
                pairwise_distances_arr = pairwise_distance(geometry_arr)

            # Force
            if (molecule / f"{molecule.stem}.engrad").exists():
                _, _, _, atomic_forces = read_energy_gradient(
                    molecule / f"{molecule.stem}.engrad"
                )

            if (
                atomisation_energy is None
                or total_energy is None
                # or pairwise_distances_arr is None
                or atomic_forces is None
            ):
                num_skipped += 1
                continue

            results_df.loc[len(results_df)] = [
                molecule.name,
                parameter_path.name,
                parameter_factor,
                parameter_value,
                atomisation_energy,
                atomisation_energy * HARTREE_TO_KCAL_MOL,
                total_energy,
                total_energy * HARTREE_TO_KCAL_MOL,
                # json.dumps(pairwise_distances_arr.tolist()),
                # atoms,
                np.max(np.abs(atomic_forces)),
            ]
        print(
            f"Skipped {num_skipped} / {len(list(run_path.glob('*')))} molecules in {parameter_path.name} with parameter {parameter_factor} and value {parameter_value}."
        )

  0%|          | 0/25 [00:00<?, ?it/s]

Skipped 0 / 89 molecules in gam3d2 with parameter 1.5 and value 0.375.
Skipped 0 / 89 molecules in gam3d2 with parameter 1.4 and value 0.35.
Skipped 0 / 89 molecules in gam3d2 with parameter 1.3 and value 0.325.
Skipped 0 / 89 molecules in gam3d2 with parameter 0.9 and value 0.225.
Skipped 0 / 89 molecules in gam3d2 with parameter 1.1 and value 0.275.
Skipped 0 / 89 molecules in gam3d2 with parameter 1.2 and value 0.3.
Skipped 0 / 89 molecules in gam3d2 with parameter 0.5 and value 0.125.
Skipped 0 / 89 molecules in gam3d2 with parameter 0.7 and value 0.175.
Skipped 0 / 89 molecules in gam3d2 with parameter 0.6 and value 0.15.


  4%|▍         | 1/25 [00:00<00:23,  1.04it/s]

Skipped 0 / 89 molecules in gam3d2 with parameter 0.8 and value 0.2.
Skipped 0 / 89 molecules in gam3d2 with parameter 1.0 and value 0.25.
Skipped 0 / 89 molecules in ks with parameter 1.3 and value 2.405.
Skipped 0 / 89 molecules in ks with parameter 0.6 and value 1.11.
Skipped 0 / 89 molecules in ks with parameter 0.7 and value 1.295.
Skipped 0 / 89 molecules in ks with parameter 0.8 and value 1.48.
Skipped 0 / 89 molecules in ks with parameter 1.2 and value 2.22.
Skipped 0 / 89 molecules in ks with parameter 0.9 and value 1.665.
Skipped 0 / 89 molecules in ks with parameter 1.0 and value 1.85.
Skipped 1 / 89 molecules in ks with parameter 1.4 and value 2.59.
Skipped 3 / 89 molecules in ks with parameter 1.5 and value 2.775.
Skipped 0 / 89 molecules in ks with parameter 0.5 and value 0.925.


  8%|▊         | 2/25 [00:01<00:22,  1.03it/s]

Skipped 0 / 89 molecules in ks with parameter 1.1 and value 2.035.
Skipped 0 / 89 molecules in kp with parameter 1.3 and value 2.899.
Skipped 0 / 89 molecules in kp with parameter 0.7 and value 1.561.
Skipped 0 / 89 molecules in kp with parameter 0.6 and value 1.338.
Skipped 1 / 89 molecules in kp with parameter 1.5 and value 3.345.
Skipped 0 / 89 molecules in kp with parameter 1.4 and value 3.122.
Skipped 0 / 89 molecules in kp with parameter 1.2 and value 2.676.
Skipped 2 / 89 molecules in kp with parameter 0.5 and value 1.115.
Skipped 0 / 89 molecules in kp with parameter 1.1 and value 2.453.


 12%|█▏        | 3/25 [00:02<00:21,  1.04it/s]

Skipped 0 / 89 molecules in kp with parameter 0.8 and value 1.784.
Skipped 0 / 89 molecules in kp with parameter 0.9 and value 2.007.
Skipped 0 / 89 molecules in kp with parameter 1.0 and value 2.23.
Skipped 0 / 89 molecules in kpd with parameter 0.6 and value 1.2.
Skipped 0 / 89 molecules in kpd with parameter 0.7 and value 1.4.
Skipped 0 / 89 molecules in kpd with parameter 1.5 and value 3.0.
Skipped 0 / 89 molecules in kpd with parameter 1.4 and value 2.8.
Skipped 0 / 89 molecules in kpd with parameter 0.5 and value 1.0.
Skipped 0 / 89 molecules in kpd with parameter 1.3 and value 2.6.
Skipped 0 / 89 molecules in kpd with parameter 0.8 and value 1.6.
Skipped 0 / 89 molecules in kpd with parameter 1.2 and value 2.4.
Skipped 0 / 89 molecules in kpd with parameter 0.9 and value 1.8.


 16%|█▌        | 4/25 [00:03<00:20,  1.01it/s]

Skipped 0 / 89 molecules in kpd with parameter 1.0 and value 2.0.
Skipped 0 / 89 molecules in kpd with parameter 1.1 and value 2.2.
Skipped 0 / 89 molecules in a1 with parameter 1.0 and value 0.52.
Skipped 0 / 89 molecules in a1 with parameter 1.1 and value 0.572.
Skipped 0 / 89 molecules in a1 with parameter 1.4 and value 0.728.
Skipped 0 / 89 molecules in a1 with parameter 0.8 and value 0.416.
Skipped 0 / 89 molecules in a1 with parameter 1.3 and value 0.676.
Skipped 0 / 89 molecules in a1 with parameter 0.6 and value 0.312.
Skipped 0 / 89 molecules in a1 with parameter 0.9 and value 0.468.
Skipped 0 / 89 molecules in a1 with parameter 0.5 and value 0.26.
Skipped 0 / 89 molecules in a1 with parameter 1.2 and value 0.624.
Skipped 0 / 89 molecules in a1 with parameter 1.5 and value 0.78.


 20%|██        | 5/25 [00:04<00:19,  1.00it/s]

Skipped 0 / 89 molecules in a1 with parameter 0.7 and value 0.364.
Skipped 0 / 89 molecules in aesexp with parameter 1.3 and value 5.2.
Skipped 0 / 89 molecules in aesexp with parameter 1.4 and value 5.6.
Skipped 0 / 89 molecules in aesexp with parameter 0.8 and value 3.2.
Skipped 0 / 89 molecules in aesexp with parameter 0.7 and value 2.8.
Skipped 0 / 89 molecules in aesexp with parameter 1.2 and value 4.8.
Skipped 0 / 89 molecules in aesexp with parameter 0.6 and value 2.4.
Skipped 0 / 89 molecules in aesexp with parameter 1.5 and value 6.0.
Skipped 0 / 89 molecules in aesexp with parameter 0.5 and value 2.0.


 24%|██▍       | 6/25 [00:05<00:19,  1.01s/it]

Skipped 0 / 89 molecules in aesexp with parameter 0.9 and value 3.6.
Skipped 0 / 89 molecules in aesexp with parameter 1.0 and value 4.0.
Skipped 0 / 89 molecules in aesexp with parameter 1.1 and value 4.4.
Skipped 0 / 89 molecules in gam3p with parameter 0.9 and value 0.45.
Skipped 0 / 89 molecules in gam3p with parameter 0.8 and value 0.4.
Skipped 0 / 89 molecules in gam3p with parameter 1.2 and value 0.6.
Skipped 0 / 89 molecules in gam3p with parameter 0.6 and value 0.3.
Skipped 0 / 89 molecules in gam3p with parameter 1.4 and value 0.7.
Skipped 0 / 89 molecules in gam3p with parameter 1.3 and value 0.65.
Skipped 0 / 89 molecules in gam3p with parameter 0.5 and value 0.25.
Skipped 0 / 89 molecules in gam3p with parameter 1.0 and value 0.5.
Skipped 0 / 89 molecules in gam3p with parameter 0.7 and value 0.35.


 28%|██▊       | 7/25 [00:07<00:18,  1.03s/it]

Skipped 0 / 89 molecules in gam3p with parameter 1.5 and value 0.75.
Skipped 0 / 89 molecules in gam3p with parameter 1.1 and value 0.55.
Skipped 0 / 89 molecules in gam3d1 with parameter 1.5 and value 0.375.
Skipped 0 / 89 molecules in gam3d1 with parameter 1.4 and value 0.35.
Skipped 0 / 89 molecules in gam3d1 with parameter 1.3 and value 0.325.
Skipped 0 / 89 molecules in gam3d1 with parameter 0.9 and value 0.225.
Skipped 0 / 89 molecules in gam3d1 with parameter 1.1 and value 0.275.
Skipped 0 / 89 molecules in gam3d1 with parameter 1.2 and value 0.3.
Skipped 0 / 89 molecules in gam3d1 with parameter 0.5 and value 0.125.
Skipped 0 / 89 molecules in gam3d1 with parameter 0.7 and value 0.175.
Skipped 0 / 89 molecules in gam3d1 with parameter 0.6 and value 0.15.
Skipped 0 / 89 molecules in gam3d1 with parameter 0.8 and value 0.2.


 32%|███▏      | 8/25 [00:08<00:17,  1.03s/it]

Skipped 0 / 89 molecules in gam3d1 with parameter 1.0 and value 0.25.
Skipped 0 / 89 molecules in s8 with parameter 1.2 and value 3.24.
Skipped 0 / 89 molecules in s8 with parameter 0.7 and value 1.89.
Skipped 0 / 89 molecules in s8 with parameter 1.4 and value 3.78.
Skipped 0 / 89 molecules in s8 with parameter 0.9 and value 2.43.
Skipped 0 / 89 molecules in s8 with parameter 1.3 and value 3.51.
Skipped 0 / 89 molecules in s8 with parameter 1.0 and value 2.7.
Skipped 0 / 89 molecules in s8 with parameter 0.5 and value 1.35.
Skipped 0 / 89 molecules in s8 with parameter 1.1 and value 2.97.
Skipped 0 / 89 molecules in s8 with parameter 0.6 and value 1.62.


 36%|███▌      | 9/25 [00:09<00:16,  1.05s/it]

Skipped 0 / 89 molecules in s8 with parameter 0.8 and value 2.16.
Skipped 0 / 89 molecules in s8 with parameter 1.5 and value 4.05.
Skipped 0 / 89 molecules in ipeashift with parameter 1.4 and value 2.492966.
Skipped 0 / 89 molecules in ipeashift with parameter 0.7 and value 1.246483.
Skipped 0 / 89 molecules in ipeashift with parameter 1.5 and value 2.671035.
Skipped 0 / 89 molecules in ipeashift with parameter 1.0 and value 1.78069.
Skipped 0 / 89 molecules in ipeashift with parameter 1.3 and value 2.314897.
Skipped 0 / 89 molecules in ipeashift with parameter 1.1 and value 1.958759.
Skipped 0 / 89 molecules in ipeashift with parameter 0.6 and value 1.068414.
Skipped 0 / 89 molecules in ipeashift with parameter 1.2 and value 2.136828.


 40%|████      | 10/25 [00:10<00:16,  1.09s/it]

Skipped 0 / 89 molecules in ipeashift with parameter 0.8 and value 1.424552.
Skipped 0 / 89 molecules in ipeashift with parameter 0.9 and value 1.602621.
Skipped 0 / 89 molecules in ipeashift with parameter 0.5 and value 0.890345.
Skipped 0 / 89 molecules in aesshift with parameter 0.9 and value 1.08.
Skipped 0 / 89 molecules in aesshift with parameter 0.8 and value 0.96.
Skipped 0 / 89 molecules in aesshift with parameter 1.0 and value 1.2.
Skipped 0 / 89 molecules in aesshift with parameter 1.5 and value 1.8.
Skipped 0 / 89 molecules in aesshift with parameter 1.4 and value 1.68.
Skipped 0 / 89 molecules in aesshift with parameter 0.5 and value 0.6.
Skipped 0 / 89 molecules in aesshift with parameter 0.6 and value 0.72.
Skipped 0 / 89 molecules in aesshift with parameter 1.2 and value 1.44.
Skipped 0 / 89 molecules in aesshift with parameter 0.7 and value 0.84.


 44%|████▍     | 11/25 [00:11<00:15,  1.10s/it]

Skipped 0 / 89 molecules in aesshift with parameter 1.3 and value 1.56.
Skipped 0 / 89 molecules in aesshift with parameter 1.1 and value 1.32.
Skipped 0 / 89 molecules in aesdmp5 with parameter 1.3 and value 5.2.
Skipped 0 / 89 molecules in aesdmp5 with parameter 1.4 and value 5.6.
Skipped 0 / 89 molecules in aesdmp5 with parameter 0.8 and value 3.2.
Skipped 0 / 89 molecules in aesdmp5 with parameter 0.7 and value 2.8.
Skipped 0 / 89 molecules in aesdmp5 with parameter 1.2 and value 4.8.
Skipped 0 / 89 molecules in aesdmp5 with parameter 0.6 and value 2.4.
Skipped 0 / 89 molecules in aesdmp5 with parameter 1.5 and value 6.0.
Skipped 0 / 89 molecules in aesdmp5 with parameter 0.5 and value 2.0.
Skipped 0 / 89 molecules in aesdmp5 with parameter 0.9 and value 3.6.
Skipped 0 / 89 molecules in aesdmp5 with parameter 1.0 and value 4.0.


 48%|████▊     | 12/25 [00:12<00:14,  1.12s/it]

Skipped 0 / 89 molecules in aesdmp5 with parameter 1.1 and value 4.4.
Skipped 0 / 89 molecules in kexp with parameter 1.4 and value 2.1.
Skipped 0 / 89 molecules in kexp with parameter 0.5 and value 0.75.
Skipped 0 / 89 molecules in kexp with parameter 0.6 and value 0.9.
Skipped 0 / 89 molecules in kexp with parameter 1.0 and value 1.5.
Skipped 0 / 89 molecules in kexp with parameter 1.3 and value 1.95.
Skipped 0 / 89 molecules in kexp with parameter 1.5 and value 2.25.
Skipped 0 / 89 molecules in kexp with parameter 0.7 and value 1.05.
Skipped 0 / 89 molecules in kexp with parameter 1.2 and value 1.8.
Skipped 0 / 89 molecules in kexp with parameter 0.9 and value 1.35.


 52%|█████▏    | 13/25 [00:13<00:13,  1.15s/it]

Skipped 0 / 89 molecules in kexp with parameter 1.1 and value 1.65.
Skipped 0 / 89 molecules in kexp with parameter 0.8 and value 1.2.
Skipped 0 / 89 molecules in gam3s with parameter 0.9 and value 0.9.
Skipped 0 / 89 molecules in gam3s with parameter 1.1 and value 1.1.
Skipped 0 / 89 molecules in gam3s with parameter 0.5 and value 0.5.
Skipped 0 / 89 molecules in gam3s with parameter 1.3 and value 1.3.
Skipped 0 / 89 molecules in gam3s with parameter 0.7 and value 0.7.
Skipped 0 / 89 molecules in gam3s with parameter 0.6 and value 0.6.
Skipped 0 / 89 molecules in gam3s with parameter 1.5 and value 1.5.
Skipped 0 / 89 molecules in gam3s with parameter 1.0 and value 1.0.
Skipped 0 / 89 molecules in gam3s with parameter 1.2 and value 1.2.
Skipped 0 / 89 molecules in gam3s with parameter 0.8 and value 0.8.


 56%|█████▌    | 14/25 [00:15<00:12,  1.17s/it]

Skipped 0 / 89 molecules in gam3s with parameter 1.4 and value 1.4.
Skipped 0 / 89 molecules in alphaj with parameter 0.6 and value 1.2.
Skipped 0 / 89 molecules in alphaj with parameter 0.7 and value 1.4.
Skipped 0 / 89 molecules in alphaj with parameter 1.5 and value 3.0.
Skipped 0 / 89 molecules in alphaj with parameter 1.4 and value 2.8.
Skipped 0 / 89 molecules in alphaj with parameter 0.5 and value 1.0.
Skipped 0 / 89 molecules in alphaj with parameter 1.3 and value 2.6.
Skipped 0 / 89 molecules in alphaj with parameter 0.8 and value 1.6.
Skipped 0 / 89 molecules in alphaj with parameter 1.2 and value 2.4.
Skipped 0 / 89 molecules in alphaj with parameter 0.9 and value 1.8.


 60%|██████    | 15/25 [00:16<00:12,  1.22s/it]

Skipped 0 / 89 molecules in alphaj with parameter 1.0 and value 2.0.
Skipped 0 / 89 molecules in alphaj with parameter 1.1 and value 2.2.
Skipped 0 / 89 molecules in kexplight with parameter 0.9 and value 0.9.
Skipped 0 / 89 molecules in kexplight with parameter 1.1 and value 1.1.
Skipped 0 / 89 molecules in kexplight with parameter 0.5 and value 0.5.
Skipped 0 / 89 molecules in kexplight with parameter 1.3 and value 1.3.
Skipped 0 / 89 molecules in kexplight with parameter 0.7 and value 0.7.
Skipped 0 / 89 molecules in kexplight with parameter 0.6 and value 0.6.
Skipped 0 / 89 molecules in kexplight with parameter 1.5 and value 1.5.
Skipped 0 / 89 molecules in kexplight with parameter 1.0 and value 1.0.
Skipped 0 / 89 molecules in kexplight with parameter 1.2 and value 1.2.
Skipped 0 / 89 molecules in kexplight with parameter 0.8 and value 0.8.


 64%|██████▍   | 16/25 [00:17<00:11,  1.25s/it]

Skipped 0 / 89 molecules in kexplight with parameter 1.4 and value 1.4.
Skipped 0 / 89 molecules in s9 with parameter 1.1 and value 5.5.
Skipped 0 / 89 molecules in s9 with parameter 1.4 and value 7.0.
Skipped 0 / 89 molecules in s9 with parameter 0.8 and value 4.0.
Skipped 0 / 89 molecules in s9 with parameter 1.3 and value 6.5.
Skipped 0 / 89 molecules in s9 with parameter 0.6 and value 3.0.
Skipped 0 / 89 molecules in s9 with parameter 0.5 and value 2.5.
Skipped 0 / 89 molecules in s9 with parameter 0.7 and value 3.5.
Skipped 0 / 89 molecules in s9 with parameter 1.5 and value 7.5.
Skipped 0 / 89 molecules in s9 with parameter 0.9 and value 4.5.


 68%|██████▊   | 17/25 [00:19<00:10,  1.27s/it]

Skipped 0 / 89 molecules in s9 with parameter 1.0 and value 5.0.
Skipped 0 / 89 molecules in s9 with parameter 1.2 and value 6.0.
Skipped 0 / 89 molecules in kdiff with parameter 0.6 and value 1.2.
Skipped 0 / 89 molecules in kdiff with parameter 0.7 and value 1.4.
Skipped 0 / 89 molecules in kdiff with parameter 1.5 and value 3.0.
Skipped 0 / 89 molecules in kdiff with parameter 1.4 and value 2.8.
Skipped 0 / 89 molecules in kdiff with parameter 0.5 and value 1.0.
Skipped 0 / 89 molecules in kdiff with parameter 1.3 and value 2.6.
Skipped 0 / 89 molecules in kdiff with parameter 0.8 and value 1.6.
Skipped 0 / 89 molecules in kdiff with parameter 1.2 and value 2.4.
Skipped 0 / 89 molecules in kdiff with parameter 0.9 and value 1.8.
Skipped 0 / 89 molecules in kdiff with parameter 1.0 and value 2.0.


 72%|███████▏  | 18/25 [00:20<00:09,  1.32s/it]

Skipped 0 / 89 molecules in kdiff with parameter 1.1 and value 2.2.
Skipped 0 / 89 molecules in aesrmax with parameter 1.1 and value 5.5.
Skipped 0 / 89 molecules in aesrmax with parameter 1.4 and value 7.0.
Skipped 0 / 89 molecules in aesrmax with parameter 0.8 and value 4.0.
Skipped 0 / 89 molecules in aesrmax with parameter 1.3 and value 6.5.
Skipped 0 / 89 molecules in aesrmax with parameter 0.6 and value 3.0.
Skipped 0 / 89 molecules in aesrmax with parameter 0.5 and value 2.5.
Skipped 0 / 89 molecules in aesrmax with parameter 0.7 and value 3.5.
Skipped 0 / 89 molecules in aesrmax with parameter 1.5 and value 7.5.
Skipped 0 / 89 molecules in aesrmax with parameter 0.9 and value 4.5.


 76%|███████▌  | 19/25 [00:21<00:08,  1.35s/it]

Skipped 0 / 89 molecules in aesrmax with parameter 1.0 and value 5.0.
Skipped 0 / 89 molecules in aesrmax with parameter 1.2 and value 6.0.
Skipped 0 / 89 molecules in aesdmp3 with parameter 1.5 and value 4.5.
Skipped 0 / 89 molecules in aesdmp3 with parameter 0.8 and value 2.4.
Skipped 0 / 89 molecules in aesdmp3 with parameter 1.2 and value 3.6.
Skipped 0 / 89 molecules in aesdmp3 with parameter 1.1 and value 3.3.
Skipped 0 / 89 molecules in aesdmp3 with parameter 0.6 and value 1.8.
Skipped 0 / 89 molecules in aesdmp3 with parameter 1.3 and value 3.9.
Skipped 0 / 89 molecules in aesdmp3 with parameter 0.5 and value 1.5.
Skipped 0 / 89 molecules in aesdmp3 with parameter 1.4 and value 4.2.
Skipped 0 / 89 molecules in aesdmp3 with parameter 0.7 and value 2.1.
Skipped 0 / 89 molecules in aesdmp3 with parameter 0.9 and value 2.7.


 80%|████████  | 20/25 [00:23<00:06,  1.39s/it]

Skipped 0 / 89 molecules in aesdmp3 with parameter 1.0 and value 3.0.
Skipped 0 / 89 molecules in enscale with parameter 0.6 and value 1.2.
Skipped 0 / 89 molecules in enscale with parameter 0.7 and value 1.4.
Skipped 0 / 89 molecules in enscale with parameter 1.5 and value 3.0.
Skipped 0 / 89 molecules in enscale with parameter 1.4 and value 2.8.
Skipped 0 / 89 molecules in enscale with parameter 0.5 and value 1.0.
Skipped 0 / 89 molecules in enscale with parameter 1.3 and value 2.6.
Skipped 0 / 89 molecules in enscale with parameter 0.8 and value 1.6.
Skipped 0 / 89 molecules in enscale with parameter 1.2 and value 2.4.
Skipped 0 / 89 molecules in enscale with parameter 0.9 and value 1.8.


 84%|████████▍ | 21/25 [00:24<00:05,  1.44s/it]

Skipped 0 / 89 molecules in enscale with parameter 1.0 and value 2.0.
Skipped 0 / 89 molecules in enscale with parameter 1.1 and value 2.2.
Skipped 0 / 89 molecules in a2 with parameter 1.1 and value 5.5.
Skipped 0 / 89 molecules in a2 with parameter 1.4 and value 7.0.
Skipped 0 / 89 molecules in a2 with parameter 0.8 and value 4.0.
Skipped 0 / 89 molecules in a2 with parameter 1.3 and value 6.5.
Skipped 0 / 89 molecules in a2 with parameter 0.6 and value 3.0.
Skipped 0 / 89 molecules in a2 with parameter 0.5 and value 2.5.
Skipped 0 / 89 molecules in a2 with parameter 0.7 and value 3.5.
Skipped 0 / 89 molecules in a2 with parameter 1.5 and value 7.5.
Skipped 0 / 89 molecules in a2 with parameter 0.9 and value 4.5.
Skipped 0 / 89 molecules in a2 with parameter 1.0 and value 5.0.


 88%|████████▊ | 22/25 [00:26<00:04,  1.49s/it]

Skipped 0 / 89 molecules in a2 with parameter 1.2 and value 6.0.
Skipped 0 / 89 molecules in kd with parameter 1.3 and value 2.899.
Skipped 0 / 89 molecules in kd with parameter 0.7 and value 1.561.
Skipped 0 / 89 molecules in kd with parameter 0.6 and value 1.338.
Skipped 0 / 89 molecules in kd with parameter 1.5 and value 3.345.
Skipped 0 / 89 molecules in kd with parameter 1.4 and value 3.122.
Skipped 0 / 89 molecules in kd with parameter 1.2 and value 2.676.
Skipped 0 / 89 molecules in kd with parameter 0.5 and value 1.115.
Skipped 0 / 89 molecules in kd with parameter 1.1 and value 2.453.
Skipped 0 / 89 molecules in kd with parameter 0.8 and value 1.784.


 92%|█████████▏| 23/25 [00:28<00:03,  1.53s/it]

Skipped 0 / 89 molecules in kd with parameter 0.9 and value 2.007.
Skipped 0 / 89 molecules in kd with parameter 1.0 and value 2.23.
Skipped 0 / 89 molecules in ksd with parameter 0.6 and value 1.2.
Skipped 0 / 89 molecules in ksd with parameter 0.7 and value 1.4.
Skipped 0 / 89 molecules in ksd with parameter 1.5 and value 3.0.
Skipped 0 / 89 molecules in ksd with parameter 1.4 and value 2.8.
Skipped 0 / 89 molecules in ksd with parameter 0.5 and value 1.0.
Skipped 0 / 89 molecules in ksd with parameter 1.3 and value 2.6.
Skipped 0 / 89 molecules in ksd with parameter 0.8 and value 1.6.
Skipped 0 / 89 molecules in ksd with parameter 1.2 and value 2.4.
Skipped 0 / 89 molecules in ksd with parameter 0.9 and value 1.8.
Skipped 0 / 89 molecules in ksd with parameter 1.0 and value 2.0.


100%|██████████| 25/25 [00:29<00:00,  1.19s/it]

Skipped 0 / 89 molecules in ksd with parameter 1.1 and value 2.2.





In [4]:
# to pickle
results_df.to_csv(processed_results_path / "parsed_results.csv", index=False)

### Add VQM24 Reference

In [5]:
results_df = pd.read_csv(processed_results_path / "parsed_results.csv")

# results_df["pairwise distance"] = results_df["pairwise distance"].apply(lambda x: np.array(json.loads(x)))

reference_data = np.load("reference_data/DFT_uniques.npz", allow_pickle=True)

In [6]:
print(results_df.head())

        molecule parameter  factor  parameter value  \
0  FNxOSiSxH2_82    gam3d2     1.5            0.375   
1   CNPSi2H4_113    gam3d2     1.5            0.375   
2   COPx2SyH4_72    gam3d2     1.5            0.375   
3     CN2PSH1_54    gam3d2     1.5            0.375   
4  C2SiSx2H8_221    gam3d2     1.5            0.375   

   atomisation energy (Hartrees)  atomisation energy (kcal/mol)  \
0                       1.360410                     853.669459   
1                       2.177525                    1366.416468   
2                       1.824700                    1145.015747   
3                       1.767131                    1108.890316   
4                       2.089762                    1311.344399   

   total energy (Hartrees)  total energy (kcal/mol)  max atomic force  
0                17.632961             11064.841710          0.019270  
1                12.990552              8151.688164          0.041210  
2                16.395235             10288.15749

In [7]:
results_df["VQM24 atomisation energy (Hartrees)"] = None
results_df["VQM24 atomisation energy (kcal/mol)"] = None
results_df["VQM24 total energy (Hartrees)"] = None
results_df["VQM24 total energy (kcal/mol)"] = None

results_df["VQM24 pairwise distance"] = None

for molecule in tqdm(
    results_df["molecule"].unique(),
    total=len(results_df["molecule"].unique()),
    desc="Adding VQM24 data",
):
    # reference compounds use SMIELS_num/conformer_num
    # results compounds use SMIELS_num_conformer_num
    vqm24_molecule = molecule.replace("_", "/", 2)
    vqm24_molecule = vqm24_molecule.replace("/", "_", 1)

    reference_index = np.where(reference_data["compounds"] == vqm24_molecule)

    # VQM24 energies are flipped
    reference_atomization_energy = -reference_data["Eatomization"][
        reference_index
    ].item()
    reference_total_energy = -reference_data["Etot"][reference_index].item()

    results_df.loc[
        results_df["molecule"] == molecule, "VQM24 atomisation energy (Hartrees)"
    ] = reference_atomization_energy
    results_df.loc[
        results_df["molecule"] == molecule, "VQM24 atomisation energy (kcal/mol)"
    ] = (reference_atomization_energy * HARTREE_TO_KCAL_MOL)

    results_df.loc[
        results_df["molecule"] == molecule, "VQM24 total energy (Hartrees)"
    ] = (reference_total_energy / HARTREE_TO_KCAL_MOL)
    results_df.loc[
        results_df["molecule"] == molecule, "VQM24 total energy (kcal/mol)"
    ] = reference_total_energy

    # Geometries
    xyz_df = molecules_path / f"{molecule}.xyz"
    pairwise_distances_arr = pairwise_distance(xyz_to_np(xyz_df)[0])

    mask = results_df["molecule"] == molecule

#     results_df.loc[mask, "VQM24 pairwise distance"] = [
#         json.dumps(pairwise_distances_arr.tolist())
#     ] * mask.sum()


# results_df["VQM24 pairwise distance"] = results_df["VQM24 pairwise distance"].apply(
#     lambda x: np.array(json.loads(x))
# )

results_df["Atomization Energy MAE (Hartrees)"] = abs(
    results_df["atomisation energy (Hartrees)"]
    - results_df["VQM24 atomisation energy (Hartrees)"]
)
results_df["Atomization Energy MAE (kcal/mol)"] = abs(
    results_df["atomisation energy (kcal/mol)"]
    - results_df["VQM24 atomisation energy (kcal/mol)"]
)

results_df["Total Energy MAE (Hartrees)"] = abs(
    results_df["total energy (Hartrees)"] - results_df["VQM24 total energy (Hartrees)"]
)
results_df["Total Energy MAE (kcal/mol)"] = abs(
    results_df["total energy (kcal/mol)"] - results_df["VQM24 total energy (kcal/mol)"]
)

# results_df["Pairwise Distance MAE"] = results_df.apply(
#     lambda row: average_abs_dist(
#         row["pairwise distance"], row["VQM24 pairwise distance"]
#     ),
#     axis=1,
# )

# results_df["Pairwise Distance Frobenius Norm"] = results_df.apply(
#     lambda row: np.linalg.norm(
#         row["pairwise distance"] - row["VQM24 pairwise distance"]
#     ),
#     axis=1,
# )

Adding VQM24 data: 100%|██████████| 88/88 [00:07<00:00, 11.35it/s]


In [8]:
results_df.head()

Unnamed: 0,molecule,parameter,factor,parameter value,atomisation energy (Hartrees),atomisation energy (kcal/mol),total energy (Hartrees),total energy (kcal/mol),max atomic force,VQM24 atomisation energy (Hartrees),VQM24 atomisation energy (kcal/mol),VQM24 total energy (Hartrees),VQM24 total energy (kcal/mol),VQM24 pairwise distance,Atomization Energy MAE (Hartrees),Atomization Energy MAE (kcal/mol),Total Energy MAE (Hartrees),Total Energy MAE (kcal/mol)
0,FNxOSiSxH2_82,gam3d2,1.5,0.375,1.36041,853.669459,17.632961,11064.84171,0.01927,0.866469,543.716929,1.463972,918.655654,,0.493941,309.95253,16.168989,10146.186056
1,CNPSi2H4_113,gam3d2,1.5,0.375,2.177525,1366.416468,12.990552,8151.688164,0.04121,1.135908,712.792304,1.618339,1015.522367,,1.041617,653.624164,11.372213,7136.165797
2,COPx2SyH4_72,gam3d2,1.5,0.375,1.8247,1145.015747,16.395235,10288.157496,0.063239,0.95014,596.221114,1.906659,1196.44547,,0.874561,548.794634,14.488576,9091.712026
3,CN2PSH1_54,gam3d2,1.5,0.375,1.767131,1108.890316,14.221133,8923.888776,0.074708,0.847112,531.570446,1.414629,887.692235,,0.920018,577.31987,12.806504,8036.196541
4,C2SiSx2H8_221,gam3d2,1.5,0.375,2.089762,1311.344399,16.199616,10165.404942,0.195071,1.495457,938.412543,1.859328,1166.744879,,0.594305,372.931857,14.340288,8998.660063


In [9]:
results_df.to_csv(processed_results_path / "with_reference_results.csv", index=False)

## Aggregate Results

In [10]:
results_df = pd.read_csv(processed_results_path / "with_reference_results.csv")

In [11]:
aggregated_results = (
    results_df.groupby(["parameter", "factor", "parameter value"])[
        [
            "Atomization Energy MAE (kcal/mol)",
            "Total Energy MAE (kcal/mol)",
            # "Pairwise Distance MAE",
            # "Pairwise Distance Frobenius Norm",
            "max atomic force",
        ]
    ]
    .mean()
    .reset_index()
)
aggregated_results.head()

Unnamed: 0,parameter,factor,parameter value,Atomization Energy MAE (kcal/mol),Total Energy MAE (kcal/mol),max atomic force
0,a1,0.5,0.26,450.637106,8436.561915,0.101604
1,a1,0.6,0.312,444.679268,8430.539782,0.101724
2,a1,0.7,0.364,440.632064,8426.327666,0.101787
3,a1,0.8,0.416,437.757067,8423.336414,0.101818
4,a1,0.9,0.468,435.704153,8421.181523,0.101833


In [12]:
aggregated_results.to_csv(
    processed_results_path / "aggregated_results.csv", index=False
)