In [1]:
import json
import pandas as pd
pd.options.display.float_format = '{:,.3f}'.format
import os
import re
import rdkit
#from rdkit import IPythonConsole
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from rdkit.Chem import rdMolAlign
print(rdkit.__version__)

2025.03.6


In [2]:
def extract_XTB_energy_data_and_align(directory, output_dir='.'):
    """Tabulate energies and aligned RMS values for the runs under ``directory``.

    Every sub-directory of ``directory`` is treated as one run. A run is used
    only if it provides both

    * a ``*.json`` file that parses to a mapping with a ``"total energy"`` key, and
    * a structure file named ``xtbopt.sdf`` or ``bioactive_min.sdf`` whose
      first record can be read by RDKit.

    Runs are sorted by energy (lowest first). Each structure is aligned onto
    the lowest-energy structure and written to ``aligned_<subdir>.sdf`` inside
    ``output_dir``.

    Parameters
    ----------
    directory : str
        Folder containing one sub-directory per run.
    output_dir : str, optional
        Folder receiving the aligned SDF files. Defaults to the current
        working directory; it is created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per usable run with the columns
        ``'XTB opt convergence criteria'`` (the sub-directory name),
        ``'aligned best rms / Å'`` and
        ``'relative XTB energy (plus ALPB) / kcal/mol'`` (relative to the
        lowest-energy run). If no run is usable, an empty frame with these
        columns is returned instead of raising.
    """
    # Factor applied to the 'Hartree' column to obtain the 'kcal/mol' column.
    hartree_to_kcal_per_mol = 627.509

    mol_col = 'rdkit mol object (conf.)'
    label_col = 'XTB opt convergence criteria'
    energy_col = 'XTB energy / Hartree'
    rms_col = 'aligned best rms / Å'
    rel_col = 'relative XTB energy (plus ALPB) / kcal/mol'

    records = []
    # sorted() makes the result independent of the OS-specific listing order.
    for subdir in sorted(os.listdir(directory)):
        subdir_path = os.path.join(directory, subdir)
        if not os.path.isdir(subdir_path):
            continue

        data = None
        mol = None
        for filename in sorted(os.listdir(subdir_path)):
            filepath = os.path.join(subdir_path, filename)
            if filename.endswith('.json'):
                try:
                    with open(filepath, 'r') as file:
                        parsed = json.load(file)
                except json.JSONDecodeError:
                    # An unreadable JSON must not discard one already loaded.
                    continue
                if isinstance(parsed, dict):
                    data = parsed
            elif filename in ('xtbopt.sdf', 'bioactive_min.sdf'):
                supplier = Chem.SDMolSupplier(filepath)
                if supplier:
                    candidate = supplier[0]
                    # Keep an earlier valid molecule if this record is unreadable.
                    if candidate is not None:
                        mol = candidate

        energy = data.get("total energy") if data is not None else None
        # Only runs with a structure AND an energy can be ranked and aligned.
        if mol is None or energy is None:
            continue
        records.append({
            mol_col: mol,
            label_col: str(subdir),
            energy_col: energy,
        })

    if not records:
        # Nothing to sort or align; return the same schema with zero rows.
        return pd.DataFrame(columns=[label_col, rms_col, rel_col])

    df = (
        pd.DataFrame(records)
        .sort_values(by=energy_col, ascending=True)
        .reset_index(drop=True)
    )
    reference = df[mol_col].iloc[0]
    reference_energy = df[energy_col].iloc[0]

    # AlignMol moves each probe onto the lowest-energy structure in place.
    for mol in df[mol_col]:
        rdMolAlign.AlignMol(mol, reference)

    os.makedirs(output_dir, exist_ok=True)
    for label, mol in zip(df[label_col], df[mol_col]):
        with Chem.SDWriter(os.path.join(output_dir, f'aligned_{label}.sdf')) as writer:
            writer.write(mol)

    # RMS is evaluated only after the files are written, because GetBestRMS
    # leaves the probe in its own best-fit pose as a side effect.
    df[rms_col] = [rdMolAlign.GetBestRMS(mol, reference) for mol in df[mol_col]]
    df[rel_col] = (
        df[energy_col] * hartree_to_kcal_per_mol
        - reference_energy * hartree_to_kcal_per_mol
    )
    return df.drop(columns=[energy_col, mol_col])

# Build the summary table from the run folders under 'XTB_OPT' and show it
# as this cell's output.
df = extract_XTB_energy_data_and_align(directory='XTB_OPT')
df

Unnamed: 0,XTB opt convergence criteria,aligned best rms / Å,relative XTB energy (plus ALPB) / kcal/mol
0,extreme,0.0,0.0
1,vtight,0.042,0.003
2,tight,0.791,0.471
3,normal,0.777,0.872
4,lax,1.077,2.039
5,loose,1.23,2.673
6,sloppy,1.235,2.748
7,crude,1.264,3.471
8,ff_original,1.31,35.625


In [7]:
def extract_orca_final_energy(filepath):
    """Read the final and CPCM dielectric energies from a property text file.

    The file is searched for the first ``&FinalEnergy`` entry (annotated with
    ``"Final single point energy"``) and the first ``&CPCMDielEnergy`` entry,
    both of type ``"Double"``.

    Parameters
    ----------
    filepath : str
        Path of the text file to parse.

    Returns
    -------
    tuple of (float, float)
        ``(energy, energy_exclude_cpcm)`` where ``energy`` is the
        ``&FinalEnergy`` value and ``energy_exclude_cpcm`` is that value minus
        the ``&CPCMDielEnergy`` value. Values are returned in the file's own
        units (presumably Hartree; confirm against the ORCA output).

    Raises
    ------
    ValueError
        If either entry cannot be found in the file.
    """
    final_pattern = r'&FinalEnergy\s+\[&Type "Double"\]\s*(-?\d+\.\d+(?:[Ee][+\-]?\d+)?)\s*"Final single point energy"'

    cpcm_pattern = r'&CPCMDielEnergy\s+\[&Type "Double"\]\s*(-?\d+\.\d+(?:[Ee][+\-]?\d+)?)'

    with open(filepath, 'r') as f:
        content = f.read()

    final_match = re.search(final_pattern, content)
    if final_match is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f"no '&FinalEnergy' entry found in {filepath!r}")

    cpcm_match = re.search(cpcm_pattern, content)
    if cpcm_match is None:
        raise ValueError(f"no '&CPCMDielEnergy' entry found in {filepath!r}")

    energy = float(final_match.group(1))
    energy_cpcm = float(cpcm_match.group(1))
    energy_exclude_cpcm = energy - energy_cpcm
    return energy, energy_exclude_cpcm

In [10]:
# Collect (energy, energy without CPCM) per '*.property.txt' file in the
# NormalSCF folder, keyed by the file name without its '.property.txt' ending.
def_energy_dict = {}
for filename in os.listdir('DFT_SPE_NormalSCF'):
    if not filename.endswith('.property.txt'):
        continue
    filepath = os.path.join('DFT_SPE_NormalSCF', filename)
    basename = os.path.splitext(filename)[0].replace('.property', '')
    # Extract DFT Energy (CPCM)
    energy, energy_exclude_cpcm = extract_orca_final_energy(filepath)
    def_energy_dict[basename] = (energy, energy_exclude_cpcm)

# One row per file, ordered from lowest to highest CPCM energy.
df_energy = (
    pd.DataFrame.from_dict(
        def_energy_dict,
        orient='index',
        columns=['DFT SPE (CPCM)', 'DFT SPE (exclude CPCM)'],
    )
    .reset_index()
    .rename(columns={'index': 'XTB opt convergence criteria'})
    .sort_values(by='DFT SPE (CPCM)', ascending=True)
    .reset_index(drop=True)
)

# The lowest energy of this set (scaled by 627.509) is the zero of the
# relative scale; later cells reuse it.
gold_standard_energy = df_energy.loc[0, 'DFT SPE (CPCM)'] * 627.509
print(gold_standard_energy)

df_energy = (
    df_energy
    .assign(**{
        'relative DFT SPE (plus CPCM) / kcal/mol':
            df_energy['DFT SPE (CPCM)'] * 627.509 - gold_standard_energy,
    })
    .drop(columns=['DFT SPE (CPCM)', 'DFT SPE (exclude CPCM)'])
)
# The entry labelled 'spe' is shown under the label 'ff_original'.
df_energy['XTB opt convergence criteria'] = (
    df_energy['XTB opt convergence criteria'].replace('spe', 'ff_original')
)
df_energy

-2095858.5370634734


Unnamed: 0,XTB opt convergence criteria,relative DFT SPE (plus CPCM) / kcal/mol
0,extreme,0.0
1,vtight,0.038
2,normal,1.261
3,tight,1.561
4,loose,1.734
5,sloppy,1.876
6,lax,2.036
7,crude,2.238
8,ff_original,20.456


In [11]:
# Same extraction for the LooseSCF folder. The relative energies are referenced
# to `gold_standard_energy` (lowest NormalSCF energy), so the NormalSCF cell
# must have been run before this one.
def_energy_dict = {}
for filename in os.listdir('DFT_SPE_LooseSCF'):
    if filename.endswith('.property.txt'):
        filepath = os.path.join('DFT_SPE_LooseSCF', filename)
        basename = os.path.splitext(filename)[0].replace('.property', '')
        # Extract DFT Energy (CPCM)
        energy, energy_exclude_cpcm = extract_orca_final_energy(filepath)
        def_energy_dict[basename] = (energy, energy_exclude_cpcm)

# Stored as `df_energy_loose` so the NormalSCF table in `df_energy` is not
# overwritten: the merge cell further down joins on `df_energy`, and on a
# top-to-bottom run it would otherwise silently pick up these LooseSCF values.
df_energy_loose = (
    pd.DataFrame.from_dict(
        def_energy_dict,
        orient='index',
        columns=['DFT SPE (CPCM)', 'DFT SPE (exclude CPCM)'],
    )
    .reset_index()
    .rename(columns={'index': 'XTB opt convergence criteria'})
    .sort_values(by='DFT SPE (CPCM)', ascending=True)
    .reset_index(drop=True)
)
# Offset of the lowest LooseSCF energy from the NormalSCF reference.
print(df_energy_loose['DFT SPE (CPCM)'][0] * 627.509 - gold_standard_energy)
df_energy_loose['relative DFT SPE (plus CPCM) / kcal/mol'] = (
    df_energy_loose['DFT SPE (CPCM)'] * 627.509 - gold_standard_energy
)
df_energy_loose = df_energy_loose.drop(columns=['DFT SPE (CPCM)', 'DFT SPE (exclude CPCM)'])
# The entry labelled 'spe' is shown under the label 'ff_original'.
df_energy_loose.loc[
    df_energy_loose['XTB opt convergence criteria'] == 'spe',
    'XTB opt convergence criteria',
] = 'ff_original'
df_energy_loose

0.0873351579066366


Unnamed: 0,XTB opt convergence criteria,relative DFT SPE (plus CPCM) / kcal/mol
0,extreme,0.087
1,vtight,0.124
2,normal,1.35
3,tight,1.647
4,loose,1.818
5,sloppy,1.96
6,lax,2.123
7,crude,2.321
8,ff_original,20.528


In [93]:
# Left join: every row of `df` is kept and the DFT column from `df_energy`
# is attached where the convergence-criterion label matches.
df_all = df.merge(df_energy, how='left', on='XTB opt convergence criteria')
df_all

Unnamed: 0,XTB opt convergence criteria,aligned best rms / Å,relative XTB energy (plus ALPB) / kcal/mol,relative DFT SPE (plus CPCM) / kcal/mol
0,extreme,0.0,0.0,0.0
1,vtight,0.042,0.003,0.037
2,tight,0.791,0.471,1.56
3,normal,0.777,0.872,1.263
4,lax,1.077,2.039,2.036
5,loose,1.23,2.673,1.73
6,sloppy,1.235,2.748,1.873
7,crude,1.264,3.471,2.234
8,ff_original,1.31,35.625,20.44
