In [2]:
%reload_ext autoreload
%autoreload 2

import h5py
from tqdm import tqdm

from qcm_ml.features.xtb import generate_xtb_matrices_fpsh, get_2body_grads

src_path = "/home/beom/orbnet_mhd/qcm_ml_b/qcm_ml/data/rmd17/rmd17_aspirin.hdf5"
tgt_path = "/home/beom/orbnet_mhd/qcm_ml_b/qcm_ml/data/rmd17/rmd17_aspirin_DXTB.hdf5"

def generate_hdf5(max_molecules=int(1e6)):
    idx = 0
    with h5py.File(src_path, 'r') as src_file:
        with h5py.File(tgt_path, 'w') as tgt_file:
            for mode in src_file:  # e.g., "train", "test"
                for mol in src_file[mode]:  # e.g., "aspirin"
                    for geo_idx in tqdm(src_file[f"{mode}/{mol}"], desc=f"{mode}/{mol} geometries"):  # Iterate over geometry indices (e.g., "10097", "10139")
                        idx += 1
                        if idx > max_molecules:
                            break
                        src_geo_group = src_file[f"{mode}/{mol}/{geo_idx}"]
                        tgt_geo_group = tgt_file.create_group(f"{mode}/{mol}/{geo_idx}")

                        for key, item in src_geo_group.items():
                            # Skip 2body and 2body_grad groups
                            if key in ["2body", "2body_grad"]:
                                continue
                            else: 
                                if item.shape == ():
                                    tgt_geo_group.create_dataset(key, data=item[()])
                                else:
                                    tgt_geo_group.create_dataset(key, data=item[:])

                        # Calculation of 2body features
                        numbers = src_geo_group["atomic_numbers"][:] 
                        positions = src_geo_group["geometry_bohr"][:]

                        T = generate_xtb_matrices_fpsh(
                            calculator="dxtb",
                            element_numbers=numbers,
                            coordinates=positions,
                            spin=0,
                            spin_pol=False,
                            get_energy=True,
                            get_forces=True,
                            )
                        
                        # grads = get_2body_grads(element_numbers=numbers, coordinates=positions)
                        
                        for M in ["F", "P", "S", "H"]:
                            tgt_geo_group.create_dataset(f"2body/{M}", data=T[M])
                            tgt_geo_group.create_dataset(f"2body_grad/{M}", data=T[f"grad_{M}"])

                        tgt_geo_group.create_dataset("energy_delta_dxtb_Ha")

                        
                        
                            
generate_hdf5(max_molecules=1)

train/aspirin geometries:   0%|          | 1/1000 [01:07<18:40:11, 67.28s/it]
val/aspirin geometries:   0%|          | 0/1000 [00:00<?, ?it/s]


# Profiler

In [2]:
import cProfile
import pstats
import io

pr = cProfile.Profile()
pr.enable()

generate_hdf5(10)

pr.disable()
s = io.StringIO()
sortby = 'cumtime'  # Sort by cumulative time
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

train/aspirin geometries:   0%|          | 0/1000 [00:00<?, ?it/s]

train/aspirin geometries:   1%|          | 11/1000 [06:43<10:04:43, 36.69s/it]
val/aspirin geometries:   0%|          | 0/1000 [00:00<?, ?it/s]

         106182816 function calls (105566975 primitive calls) in 403.581 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000  403.584  201.792 /home/beom/anaconda3/envs/orbnet_tblite/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3514(run_code)
        2    0.000    0.000  403.584  201.792 {built-in method builtins.exec}
        1    0.000    0.000  403.584  403.584 /tmp/ipykernel_22794/3008304535.py:8(<module>)
        1    0.014    0.014  403.584  403.584 /tmp/ipykernel_22794/2221312473.py:12(generate_hdf5)
       11    0.004    0.000  403.387   36.672 /home/beom/orbnet_mhd/qcm_ml_b/qcm_ml/features/xtb.py:102(generate_xtb_matrices_fpsh)
       11    0.127    0.012  403.354   36.669 /home/beom/orbnet_mhd/qcm_ml_b/qcm_ml/features/xtb.py:212(generate_xtb_matrices_dxtb)
       44    2.703    0.061  401.496    9.125 /home/beom/anaconda3/envs/orbnet_tblite/lib/python3.9/site-packages/torc


