In [1]:
%reload_ext autoreload
%autoreload 2

import h5py
N_Cs = 9

with h5py.File('../../dxtb/dxtb-gpu/gpu-cpu_analysis/rdkit/alkanes_data_500.hdf5', 'r') as f:
    for mol_name, data in f.items():
        if mol_name == f"alkane_{N_Cs}_carbons":
            atomic_numbers = data['atomic_numbers'][:]
            coordinates = data['coordinates'][:]

print(f"Number of carbon atoms in {mol_name}: {N_Cs}")
print(f"Nb of atoms: {len(atomic_numbers)}")

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '../../dxtb/dxtb-gpu/gpu-cpu_analysis/rdkit/alkanes_data_500.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

# Default methods

#### Vanilla


In [None]:
import dxtb
from dxtb._src.typing import DD
import torch
from ase.build import molecule
from dxtb.config import ConfigCache
from tqdm import tqdm

opts = {"scf_mode": "implicit", "batch_mode": 2, "int_driver": "libcint"}
batch_size = 64
results = {}

print(f"Number of carbon atoms in {mol_name}: {N_Cs}")
print(f"Nb of atoms: {len(atomic_numbers)}")
print(f"batch_size: {batch_size}")
print(f"opts: {opts}")

for i in tqdm(range(1)):
    for device in ["cuda:0", "cpu"]:
        # print(f"\nDevice: {device}")
        dd = {"dtype": torch.float32, "device": torch.device(device)}
        numbers = torch.tensor(atomic_numbers, device=dd["device"], dtype=torch.int32)
        positions = torch.tensor(coordinates, device=dd["device"], dtype=dd["dtype"])
        numbers = torch.stack([numbers] * batch_size)
        positions = torch.stack([positions] * batch_size).requires_grad_()
        charges = torch.zeros((batch_size,), device=dd["device"], dtype=dd["dtype"])

        calc = dxtb.Calculator(numbers, dxtb.GFN1_XTB, **dd, opts=opts, timer=True)
        calc.opts.cache = ConfigCache(enabled=False, density=True, fock=True, overlap=False)

        
        dxtb.timer.reset()
        e = calc.get_energy(positions, chrg=charges)
        dxtb.timer.start("Forces autograd")
        forces = torch.autograd.grad(sum(e), positions, retain_graph=True)[0]
        dxtb.timer.stop("Forces autograd")
        # dxtb.timer.print(v=0)

        # Store drv
        drv = calc.integrals.mgr.driver.drv
        # print(f"len(drv): {len(drv)}")

        results[device] = {
            "energy": e.detach().cpu(),
            "forces": forces.detach().cpu()
        }

# Compare results
energy_diff = (results["cuda:0"]["energy"] - results["cpu"]["energy"]).abs().max()
forces_diff = (results["cuda:0"]["forces"
] - results["cpu"]["forces"]).abs().max()

print(f"\n[Comparison]")
print(f"GPU energy: {results['cuda:0']['energy'].mean().item():.6e}")
print(f"CPU energy: {results['cpu']['energy'].mean().item():.6e}")
print(f"Max energy diff: {energy_diff.item():.6e}")
print(f"Max forces diff: {forces_diff.item():.6e}")

#### With drv

In [None]:
import dxtb
from dxtb._src.typing import DD
import torch
from ase.build import molecule
from dxtb.config import ConfigCache


torch.autograd.set_detect_anomaly(True)

for i in tqdm(range(10)):
    for device in ["cuda:0", "cpu"]:
        # print(f"\nDevice: {device}")
        dd = {"dtype": torch.float32, "device": torch.device(device)}
        numbers = torch.tensor(atomic_numbers, device=dd["device"], dtype=torch.int32)
        positions = torch.tensor(coordinates, device=dd["device"], dtype=dd["dtype"])
        numbers = torch.stack([numbers] * batch_size)
        positions = torch.stack([positions] * batch_size).requires_grad_()
        charges = torch.zeros((batch_size,), device=dd["device"], dtype=dd["dtype"])

        calc = dxtb.Calculator(numbers, dxtb.GFN1_XTB, **dd, opts=opts, timer=True)
        calc.opts.cache = ConfigCache(enabled=False, density=True, fock=True, overlap=False)
        calc.opts.exclude = "dispersion"

        
        dxtb.timer.reset()
        e = calc.get_energy(positions, chrg=charges, drv=drv)
        dxtb.timer.start("Forces autograd")
        
        forces = torch.autograd.grad(sum(e), positions, retain_graph=True)[0]

        dxtb.timer.stop("Forces autograd")
        # dxtb.timer.print(v=0)

        results[device] = {
            "energy": e.detach().cpu(),
            "forces": forces.detach().cpu()
        }

# Compare results
energy_diff = (results["cuda:0"]["energy"] - results["cpu"]["energy"]).abs().max()
forces_diff = (results["cuda:0"]["forces"
] - results["cpu"]["forces"]).abs().max()

print(f"\n[Comparison]")
print(f"GPU energy: {results['cuda:0']['energy'].mean().item():.6e}")
print(f"CPU energy: {results['cpu']['energy'].mean().item():.6e}")
print(f"Max energy diff: {energy_diff.item():.6e}")
print(f"Max forces diff: {forces_diff.item():.6e}")

In [None]:
import os
import sys

# Resolve the absolute path to the "orbspin" directory relative to the notebook's location
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
features_path = os.path.abspath(
    os.path.join(notebook_dir, "../../../../orbspin")
)

# Add the resolved path to sys.path if it's not already included
if os.path.isdir(features_path) and features_path not in sys.path:
    sys.path.append(features_path)

from features import generate_xtb_features_dxtb

for i in tqdm(range(5)):
    for device in ["cuda:0"]:
        # print(f"\nDevice: {device}")
        dd = {"dtype": torch.float32, "device": torch.device(device)}
        numbers = torch.tensor(atomic_numbers, device=dd["device"], dtype=torch.int32)
        positions = torch.tensor(coordinates, device=dd["device"], dtype=dd["dtype"])
        numbers = torch.stack([numbers] * batch_size)
        # positions = torch.stack([positions] * batch_size).requires_grad_()
        positions = torch.stack([positions] * batch_size)
        positions.requires_grad_(True)
        charges = torch.zeros((batch_size,), device=dd["device"], dtype=dd["dtype"])

        res = generate_xtb_features_dxtb(
            numbers,
            positions,
            res_ks=["energy", "forces", "scf_charges", "drv"],
        )

        


# Load Transition1x molecule

##### Process problematic molecules

In [2]:
import h5py
import torch

hdf5_path = "../../../../../data/Transition1x/data/transition1x.h5"
mol_names = ["C4H10N2O"]
rxns = ["8296_307", "8297_496", "8298_870", "8298_310", "8298_1038", "8297_881"]

poss = []
zs = []
with h5py.File(hdf5_path, "r") as f:
    for mol_name in mol_names:
        for rxn in rxns:
            rxn_nb = rxn.split("_")[0]
            rxn_idx = rxn.split("_")[1]
            entry = f[f"val/{mol_name}/rxn{rxn_nb}"]
            poss.append(torch.tensor(entry["positions"][int(rxn_idx)]))
            zs.append(torch.tensor(entry["atomic_numbers"]))


In [3]:
import os
import sys
from tqdm import tqdm

# Resolve the absolute path to the "orbspin" directory relative to the notebook's location
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
features_path = os.path.abspath(
    os.path.join(notebook_dir, "../../../../orbspin")
)
util_path = os.path.abspath(
    os.path.join(notebook_dir, "../../../../..")
)

# Add the resolved path to sys.path if it's not already included
if os.path.isdir(features_path) and features_path not in sys.path:
    sys.path.append(features_path)

from features import generate_xtb_features_dxtb
from util.utilities import get_unit_conversion

for i in tqdm(range(5)):
    for device in ["cuda:0"]:
        # print(f"\nDevice: {device}")
        dd = {"dtype": torch.float32, "device": torch.device(device)}

        numbers = torch.stack(zs).to(device=dd["device"], dtype=torch.int32)
        positions = torch.stack(poss).to(device=dd["device"], dtype=dd["dtype"]) * get_unit_conversion("angstrom", "bohr")
        positions.requires_grad_(True)

        res = generate_xtb_features_dxtb(
            numbers,
            positions,
            res_ks=["energy", "forces", "scf_charges", "drv"],
        )
                
        # for i, en in enumerate(res["energy"]):
        #     print(f"{mol_name}_{rxns[i]}, energy[{i}]: {en.item()}")


  0%|          | 0/5 [00:00<?, ?it/s]


RuntimeError: The size of tensor a (6) must match the size of tensor b (2) at non-singleton dimension 0

##### Process the whole dataset

In [4]:
import os
import sys
import h5py
import torch
from torch.utils.data import IterableDataset, DataLoader
from tqdm import tqdm

# Resolve the absolute path to the "orbspin" directory relative to the notebook's location
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
features_path = os.path.abspath(
    os.path.join(notebook_dir, "../../../../orbspin")
)
util_path = os.path.abspath(
    os.path.join(notebook_dir, "../../../../..")
)

# Add the resolved path to sys.path if it's not already included
if os.path.isdir(features_path) and features_path not in sys.path:
    sys.path.append(features_path)

from features import generate_xtb_features_dxtb
from util.utilities import get_unit_conversion

import dxtb
from dxtb.config import ConfigCache
from dxtb import OutputHandler

dd = {"dtype": torch.float32, "device": torch.device("cuda:0")}

class TransitionBatchDataset(IterableDataset):
    def __init__(self, hdf5_path, split="val", batch_size=64, mol_names=None):
        super().__init__()
        self.hdf5_path = hdf5_path
        self.split = split
        self.batch_size = batch_size
        self.mol_names = mol_names

    def __iter__(self):
        with h5py.File(self.hdf5_path, "r") as f:
            for mol_name in list(self.mol_names or f[f"{self.split}"].keys()):
                mol_group = f[f"{self.split}/{mol_name}"]
                for rxn_name in mol_group.keys():
                    rxn_group = mol_group[rxn_name]
                    positions = rxn_group["positions"]
                    zs = rxn_group["atomic_numbers"][()]
                    n_samples = len(positions)

                    for i in range(0, n_samples, self.batch_size):
                        pos_batch = torch.tensor(positions[i:i+self.batch_size], **dd) * get_unit_conversion("angstrom", "bohr")
                        pos_batch.requires_grad_(True)
                        z_batch = torch.tensor([zs] * len(pos_batch), device=dd["device"])  # [B, N]
                        yield {
                            "mol_name": mol_name,
                            "rxn_name": rxn_name,
                            "z": z_batch,
                            "pos": pos_batch,
                            "batch_size": len(pos_batch)
                        }

# Create dataset + dataloader
dataset = TransitionBatchDataset(
    hdf5_path="../../../../../data/Transition1x/data/transition1x.h5",
    batch_size=64,
    mol_names=None
)
dataloader = DataLoader(dataset, batch_size=None)

# Wrap in tqdm and track sample count
sample_count = 0
pbar = tqdm(dataloader, desc="Processing", unit=" datapoints")


i = 0
for batch in pbar:
    i += 1
    # if i <53:
    #     continue
    sample_count += batch["batch_size"]
    pbar.set_description(f"{batch['mol_name']}/{batch['rxn_name']}")
    pbar.set_postfix(total=sample_count)
    
    # print(f"z {batch['z']}")
    # print(f"pos {batch['pos']}")

    # DXTB CALC
    dd = {"dtype": torch.float32, "device": torch.device("cuda:0")}
    opts = {"scf_mode": "implicit", "batch_mode": 2, "int_driver": "libcint"}

    batch_size = batch['z'].shape[0]
    charges = torch.full((batch_size,), 0, **dd)
    spin = torch.full((batch_size,), 0, **dd)

    calc = dxtb.Calculator(batch['z'], dxtb.GFN1_XTB, **dd, opts=opts)

    e = calc.get_energy(batch['pos'], chrg=charges, spin=spin, scf_charges=None)
    forces = torch.autograd.grad(sum(e), batch['pos'], retain_graph=True)[0]
    

    # Features calc
    res = generate_xtb_features_dxtb(
        batch["z"],
        batch["pos"],
        charge=charges,
        spin=spin,
        res_ks=["energy", "forces"],
    )


    

  z_batch = torch.tensor([zs] * len(pos_batch), device=dd["device"])  # [B, N]
C2H2N2O/rxn2091: : 0 datapoints [00:00, ? datapoints/s, total=64]

C2H2N2O/rxn2091: : 2 datapoints [00:01,  1.58 datapoints/s, total=192]


RuntimeError: could not compute gradients for some functions