# Analyze Chemical Space

In [3]:
import importlib
import os
import sys
import statistics

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

import helper
from modelforge.dataset.dataset import initialize_datamodule
from modelforge.dataset.utils import RandomRecordSplittingStrategy, SplittingStrategy

# Apply seaborn's default theme, explicitly at the "notebook" context scale.
sns.set_theme(context="notebook")

# Uncomment while iterating on helper.py to pick up edits without restarting the kernel:
# importlib.reload(helper)

In [8]:
# Set up the SPICE2 datamodule the same way it is initialized for training.

# Name under which each property is exposed -> property name in the dataset file.
property_assignment = {
    "atomic_numbers": "atomic_numbers",
    "positions": "positions",
    "E": "dft_total_energy",
    "F": "dft_total_force",
    "total_charge": "total_charge",
    "dipole_moment": "scf_dipole",
}

dm = initialize_datamodule(
    dataset_name="spice2",
    version_select="full_dataset_HCNOF_v1.1",
    local_cache_dir="../cache",
    dataset_cache_dir="./dataset_cache",
    # The properties to load are exactly the mapping's values, in insertion order.
    properties_of_interest=list(property_assignment.values()),
    properties_assignment=property_assignment,
)

[32m2025-10-01 19:02:33.805[0m | [34m[1mDEBUG   [0m | [36mmodelforge.dataset.dataset[0m:[36m__init__[0m:[36m327[0m - [34m[1mLoading config data from /Users/syan/workdir/modelforge/modelforge/dataset/yaml_files/spice2.yaml[0m
[32m2025-10-01 19:02:33.830[0m | [1mINFO    [0m | [36mmodelforge.dataset.dataset[0m:[36m__init__[0m:[36m367[0m - [1mUsing dataset version full_dataset_HCNOF_v1.1[0m
[32m2025-10-01 19:02:33.832[0m | [34m[1mDEBUG   [0m | [36mmodelforge.dataset.dataset[0m:[36m_file_validation[0m:[36m721[0m - [34m[1mFile ./dataset_cache/spice_2_dataset_v1.1_HCNOF.hdf5 does not exist.[0m
[32m2025-10-01 19:02:33.833[0m | [34m[1mDEBUG   [0m | [36mmodelforge.dataset.dataset[0m:[36m_acquire_dataset[0m:[36m461[0m - [34m[1mhdf5 file spice_2_dataset_v1.1_HCNOF.hdf5 not found.[0m
[32m2025-10-01 19:02:33.921[0m | [34m[1mDEBUG   [0m | [36mmodelforge.utils.remote[0m:[36mdownload_from_url[0m:[36m195[0m - [34m[1mDownloading datafile 

Validating file ./dataset_cache/spice_2_dataset_v1.1_HCNOF.hdf5


downloading: 100%|##########| 22875180/22875180 [16:54<00:00, 22548.19it/s]
[32m2025-10-01 19:20:22.700[0m | [34m[1mDEBUG   [0m | [36mmodelforge.dataset.dataset[0m:[36m_from_hdf5[0m:[36m795[0m - [34m[1mReading data from ./dataset_cache/spice_2_dataset_v1.1_HCNOF.hdf5[0m
[32m2025-10-01 19:20:22.701[0m | [34m[1mDEBUG   [0m | [36mmodelforge.dataset.dataset[0m:[36m_from_hdf5[0m:[36m796[0m - [34m[1melement filter: None[0m
[32m2025-10-01 19:20:24.568[0m | [34m[1mDEBUG   [0m | [36mmodelforge.dataset.dataset[0m:[36m_from_hdf5[0m:[36m828[0m - [34m[1mProperties of Interest: ['atomic_numbers', 'positions', 'dft_total_energy', 'dft_total_force', 'total_charge', 'scf_dipole'][0m
[32m2025-10-01 19:20:24.569[0m | [34m[1mDEBUG   [0m | [36mmodelforge.dataset.dataset[0m:[36m_from_hdf5[0m:[36m834[0m - [34m[1mn_entries: 57037[0m
100%|██████████| 57037/57037 [02:01<00:00, 469.61it/s]
[32m2025-10-01 19:22:26.319[0m | [34m[1mDEBUG   [0m | [36mmode

Validating file ../cache/spice2.npz


Process dataset: 100%|██████████| 928073/928073 [00:05<00:00, 181220.01it/s]
Calculating pairlist for dataset: 100%|██████████| 4641/4641 [00:20<00:00, 231.00it/s]
[32m2025-10-01 19:22:54.868[0m | [1mINFO    [0m | [36mmodelforge.dataset.utils[0m:[36mcalculate_mean_and_variance[0m:[36m167[0m - [1mCalculating mean and variance of atomic energies[0m
100%|██████████| 1813/1813 [00:09<00:00, 186.34it/s]
[32m2025-10-01 19:23:04.602[0m | [1mINFO    [0m | [36mmodelforge.dataset.utils[0m:[36mcalculate_mean_and_variance[0m:[36m179[0m - [1mMean and standard deviation of the dataset:{'per_atom_energy_mean': tensor(-430.6900, dtype=torch.float64), 'per_atom_energy_stddev': tensor(42.4628, dtype=torch.float64)}[0m
[32m2025-10-01 19:23:04.603[0m | [1mINFO    [0m | [36mmodelforge.dataset.dataset[0m:[36mprepare_data[0m:[36m1284[0m - [1m{'atomic_self_energies': {'B': <Quantity(-24.6715205, 'hartree')>, 'Br': <Quantity(-2574.11672, 'hartree')>, 'C': <Quantity(-37.872645

## Plotting

In [14]:
# Show the datamodule's instance attributes to check how it was configured
# (batch size, splitting strategy, dataset splits, selected properties, ...).
vars(dm)

{'_log_hyperparams': False,
 'prepare_data_per_node': True,
 'allow_zero_length_dataloader_with_multiple_devices': False,
 'trainer': None,
 'name': 'spice2',
 'batch_size': 64,
 'splitting_strategy': <modelforge.dataset.utils.FirstComeFirstServeSplittingStrategy at 0x341e35850>,
 'remove_self_energies': True,
 'shift_center_of_mass_to_origin': False,
 'dict_atomic_self_energies': None,
 'regression_ase': False,
 'force_download': False,
 'regenerate_processed_dataset': False,
 'version_select': 'full_dataset_HCNOF_v1.1',
 'train_dataset': <torch.utils.data.dataset.Subset at 0x343259f10>,
 'val_dataset': <torch.utils.data.dataset.Subset at 0x3d0a1f5f0>,
 'test_dataset': <torch.utils.data.dataset.Subset at 0x49009fec0>,
 'properties_of_interest': ['atomic_numbers',
  'positions',
  'dft_total_energy',
  'dft_total_force',
  'total_charge',
  'scf_dipole'],
 'properties_assignment': {'atomic_numbers': 'atomic_numbers',
  'positions': 'positions',
  'E': 'dft_total_energy',
  'F': 'dft_to

In [23]:
# Show the underlying torch dataset's attributes, including the loaded property tensors.
vars(dm.torch_dataset)

{'preloaded': False,
 'properties_of_interest': {'atomic_numbers': tensor([8, 8, 6,  ..., 1, 1, 1], dtype=torch.int32),
  'positions': tensor([[ 0.4113, -0.2958,  0.0557],
          [ 0.7071,  0.1196,  0.0680],
          [ 0.4835, -0.2891,  0.1545],
          ...,
          [ 0.4428, -0.2938, -0.1836],
          [-0.1426, -0.5384,  0.1532],
          [-0.2655, -0.4963,  0.2526]]),
  'E': tensor([[-27507.4269],
          [-27497.7733],
          [-27488.4544],
          ...,
          [-29707.2108],
          [-29707.5980],
          [-29613.8078]], dtype=torch.float64),
  'total_charge': tensor([[0],
          [0],
          [0],
          ...,
          [0],
          [0],
          [0]], dtype=torch.int32),
  'F': tensor([[  249.9994,   496.8468,    97.5443],
          [ -352.4402,  -201.2924,   251.3811],
          [-1088.0614, -1128.3499,  -807.4728],
          ...,
          [ 1188.5764,  1633.7742,  1061.7694],
          [ -236.1393,   380.6429,   776.6714],
          [  588.8116

In [35]:
# Average of the stored total_charge values, flattened to 1-D first.
total_charge = dm.torch_dataset.properties_of_interest["total_charge"]
flat_charges = total_charge.numpy().ravel()
flat_charges.mean()

0.0