In [1]:
import nshtrainer.ll as ll

ll.pretty()

In [2]:
from pathlib import Path

base_path = Path("/mnt/datasets/mptrj-mace/mptrj-gga-ggapu/")
base_path

Path('/mnt/datasets/mptrj-mace/mptrj-gga-ggapu')

In [3]:
extxyz_files = list(base_path.glob("**/*.extxyz"))
print(len(extxyz_files))

145923


In [4]:
import ase.io

f = extxyz_files[51]
atoms_list = ase.io.read(f, index=":")
print(type(atoms_list), len(atoms_list))

<class 'list'> 7


In [14]:
from collections import defaultdict
from typing import Any

import numpy as np
import rich

props = defaultdict[str, list[Any]](lambda: [])

for atoms in atoms_list:
    dict_ = atoms.todict()
    dict_.update(dict_.pop("info"))
    for k, v in dict_.items():
        if not isinstance(v, np.ndarray):
            v = np.array(v)
        props[k].append(v)

d = {k: np.stack(v) for k, v in props.items()}
d = {k: v.tolist() if k in ("mp_id", "task_id") else v for k, v in d.items()}
rich.print(atoms_list[0].todict())
rich.print(d)

In [7]:
from collections.abc import Sequence

import datasets


def atoms_to_dict(atoms):
    d = atoms.todict()
    info = d.pop("info")
    return {**d, **info}


def generator(file_path: Path):
    atoms_list = ase.io.read(file_path, index=":")
    assert isinstance(
        atoms_list, Sequence
    ), f"atoms_list is not a sequence: {type(atoms_list)}"

    for i, atoms in enumerate(atoms_list):
        d = atoms_to_dict(atoms)
        d["filename"] = str(file_path.stem)
        d["extxyz_id"] = i

        # Pop magmom as it is not present in all files
        d.pop("magmoms", None)
        d.pop("bandgap", None)

        assert d["forces"].ndim == 2
        assert d["positions"].ndim == 2
        assert d["cell"].ndim == 2
        assert d["stress"].ndim == 2

        d["num_atoms"] = d["numbers"].shape[0]

        yield d


def generator_all(shards: list[Path]):
    for file_path in shards:
        yield from generator(file_path)


dataset = datasets.Dataset.from_generator(
    generator_all,
    gen_kwargs={"shards": extxyz_files},
    num_proc=32,
    features=datasets.Features(
        {
            # "numbers": datasets.Array2D(shape=(None, 1), dtype="int64"),
            "numbers": datasets.Sequence(datasets.Value("int64")),
            # "positions": datasets.Array2D(shape=(None, 3), dtype="float32"),
            "positions": datasets.Sequence(
                datasets.Sequence(datasets.Value("float32"))
            ),
            # "forces": datasets.Array2D(shape=(None, 3), dtype="float32"),
            "forces": datasets.Sequence(datasets.Sequence(datasets.Value("float32"))),
            "cell": datasets.Array2D(shape=(3, 3), dtype="float32"),
            "pbc": datasets.Sequence(datasets.Value("bool")),
            "energy": datasets.Value("float32"),
            "stress": datasets.Array2D(shape=(3, 3), dtype="float32"),
            "e_per_atom_relaxed": datasets.Value("float32"),
            "mp_id": datasets.Value("string"),
            "energy_per_atom": datasets.Value("float32"),
            "ef_per_atom_relaxed": datasets.Value("float32"),
            "corrected_total_energy": datasets.Value("float32"),
            "ef_per_atom": datasets.Value("float32"),
            "task_id": datasets.Value("string"),
            "calc_id": datasets.Value("int64"),
            "ionic_step": datasets.Value("int64"),
            "filename": datasets.Value("string"),
            "extxyz_id": datasets.Value("int64"),
            "num_atoms": datasets.Value("int64"),
        }
    ),
)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/32 [00:00<?, ?it/s]

Dataset({
    features: ['numbers', 'positions', 'forces', 'cell', 'pbc', 'energy', 'stress', 'e_per_atom_relaxed', 'mp_id', 'energy_per_atom', 'ef_per_atom_relaxed', 'corrected_total_energy', 'ef_per_atom', 'task_id', 'calc_id', 'ionic_step', 'filename', 'extxyz_id', 'num_atoms'],
    num_rows: 1580395
})

In [18]:
d_test = dataset.train_test_split(test_size=0.0065)
train, test = d_test["train"], d_test["test"]

d_val = train.train_test_split(test_size=0.0065)
train, val = d_val["train"], d_val["test"]

rich.print({"train": train, "val": val, "test": test})

In [20]:
ddict = datasets.DatasetDict({"train": train, "val": val, "test": test})
rich.print(ddict)

ddict.push_to_hub("nimashoghi/mptrj", private=True)

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/312 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/312 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/312 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/312 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/312 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nimashoghi/mptrj/commit/e75795f0c9f0c0ed44bcf8012689f5fc180c19d4', commit_message='Upload dataset', commit_description='', oid='e75795f0c9f0c0ed44bcf8012689f5fc180c19d4', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# Split dataset into train, validation and test
train, test = dataset.train_test_split(test_size=0.1)
train, val = train.train_test_split(test_size=0.1)

rich.print({"train": train, "val": val, "test": test})

AttributeError: 'str' object has no attribute 'train_test_split'

In [None]:
dt = dataset.with_format("torch")
rich.print(dt[0])

rich.print(dt.features)

In [None]:
dataset.push_to_hub("nimashoghi/mptrj", private=True)