Skip to content

Commit

Permalink
Add ASE ingester and generalize other ingestion utilities (#1509)
Browse files Browse the repository at this point in the history
* Add `ingest_from` method to adapters and generalize ingester tests

* Add several utility functions for structures, adapters and relevant tests

- Update test error messages for new validator

* Add ASE ingester

* Add implicit-type `ingest_from` method

* Add README note about adapters

* Handling of ASE custom metadata fields and provider fields in ASE adapters
  • Loading branch information
ml-evs committed Feb 14, 2023
1 parent 7c8f9a6 commit ad8a9fa
Show file tree
Hide file tree
Showing 14 changed files with 358 additions and 88 deletions.
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
<img width="100px" align="center" src="https://matsci.org/uploads/default/original/2X/b/bd2f59b3bf14fb046b74538750699d7da4c19ac1.svg">
</div>

<h1 align="center">
OPTIMADE Python tools
</h1>

# <div align="center">OPTIMADE Python tools</div>

<div align="center">

Expand Down Expand Up @@ -50,6 +47,7 @@ This is to enable interoperability among databases that serve crystal structures
This repository contains a library of tools for implementing and consuming [OPTIMADE APIs](https://www.optimade.org) using Python:

1. [pydantic](https://github.com/pydantic/pydantic) data models for all [OPTIMADE entry types](https://www.optimade.org/optimade-python-tools/latest/all_models/) and endpoint responses, and a [Lark](https://github.com/lark-parser/lark) [EBNF grammar](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) implementation for the OPTIMADE filter language.
1. Adapters to map OPTIMADE data to and from many commonly used atomistic Python frameworks (e.g., [pymatgen](https://pymatgen.org/), [ASE](https://wiki.fysik.dtu.dk/ase/)) and crystallographic file types (e.g., [CIF](https://www.iucr.org/resources/cif)), using the `optimade.adapters` module.
1. A configurable reference server implementation that can make use of either MongoDB or Elasticsearch database backends out-of-the-box, and is readily extensible to other backends. Try it out on the [demo site](https://optimade.fly.dev)! The OpenAPI schemas of the server are used to construct the [OPTIMADE schemas](https://schemas.optimade.org/) site.
1. An [OPTIMADE client](https://www.optimade.org/optimade-python-tools/latest/getting_started/client/) (`optimade-get`) that can query multiple [OPTIMADE providers](https://optimade.org/providers-dashboard) concurrently with a given filter, at the command-line or from Python code.
1. A fuzzy API validator tool, which may be called from the shell (`optimade-validator`) or used as a GitHub Action from [optimade-validator-action](https://github.com/Materials-Consortia/optimade-validator-action); this validator is used to construct the [providers dashboard](https://optimade.org/providers-dashboard).
Expand Down
46 changes: 46 additions & 0 deletions optimade/adapters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ class EntryAdapter:
Attributes:
ENTRY_RESOURCE: Entry resource to store entry as.
_type_converters: Dictionary of valid conversion types for entry.
_type_ingesters: Dictionary of valid ingestion types mapped to ingestion functions.
_type_ingesters_by_type: Dictionary mapping the keys of `_type_ingesters` to data
types that can be ingested.
as_<_type_converters>: Convert entry to a type listed in `_type_converters`.
from_<_type_converters>: Convert an external type to the corresponding OPTIMADE model.
Expand All @@ -42,6 +45,7 @@ class EntryAdapter:
ENTRY_RESOURCE: Type[EntryResource] = EntryResource
_type_converters: Dict[str, Callable] = {}
_type_ingesters: Dict[str, Callable] = {}
_type_ingesters_by_type: Dict[str, Type] = {}

def __init__(self, entry: dict) -> None:
"""
Expand Down Expand Up @@ -116,6 +120,48 @@ def convert(self, format: str) -> Any:

return self._converted[format]

@classmethod
def ingest_from(cls, data: Any, format: Optional[str] = None) -> Any:
    """Ingest data from an external format and wrap it in this adapter class.

    Parameters:
        data (Any): The external object to ingest.
        format (str): Key of `_type_ingesters` naming the type or format that
            `data` is in. If `None`, the format is inferred by testing `data`
            against the types registered in `_type_ingesters_by_type`; the
            first matching entry wins.

    Raises:
        AttributeError: If `format` is `None` and no type registered in
            `_type_ingesters_by_type` matches `data`, or if `format` is not
            a key of `_type_ingesters`.

    Returns:
        An instance of this adapter class wrapping the ingested entry.

    """
    valid_formats = tuple(cls._type_ingesters.keys())

    if format is None:
        # Infer the format from the runtime type of `data`.
        format = next(
            (
                key
                for key, instance_type in cls._type_ingesters_by_type.items()
                if isinstance(data, instance_type)
            ),
            None,
        )
        if format is None:
            raise AttributeError(
                f"Unable to infer an ingestion format for data of type {type(data)}.\n"
                f"Valid entry types: {valid_formats}"
            )

    if format not in cls._type_ingesters:
        raise AttributeError(
            f"Non-valid entry type to ingest from: {format}\n"
            f"Valid entry types: {valid_formats}"
        )

    # The ingester returns a model exposing `.dict()`; its contents become the
    # `attributes` of a new entry with an empty `id`.
    # NOTE(review): the entry type is hard-coded to "structures" although this
    # method is defined on the generic adapter -- confirm whether other entry
    # types are expected to define ingesters.
    return cls(
        {
            "attributes": cls._type_ingesters[format](data).dict(),
            "id": "",
            "type": "structures",
        }
    )

@staticmethod
def _get_model_attributes(
starting_instances: Union[Tuple[BaseModel, ...], List[BaseModel]], name: str
Expand Down
10 changes: 9 additions & 1 deletion optimade/adapters/structures/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from optimade.models import StructureResource

from .aiida import get_aiida_structure_data
from .ase import get_ase_atoms
from .ase import Atoms as ASEAtoms
from .ase import from_ase_atoms, get_ase_atoms
from .cif import get_cif
from .jarvis import get_jarvis_atoms
from .proteindatabank import get_pdb, get_pdbx_mmcif
from .pymatgen import Structure as PymatgenStructure
from .pymatgen import from_pymatgen, get_pymatgen


Expand Down Expand Up @@ -55,4 +57,10 @@ class Structure(EntryAdapter):

_type_ingesters: Dict[str, Callable] = {
"pymatgen": from_pymatgen,
"ase": from_ase_atoms,
}

_type_ingesters_by_type: Dict[str, Type] = {
"pymatgen": PymatgenStructure,
"ase": ASEAtoms,
}
76 changes: 73 additions & 3 deletions optimade/adapters/structures/ase.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@
from typing import Dict

from optimade.adapters.exceptions import ConversionError
from optimade.adapters.structures.utils import species_from_species_at_sites
from optimade.adapters.structures.utils import (
elements_ratios_from_species_at_sites,
species_from_species_at_sites,
)
from optimade.models import Species as OptimadeStructureSpecies
from optimade.models import StructureFeatures
from optimade.models import StructureResource as OptimadeStructure
from optimade.models.structures import StructureResourceAttributes
from optimade.models.utils import anonymize_formula, reduce_formula

EXTRA_FIELD_PREFIX = "ase"

try:
from ase import Atom, Atoms
Expand All @@ -26,7 +33,7 @@
ASE_NOT_FOUND = "ASE not found, cannot convert structure to an ASE Atoms"


__all__ = ("get_ase_atoms",)
__all__ = ("get_ase_atoms", "from_ase_atoms")


def get_ase_atoms(optimade_structure: OptimadeStructure) -> Atoms:
Expand Down Expand Up @@ -82,6 +89,69 @@ def get_ase_atoms(optimade_structure: OptimadeStructure) -> Atoms:

atoms.append(Atom(symbol=species_name, position=site, mass=mass))

info = {}
for key in attributes.dict().keys():
if key.startswith("_"):
ase_key = key
if key.startswith(f"_{EXTRA_FIELD_PREFIX}_"):
ase_key = "".join(key.split(f"_{EXTRA_FIELD_PREFIX}_")[1:])
info[ase_key] = getattr(attributes, key)

return Atoms(
symbols=atoms, cell=attributes.lattice_vectors, pbc=attributes.dimension_types
symbols=atoms,
cell=attributes.lattice_vectors,
pbc=attributes.dimension_types,
info=info if info else None,
)


def from_ase_atoms(atoms: Atoms) -> StructureResourceAttributes:
    """Convert an ASE `Atoms` object into an OPTIMADE `StructureResourceAttributes` model.

    Entries of `atoms.info` are carried over as extra attributes: each key is
    lower-cased and, unless it already starts with `_ase_`, prefixed with
    `_ase_`.

    Parameters:
        atoms: The ASE `Atoms` object to convert.

    Returns:
        An OPTIMADE `StructureResourceAttributes` model, which can be converted to a raw Python
        dictionary with `.dict()` or to JSON with `.json()`.

    Raises:
        RuntimeError: If `atoms` is not an ASE `Atoms` instance.

    """
    if not isinstance(atoms, Atoms):
        raise RuntimeError(
            f"Cannot convert type {type(atoms)} into an OPTIMADE `StructureResourceAttributes` model."
        )

    species_at_sites = atoms.get_chemical_symbols()
    species = species_from_species_at_sites(species_at_sites)
    dimension_types = [int(_) for _ in atoms.pbc.tolist()]
    descriptive_formula = atoms.get_chemical_formula()
    reduced_formula = reduce_formula(descriptive_formula)

    attributes = {
        "cartesian_site_positions": atoms.positions.tolist(),
        "lattice_vectors": atoms.cell.tolist(),
        "species_at_sites": species_at_sites,
        "elements_ratios": elements_ratios_from_species_at_sites(species_at_sites),
        "species": species,
        "dimension_types": dimension_types,
        "nperiodic_dimensions": sum(dimension_types),
        "nelements": len(species),
        "elements": sorted(_.name for _ in species),
        "nsites": len(species_at_sites),
        "chemical_formula_descriptive": descriptive_formula,
        "chemical_formula_reduced": reduced_formula,
        "chemical_formula_anonymous": anonymize_formula(reduced_formula),
        "last_modified": None,
        "immutable_id": None,
        "structure_features": [],
    }

    # Test the prefix on the lower-cased key (the form that is actually
    # stored) and include the trailing underscore, so that e.g. `_ASE_foo`
    # maps to `_ase_foo` rather than `_ase__ase_foo`, and so the check matches
    # the `_ase_` prefix that `get_ase_atoms` strips on the way back.
    own_prefix = f"_{EXTRA_FIELD_PREFIX}_"
    for key, value in atoms.info.items():
        optimade_key = key.lower()
        if not optimade_key.startswith(own_prefix):
            optimade_key = f"{own_prefix}{optimade_key}"
        attributes[optimade_key] = value

    return StructureResourceAttributes(**attributes)
41 changes: 6 additions & 35 deletions optimade/adapters/structures/pymatgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
from optimade.models import Species as OptimadeStructureSpecies
from optimade.models import StructureResource as OptimadeStructure
from optimade.models import StructureResourceAttributes
from optimade.models.utils import anonymize_formula, reduce_formula

try:
from pymatgen.core import Composition, Lattice, Molecule, Structure
from pymatgen.core import Lattice, Molecule, Structure

except (ImportError, ModuleNotFoundError):
from warnings import warn
Expand Down Expand Up @@ -168,14 +169,14 @@ def from_pymatgen(pmg_structure: Structure) -> StructureResourceAttributes:
attributes["dimension_types"] = [int(_) for _ in pmg_structure.lattice.pbc]
attributes["nperiodic_dimensions"] = sum(attributes["dimension_types"])
attributes["nelements"] = len(pmg_structure.composition.elements)
attributes["chemical_formula_anonymous"] = _pymatgen_anonymized_formula_to_optimade(
pmg_structure.composition
attributes["chemical_formula_anonymous"] = anonymize_formula(
pmg_structure.composition.formula
)
attributes["elements"] = sorted(
[_.symbol for _ in pmg_structure.composition.elements]
)
attributes["chemical_formula_reduced"] = _pymatgen_reduced_formula_to_optimade(
pmg_structure.composition
attributes["chemical_formula_reduced"] = reduce_formula(
pmg_structure.composition.formula
)
attributes["chemical_formula_descriptive"] = pmg_structure.composition.formula
attributes["elements_ratios"] = [
Expand All @@ -188,33 +189,3 @@ def from_pymatgen(pmg_structure: Structure) -> StructureResourceAttributes:
attributes["structure_features"] = []

return StructureResourceAttributes(**attributes)


def _pymatgen_anonymized_formula_to_optimade(composition: Composition) -> str:
    """Build an OPTIMADE `chemical_formula_anonymous` string from a pymatgen `Composition`.

    The amounts found in `composition.anonymized_formula` are taken in
    reverse order and paired with the labels yielded by
    `anonymous_element_generator()`.
    """
    import re

    from optimade.models.utils import anonymous_element_generator

    # Splitting on the capital-letter labels leaves only the amounts; the
    # first piece is the text before the first label and is dropped.
    amounts = re.split("[A-Z]", composition.anonymized_formula)[1:]

    return "".join(
        label + amount
        for label, amount in zip(anonymous_element_generator(), reversed(amounts))
    )


def _pymatgen_reduced_formula_to_optimade(composition: Composition) -> str:
    """Build an OPTIMADE `chemical_formula_reduced` string from a pymatgen `Composition`.

    Element symbols are emitted in sorted order, each followed by its amount
    from `composition.to_reduced_dict` divided by the greatest common divisor
    of all amounts; quotients not greater than 1 are omitted.
    """
    import numpy

    reduced = composition.to_reduced_dict
    divisor = numpy.gcd.reduce([int(amount) for amount in reduced.values()])

    pieces = []
    for symbol in sorted(element.symbol for element in composition.elements):
        amount = reduced[symbol]
        suffix = int(amount) // divisor if amount // divisor > 1 else ""
        pieces.append(f"{symbol}{suffix}")

    return "".join(pieces)
11 changes: 11 additions & 0 deletions optimade/adapters/structures/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,14 @@ def species_from_species_at_sites(
OptimadeStructureSpecies(name=_, concentration=[1.0], chemical_symbols=[_])
for _ in set(species_at_sites)
]


def elements_ratios_from_species_at_sites(species_at_sites: List[str]) -> List[float]:
    """Compute the OPTIMADE `elements_ratios` field from `species_at_sites` in the case where `species_at_sites` refers
    to sites wholly occupied by the given elements, e.g., not arbitrary species labels or with partial/mixed occupancy.

    Parameters:
        species_at_sites: The element symbol occupying each site.

    Returns:
        The fraction of sites occupied by each distinct element, ordered by
        sorted element symbol. An empty input yields an empty list.

    """
    from collections import Counter

    # A single pass over the sites, instead of one `list.count` scan per
    # distinct element.
    counts = Counter(species_at_sites)
    num_sites = len(species_at_sites)
    return [counts[element] / num_sites for element in sorted(counts)]
18 changes: 4 additions & 14 deletions optimade/models/structures.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
# pylint: disable=no-self-argument,line-too-long,no-name-in-module
import math
import re
import sys
import warnings
from enum import Enum, IntEnum
from functools import reduce
from typing import List, Optional, Union

from pydantic import BaseModel, conlist, root_validator, validator
Expand All @@ -18,6 +15,7 @@
OptimadeField,
StrictField,
SupportLevel,
reduce_formula,
)
from optimade.warnings import MissingExpectedField

Expand Down Expand Up @@ -895,18 +893,10 @@ def check_reduced_formulae(cls, value, field):
if value is None:
return value

numbers = [n.strip() or 1 for n in re.split(r"[A-Z][a-z]*", value)]
# Need to remove leading 1 from split and convert to ints
numbers = [int(n) for n in numbers[1:]]

if sys.version_info[1] >= 9:
gcd = math.gcd(*numbers)
else:
gcd = reduce(math.gcd, numbers)

if gcd != 1:
reduced_formula = reduce_formula(value)
if reduced_formula != value:
raise ValueError(
f"{field.name} {value!r} is not properly reduced: greatest common divisor was {gcd}, expected 1."
f"{field.name} {value!r} is not properly reduced: expected {reduced_formula!r}."
)

return value
Expand Down

0 comments on commit ad8a9fa

Please sign in to comment.