Skip to content

Commit

Permalink
Merge pull request #7 from MolSSI/provenance
Browse files Browse the repository at this point in the history
Provenance
  • Loading branch information
dgasmith committed Oct 26, 2018
2 parents 93fff9c + 0bf082c commit 68aa684
Show file tree
Hide file tree
Showing 17 changed files with 673 additions and 207 deletions.
145 changes: 102 additions & 43 deletions qcelemental/molparse/from_arrays.py
@@ -1,10 +1,13 @@
import re
import pprint
import keyword

import numpy as np

from ..util import distance_matrix, update_with_error, unnp
from ..util import distance_matrix, update_with_error, unnp, provenance_stamp
from ..physical_constants import constants
from ..exceptions import ValidationError
from .regex import VERSION_PATTERN
from .chgmult import validate_and_fill_chgmult
from .nucleus import reconcile_nucleus

Expand Down Expand Up @@ -124,38 +127,42 @@ def from_input_arrays(
return molinit


def from_arrays(geom=None,
elea=None,
elez=None,
elem=None,
mass=None,
real=None,
elbl=None,
name=None,
units='Angstrom',
input_units_to_au=None,
fix_com=None,
fix_orientation=None,
fix_symmetry=None,
fragment_separators=None,
fragment_charges=None,
fragment_multiplicities=None,
molecular_charge=None,
molecular_multiplicity=None,
fragment_files=None,
hint_types=None,
geom_hints=None,
geom_unsettled=None,
variables=None,
domain='qm',
missing_enabled_return='error',
np_out=True,
speclabel=True,
tooclose=0.1,
zero_ghost_fragments=False,
nonphysical=False,
mtol=1.e-3,
verbose=1):
def from_arrays(
geom=None,
elea=None,
elez=None,
elem=None,
mass=None,
real=None,
elbl=None,
name=None,
units='Angstrom',
input_units_to_au=None,
fix_com=None,
fix_orientation=None,
fix_symmetry=None,
fragment_separators=None,
fragment_charges=None,
fragment_multiplicities=None,
molecular_charge=None,
molecular_multiplicity=None,
comment=None,
provenance=None,
#connectivity=None,
fragment_files=None,
hint_types=None,
geom_hints=None,
geom_unsettled=None,
variables=None,
domain='qm',
missing_enabled_return='error',
np_out=True,
speclabel=True,
tooclose=0.1,
zero_ghost_fragments=False,
nonphysical=False,
mtol=1.e-3,
verbose=1):
"""Compose a Molecule dict from unvalidated arrays and variables, returning dict.
See fields of Return molrec below. Required parameters (for QM XYZ)
Expand Down Expand Up @@ -187,7 +194,7 @@ def from_arrays(geom=None,
Returns
-------
molrec : dict
Molecule dictionary spec follows. Its principles are
Molecule dictionary spec follows. Its principles are
(1) contents are fully validated and defaulted - no error
checking necessary,
Expand All @@ -198,7 +205,7 @@ def from_arrays(geom=None,
(3) big system, nat-length single-type arrays, not small system,
nat-number heterogeneous objects,
(4) some fields are optional (e.g., symmetry) but largely
(4) some fields are optional (e.g., fix_symmetry) but largely
self-describing so units or fix_com must be present.
(5) apart from some mild optional fields, _all_ fields will
Expand All @@ -208,7 +215,7 @@ def from_arrays(geom=None,
and post-handshake they will be joined by full qm-like molrec.
(6) molrec should be idempotent through this function (equiv to
schema validator) but are not idempostent throughout its life. if
schema validator) but are not idempotent throughout its life. if
fields permit, frame may be changed. Future? if fields permit,
mol may be symmetrized. Coordinates and angles may change units
or range if program returns them in only one form.
Expand Down Expand Up @@ -249,6 +256,12 @@ def from_arrays(geom=None,
total charge on system.
molecular_multiplicity : int
total multiplicity on system.
comment : str, optional
Additional comment for molecule.
provenance : list of dict of str
Accumulated history of molecule, with fields "creator", "version", "routine".
connectivity : list of tuples of int, optional
(nbond, 3) list of (0-indexed) (atomA, atomB, bond_order) tuples
EFP extension (this + units is minimal)
Expand Down Expand Up @@ -304,7 +317,10 @@ def from_arrays(geom=None,
name=name,
units=units,
input_units_to_au=input_units_to_au,
comment=comment,
provenance=provenance,
always_return_iutau=False) # yapf: disable
processed['provenance'].append(provenance_stamp(__name__))
update_with_error(molinit, processed)

if domain == 'efp':
Expand Down Expand Up @@ -384,12 +400,55 @@ def from_arrays(geom=None,
return molinit


def validate_and_fill_units(name=None, units='Angstrom', input_units_to_au=None, always_return_iutau=False):
def validate_and_fill_units(name=None,
units='Angstrom',
input_units_to_au=None,
comment=None,
provenance=None,
always_return_iutau=False):
molinit = {}

if name is not None:
molinit['name'] = name

if comment is not None:
molinit['comment'] = comment

def validate_provenance(dicary):
expected_prov_keys = ['creator', 'routine', 'version']
try:
prov_keys = sorted(dicary.keys())
except AttributeError:
raise ValidationError("Provenance entry is not dictionary: {}".format(dicary))

if prov_keys == expected_prov_keys:
if not isinstance(dicary['creator'], str):
raise ValidationError(
"""Provenance key 'creator' should be string of creating program's name: {}""".format(
dicary['creator']))
if not re.fullmatch(VERSION_PATTERN, dicary['version'], re.VERBOSE):
raise ValidationError("""Provenance key 'version' should be a valid PEP 440 string: {}""".format(
dicary['version']))
if not isinstance(dicary['routine'], str):
raise ValidationError(
"""Provenance key 'routine' should be string of creating function's name: {}""".format(
dicary['routine']))
return True
else:
raise ValidationError('Provenance keys ({}) incorrect: {}'.format(expected_prov_keys, prov_keys))

if provenance is None:
molinit['provenance'] = []
else:
if isinstance(provenance, dict):
if validate_provenance(provenance):
molinit['provenance'] = [provenance]
else:
for prov in provenance:
if validate_provenance(prov):
pass
molinit['provenance'] = provenance

if units.capitalize() in ['Angstrom', 'Bohr']:
molinit['units'] = units.capitalize()
else:
Expand Down Expand Up @@ -467,8 +526,8 @@ def validate_and_fill_efp(fragment_files=None, hint_types=None, geom_hints=None)
or not (len(fragment_files) == len(hint_types) == len(geom_hints))):

raise ValidationError(
"""Missing or inconsistent length among efp quantities: fragment_files ({}), hint_types ({}), and geom_hints ({})""".
format(fragment_files, hint_types, geom_hints))
"""Missing or inconsistent length among efp quantities: fragment_files ({}), hint_types ({}), and geom_hints ({})"""
.format(fragment_files, hint_types, geom_hints))

# NOTE: imposing case on file
try:
Expand Down Expand Up @@ -621,8 +680,8 @@ def validate_and_fill_fragments(nat, fragment_separators=None, fragment_charges=
format(split_geom))
if sum(len(f) for f in split_geom) != nat:
raise ValidationError(
"""fragment_separators ({}) yields overlapping fragment(s) after trial np.split on geometry, possibly unsorted.""".
format(split_geom))
"""fragment_separators ({}) yields overlapping fragment(s) after trial np.split on geometry, possibly unsorted."""
.format(split_geom))
frs = fragment_separators
nfr = len(split_geom)

Expand Down Expand Up @@ -665,8 +724,8 @@ def validate_and_fill_unsettled_geometry(geom_unsettled, variables):
for il in range(len(lgeom) - 1):
if lgeom[il + 1] not in allowed_to_follow[lgeom[il]]:
raise ValidationError(
"""This is not how a Zmat works - aim for lower triangular. Line len ({}) may be followed by line len ({}), not ({}).""".
format(lgeom[il], allowed_to_follow[lgeom[il]], lgeom[il + 1]))
"""This is not how a Zmat works - aim for lower triangular. Line len ({}) may be followed by line len ({}), not ({})."""
.format(lgeom[il], allowed_to_follow[lgeom[il]], lgeom[il + 1]))

if not all(len(v) == 2 for v in variables):
raise ValidationError("""Variables should come in pairs: {}""".format(variables))
Expand Down
11 changes: 7 additions & 4 deletions qcelemental/molparse/from_schema.py
@@ -1,5 +1,6 @@
import numpy as np

from ..util import provenance_stamp
from ..exceptions import ValidationError
from .from_arrays import from_arrays

Expand All @@ -22,10 +23,6 @@ def from_schema(molschema, verbose=1):
"""

if (molschema.get('schema_name', '').startswith('qc_schema') and (molschema.get('schema_version', '') == 1)):
# Lost Fields
# -----------
# * 'comment'
# * 'provenance'
ms = molschema['molecule']

if 'fragments' in ms:
Expand Down Expand Up @@ -63,14 +60,20 @@ def from_schema(molschema, verbose=1):
fragment_multiplicities=ms.get('fragment_multiplicities', None),
molecular_charge=ms.get('molecular_charge', None),
molecular_multiplicity=ms.get('molecular_multiplicity', None),
comment=ms.get('comment', None),
provenance=ms.get('provenance', None),
domain='qm',
#missing_enabled_return=missing_enabled_return,
speclabel=False,
#tooclose=tooclose,
#zero_ghost_fragments=zero_ghost_fragments,
#nonphysical=nonphysical,
#mtol=mtol,
verbose=verbose)

# replace from_arrays stamp with from_schema stamp
molrec['provenance'][-1] = provenance_stamp(__name__)

else:
raise ValidationError("""Schema not recognized, schema_name/schema_version: {}/{} """.format(
molschema.get('schema_name', '(none)'), molschema.get('schema_version', '(none)')))
Expand Down
8 changes: 7 additions & 1 deletion qcelemental/molparse/from_string.py
@@ -1,7 +1,7 @@
import re
import pprint

from ..util import filter_comments
from ..util import filter_comments, provenance_stamp
from ..exceptions import ValidationError, ChoicesError, MoleculeFormatError
from .from_arrays import from_input_arrays
from .regex import NUCLEUS, NUMBER, SEP, ENDL, CHGMULT, CARTXYZ
Expand Down Expand Up @@ -259,6 +259,12 @@ def parse_as_psi4_ish(molstr, unsettled):
missing_enabled_return_efp=missing_enabled_return_efp,
**molinit)

# replace from_arrays stamp with from_string stamp
if 'qm' in molrec and molrec['qm']:
molrec['qm']['provenance'][-1] = provenance_stamp(__name__)
if 'efp' in molrec and molrec['efp']:
molrec['efp']['provenance'][-1] = provenance_stamp(__name__)

if verbose >= 2:
print('\nFROM_STRING MOLREC <<<', molrec, '>>>\n')

Expand Down
36 changes: 36 additions & 0 deletions qcelemental/molparse/regex.py
@@ -1,3 +1,5 @@
# remember to use re.VERBOSE with NUCLEUS, NUMBER, VERSION_PATTERN

NUCLEUS = r"""(?:
(?P<gh1>@)|(?P<gh2>Gh\())? # optional ghost: @stuff or Gh(stuff) ...
( # mandatory element: AEuser or Zuser
Expand All @@ -24,3 +26,37 @@

CHGMULT = r"""(?P<chg>""" + NUMBER + r')' + SEP + r"""(?P<mult>\d+)"""
CARTXYZ = r'(?P<x>' + NUMBER + r')' + SEP + r'(?P<y>' + NUMBER + r')' + SEP + r'(?P<z>' + NUMBER + r')'

# PEP 440 valid versions
# * to avoid dependency on module "packaging", copied from https://github.com/pypa/packaging/blob/master/packaging/version.py#L182-L213
# * Deliberately not anchored to the start and end of the string, to make it easier for 3rd party code to reuse
VERSION_PATTERN = r"""
v?
(?:
(?:(?P<epoch>[0-9]+)!)? # epoch
(?P<release>[0-9]+(?:\.[0-9]+)*) # release segment
(?P<pre> # pre-release
[-_\.]?
(?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
[-_\.]?
(?P<pre_n>[0-9]+)?
)?
(?P<post> # post release
(?:-(?P<post_n1>[0-9]+))
|
(?:
[-_\.]?
(?P<post_l>post|rev|r)
[-_\.]?
(?P<post_n2>[0-9]+)?
)
)?
(?P<dev> # dev release
[-_\.]?
(?P<dev_l>dev)
[-_\.]?
(?P<dev_n>[0-9]+)?
)?
)
(?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))? # local version
"""

0 comments on commit 68aa684

Please sign in to comment.