From b7563d021f46d9f58659bb9081b97eefccc3e13e Mon Sep 17 00:00:00 2001 From: Rowland Ogwara Date: Mon, 17 May 2021 17:53:43 -0500 Subject: [PATCH] feat(versioning): Add support for auto data versioning (#355) DEV-17: datamodels auto compute tag Adds support for tagging nodes based on properties set in the dictionary. Adds the following sysan properties * version: version number for the given node * tag: a uuid string that represents nodes of same version (ie, all nodes with same tag are versions of each other) * latest: True, if the given node is the latest These values are evaluated using properties defined in the dictionary (currently only biodictionary defines those properties). The property name is tagProperties --- .pre-commit-config.yaml | 1 + .secrets.baseline | 4 +- .travis.yml | 1 + gdcdatamodel/models/__init__.py | 55 ++++++++++++---- gdcdatamodel/models/versioning.py | 76 +++++++++++++++++++++ psql-users.sh | 3 +- test/conftest.py | 60 +++++++---------- test/helpers.py | 66 +++++++++++++++++++ test/models.py | 16 +++++ test/sample.yaml | 17 +++++ test/schema/__init__.py | 0 test/schema/basic.yaml | 105 ++++++++++++++++++++++++++++++ test/schema/data/sample.yaml | 43 ++++++++++++ test/test_node_tagging.py | 62 ++++++++++++++++++ test/unit/__init__.py | 0 test/unit/test_tagging.py | 57 ++++++++++++++++ 16 files changed, 514 insertions(+), 52 deletions(-) create mode 100644 gdcdatamodel/models/versioning.py create mode 100644 test/helpers.py create mode 100644 test/models.py create mode 100644 test/sample.yaml create mode 100644 test/schema/__init__.py create mode 100644 test/schema/basic.yaml create mode 100644 test/schema/data/sample.yaml create mode 100644 test/test_node_tagging.py create mode 100644 test/unit/__init__.py create mode 100644 test/unit/test_tagging.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 24346d9b..1ad6162a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,4 @@ +repos: - repo: git@github.com:Yelp/detect-secrets rev: v0.13.0 hooks: diff --git a/.secrets.baseline b/.secrets.baseline index 7546ab3b..a1546e59 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "^.secrets.baseline$", "lines": null }, - "generated_at": "2020-10-05T14:36:27Z", + "generated_at": "2021-04-05T20:11:28Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -60,7 +60,7 @@ "hashed_secret": "5d0fa74acf95d1d6bebd0d37f76a94e77d604fd9", "is_secret": false, "is_verified": false, - "line_number": 73, + "line_number": 43, "type": "Basic Auth Credentials" } ], diff --git a/.travis.yml b/.travis.yml index 1c39a36b..55d2501b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,7 @@ install: before_script: - psql -U postgres -c "create user test with superuser password 'test';" - psql -U postgres -c "create database automated_test with owner test;" + - psql -U postgres -c "create database dev_models with owner test;" script: - tox diff --git a/gdcdatamodel/models/__init__.py b/gdcdatamodel/models/__init__.py index 0d1d7ba3..3cd11561 100644 --- a/gdcdatamodel/models/__init__.py +++ b/gdcdatamodel/models/__init__.py @@ -17,6 +17,8 @@ import os import sys +from sqlalchemy.orm.attributes import flag_modified + try: from functools import lru_cache except ImportError: @@ -38,7 +40,8 @@ qcreport, released_data, studyrule, - batch + batch, + versioning, ) from sqlalchemy import ( @@ -353,6 +356,8 @@ def NodeFactory(_id, schema, node_cls=Node, package_namespace=None): name = get_class_name_from_id(_id) links = get_links(schema) + tag_props = schema.get("tagProperties") + @property def node_id(self, value): return self.node_id @@ -361,6 +366,22 @@ def node_id(self, value): def node_id(self, value): self.node_id = value + @property + def tag_properties(self): + return tag_props + + @property + def is_latest(self): + return self._sysan.get("latest", False) + + @property + def version(self): + return self._sysan.get("version") + + @property + def tag(self): + return self._sysan.get("tag") + # Pull the JSONB properties from the `properties` key attributes = { key: PropertyFactory(key, schema) @@ -369,15 +390,16 @@ def node_id(self, value): and key not in excluded_props } - # Store for the programmer - #attributes['_dictionary'] = { - # 'category': schema.get('category'), - # 'title': schema.get('title'), - #} - - skipped_dict_vals = [ '$schema', 'systemProperties', - 'additionalProperties', 'links', 'properties', - 'uniqueKeys', 'id' ] + skipped_dict_vals = [ + '$schema', + 'systemProperties', + 'additionalProperties', + 'links', + 'properties', + 'uniqueKeys', + 'id' , + 'tagProperties' + ] attributes['_dictionary'] = { key: schema[key] for key in schema if key not in skipped_dict_vals } @@ -404,6 +426,12 @@ def node_id(self, value): related_cases_from_cache ) + if tag_props: + attributes["tag"] = tag + attributes["version"] = version + attributes["tag_properties"] = tag_properties + attributes["is_latest"] = is_latest + # _related_cases_from_parents: get ids of related cases from this # nodes parents attributes['_related_cases_from_parents'] = property( @@ -424,6 +452,9 @@ def node_id(self, value): cls_inject_versioned_nodes_lookup(cls) cls_inject_secondary_keys(cls, schema) + if tag_props: + versioning.inject_set_tag_after_insert(cls) + node_cls.add_subclass(cls) return cls @@ -605,7 +636,7 @@ def parse_edge(src_label, dst_label = dictionary.schema[dst_label]['id'] edge_name = ''.join(map(get_class_name_from_id, [ src_label, edge_label, dst_label])) - + if edge_cls.is_subclass_loaded(name): return '_{}_out'.format(edge_name) @@ -640,7 +671,7 @@ def load_edges(dictionary, node_cls=Node, edge_cls=Edge, package_namespace=None) for name, link in get_links(subschema).items(): edge_label = link['label'] edge_name = parse_edge( - src_label, name, edge_label, subschema, link, + src_label, name, edge_label, subschema, link, dictionary=dictionary, node_cls=node_cls, edge_cls=edge_cls, diff --git a/gdcdatamodel/models/versioning.py b/gdcdatamodel/models/versioning.py new file mode 100644 index 00000000..84aaf42e --- /dev/null +++ b/gdcdatamodel/models/versioning.py @@ -0,0 +1,76 @@ +import os +import uuid + +from sqlalchemy import and_, event, select + +UUID_NAMESPACE_SEED = os.getenv("UUID_NAMESPACE_SEED", "86bb916a-24c5-48e4-8a46-5ea73a379d47") +UUID_NAMESPACE = uuid.UUID("urn:uuid:{}".format(UUID_NAMESPACE_SEED), version=4) + + +def __generate_hash(seed, label): + namespace = UUID_NAMESPACE + name = "{}-{}".format(seed, label) + return str(uuid.uuid5(namespace, name)) + + +def compute_tag(node): + """Computes unique tag for given node + Args: + node (models.Node): mode instance + Returns: + str: computed tag + """ + keys = [node.node_id if p == "node_id" else node.props[p] for p in node.tag_properties] + keys += sorted([p.dst.tag or compute_tag(p.dst) for p in node.edges_out if p.label != "relates_to"]) + return __generate_hash(keys, node.label) + + +def __get_tagged_version(node_id, table, tag, conn): + """Super private function to figure out the proper version number to use just after insertion + Args: + node_id (str): current node_id + table (sqlalchemy.Table): node table instance + tag (str): currently computed tag + conn (sqlalchemy.engine.Connection): currently active connection instance + + Returns: + int: appropriate version number to use. 1 greater than the current max + """ + query = select([table]).where( + and_(table.c._sysan["tag"].astext == tag, table.c.node_id != node_id) + ) + max_version = 0 + for r in conn.execute(query): + max_version = max(r._sysan.get("version", 0), max_version) + + # reset latest + r._sysan["latest"] = False + conn.execute( + table.update().where(table.c.node_id == r.node_id).values(_sysan=r._sysan) + ) + return max_version + 1 + + +def inject_set_tag_after_insert(cls): + """Injects an event listener that sets the tag and version properties on nodes, just before they are inserted + Args: + cls (class): node class type + """ + + @event.listens_for(cls, "after_insert") + def set_node_tag(mapper, conn, node): + table = node.__table__ + tag = compute_tag(node) + + version = __get_tagged_version(node.node_id, table, tag, conn) + + node._sysan["tag"] = tag + node._sysan["latest"] = True + node._sysan["version"] = version + + # update tag and version + conn.execute( + table.update() + .where(table.c.node_id == node.node_id) + .values(_sysan=node._sysan) + ) diff --git a/psql-users.sh b/psql-users.sh index e53e9d3c..a0390cf0 100644 --- a/psql-users.sh +++ b/psql-users.sh @@ -1,2 +1,3 @@ psql -U postgres -c "create user test with superuser password 'test';" -psql -U postgres -c "create database automated_test with owner test;" \ No newline at end of file +psql -U postgres -c "create database automated_test with owner test;" +psql -U postgres -c "create database dev_models with owner test;" \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index 8f5e4c40..eac1c4db 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -8,50 +8,20 @@ import random import unittest import uuid +import pkg_resources import pytest +import yaml from gdcdatamodel import models -from psqlgraph import PsqlGraphDriver, create_all, Node, Edge +from psqlgraph import PsqlGraphDriver, mocks from sqlalchemy import create_engine - -def create_tables(engine): - """ - create a table - """ - create_all(engine) - models.versioned_nodes.Base.metadata.create_all(engine) - models.submission.Base.metadata.create_all(engine) - models.redaction.Base.metadata.create_all(engine) - models.qcreport.Base.metadata.create_all(engine) - models.misc.Base.metadata.create_all(engine) +from test.helpers import truncate, create_tables +from test.models import BasicDictionary -def truncate(engine): - """ - Remove data from existing tables - """ - conn = engine.connect() - for table in Node.get_subclass_table_names(): - if table != Node.__tablename__: - conn.execute('delete from {}'.format(table)) - for table in Edge.get_subclass_table_names(): - if table != Edge.__tablename__: - conn.execute('delete from {}'.format(table)) - - # Extend this list as needed - ng_models_metadata = [ - models.versioned_nodes.Base.metadata, - models.submission.Base.metadata, - models.redaction.Base.metadata, - models.qcreport.Base.metadata, - models.misc.Base.metadata, - ] - - for meta in ng_models_metadata: - for table in meta.tables: - conn.execute("DELETE FROM {}".format(table)) - conn.close() +models.load_dictionary(BasicDictionary, "basic") +from gdcdatamodel.models import basic # noqa @pytest.fixture(scope='session') @@ -155,3 +125,19 @@ def setUp(self): def tearDown(self): truncate(self.g.engine) + + +@pytest.fixture(scope="module") +def sample_data(): + with pkg_resources.resource_stream(__name__, "schema/data/sample.yaml") as f: + graph = yaml.safe_load(f) + + f = mocks.GraphFactory(basic, BasicDictionary) + nodes = f.create_from_nodes_and_edges( + nodes=graph["nodes"], + edges=graph["edges"], + unique_key="node_id", + all_props=True, + ) + + return nodes diff --git a/test/helpers.py b/test/helpers.py new file mode 100644 index 00000000..704a38cc --- /dev/null +++ b/test/helpers.py @@ -0,0 +1,66 @@ +import psqlgraph +from psqlgraph import Node, Edge, create_all, ext + +from gdcdatamodel import models + + +def truncate(engine, namespace=None): + """ + Remove data from existing tables + """ + abstract_node = psqlgraph.Node + abstract_edge = psqlgraph.Edge + if namespace: + abstract_node = ext.get_abstract_node(namespace) + abstract_edge = ext.get_abstract_edge(namespace) + conn = engine.connect() + for table in abstract_node.get_subclass_table_names(): + if table != abstract_node.__tablename__: + conn.execute('delete from {}'.format(table)) + for table in abstract_edge.get_subclass_table_names(): + if table != abstract_edge.__tablename__: + conn.execute('delete from {}'.format(table)) + + if not namespace: + # add ng models only to main graph model + truncate_ng_tables(conn) + conn.close() + + +def create_tables(engine, namespace=None): + """ + create a table + """ + + base = psqlgraph.base.ORMBase + if namespace: + base = ext.get_orm_base(namespace) + create_all(engine, base) + + if not namespace: + # add ng models only to main graph + create_ng_tables(engine) + + +def create_ng_tables(engine): + models.versioned_nodes.Base.metadata.create_all(engine) + models.submission.Base.metadata.create_all(engine) + models.redaction.Base.metadata.create_all(engine) + models.qcreport.Base.metadata.create_all(engine) + models.misc.Base.metadata.create_all(engine) + + +def truncate_ng_tables(conn): + + # Extend this list as needed + ng_models_metadata = [ + models.versioned_nodes.Base.metadata, + models.submission.Base.metadata, + models.redaction.Base.metadata, + models.qcreport.Base.metadata, + models.misc.Base.metadata, + ] + + for meta in ng_models_metadata: + for table in meta.tables: + conn.execute("DELETE FROM {}".format(table)) \ No newline at end of file diff --git a/test/models.py b/test/models.py new file mode 100644 index 00000000..177879dd --- /dev/null +++ b/test/models.py @@ -0,0 +1,16 @@ +import pkg_resources +import yaml + + +def _load(name): + with pkg_resources.resource_stream(__name__, name) as f: + return yaml.safe_load(f) + + +class Dictionary: + + def __init__(self, name): + self.schema = _load(name) + + +BasicDictionary = Dictionary("schema/basic.yaml") diff --git a/test/sample.yaml b/test/sample.yaml new file mode 100644 index 00000000..7229e6b3 --- /dev/null +++ b/test/sample.yaml @@ -0,0 +1,17 @@ +nodes: + - label: program + name: GDC + node_id: pg_1 + - label: project + name: MISC + code: MISC + node_id: pj_1 + - label: case + project_id: GDC-MISC + submitter_id: SAMPLE_! + node_id: case_1 +edges: + - src: pg_1 + dst: pj_1 + - src: pj_1 + dst: case_1 diff --git a/test/schema/__init__.py b/test/schema/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/schema/basic.yaml b/test/schema/basic.yaml new file mode 100644 index 00000000..63d8b3a5 --- /dev/null +++ b/test/schema/basic.yaml @@ -0,0 +1,105 @@ +# dummy dictionary for testing purposes +program: + id: program + category: administrative + tagProperties: + - name + required: + - name + properties: + name: + type: string +project: + id: project + category: administrative + tagProperties: + - code + required: + - code + properties: + code: + type: string + name: + type: string + links: + - name: programs + backref: projects + label: member_of + target_type: program + multiplicity: many_to_one + required: true +case: + id: case + category: administrative + tagProperties: + - submitter_id + required: + - submitter_id + properties: + submitter_id: + type: string + consent_type: + enum: + - Consent by Death + - Consent Exemption + - Consent Waiver + - Informed Consent + links: + - name: projects + backref: cases + label: member_of + target_type: project + multiplicity: many_to_one + required: true +sample: + id: sample + category: biospecimen + tagProperties: + - submitter_id + required: + - submitter_id + properties: + submitter_id: + type: string + catalog_reference: + type: string + links: + - name: cases + backref: samples + label: derived_from + target_type: case + multiplicity: many_to_one + required: true +center: + id: center + category: administrative + required: + - code + properties: + code: + type: string + tagProperties: + - code +portion: + id: portion + category: biospecimen + required: + - submitter_id + properties: + submitter_id: + type: string + tagProperties: + - submitter_id + links: + - name: samples + backref: portions + label: derived_from + target_type: sample + multiplicity: many_to_one + required: true + - name: centers + backref: portions + label: shipped_to + target_type: center + multiplicity: many_to_one + required: false diff --git a/test/schema/data/sample.yaml b/test/schema/data/sample.yaml new file mode 100644 index 00000000..731d6534 --- /dev/null +++ b/test/schema/data/sample.yaml @@ -0,0 +1,43 @@ +nodes: + - label: program + name: GDC + node_id: ed9aa864-1e40-4657-9378-7e3dc26551cc + - label: project + code: MISC + node_id: c6a795f6-ee4a-4fcd-bfed-79348e07cd49 + - label: center + code: T1 + node_id: fb69d25b-5c5d-4879-8955-8f2126e57524 + - label: case + node_id: be66197b-f6cc-4366-bded-365856ec4f63 + submitter_id: BSC_1 + - label: case # version 2 case + node_id: a2b2d27a-6523-4ddd-8b2e-e94437a2aa23 + submitter_id: BSC_1 + - label: sample + node_id: 813f97c4-dffc-4f94-b3f6-66a93476a233 + submitter_id: sample_1 + catelog_reference: Zoom + - label: portion + node_id: 6974c692-be47-4cb8-b8d6-9bd815983cd9 + submitter_id: portion_1 + - label: portion # version 2 portion + node_id: 5ffb4b0e-969e-4643-8187-536ce7130e9c + submitter_id: portion_1 +edges: + - src: c6a795f6-ee4a-4fcd-bfed-79348e07cd49 + dst: ed9aa864-1e40-4657-9378-7e3dc26551cc + - src: be66197b-f6cc-4366-bded-365856ec4f63 + dst: c6a795f6-ee4a-4fcd-bfed-79348e07cd49 + - src: a2b2d27a-6523-4ddd-8b2e-e94437a2aa23 + dst: c6a795f6-ee4a-4fcd-bfed-79348e07cd49 + - src: 813f97c4-dffc-4f94-b3f6-66a93476a233 + dst: be66197b-f6cc-4366-bded-365856ec4f63 + - src: 6974c692-be47-4cb8-b8d6-9bd815983cd9 + dst: 813f97c4-dffc-4f94-b3f6-66a93476a233 + - src: 6974c692-be47-4cb8-b8d6-9bd815983cd9 + dst: fb69d25b-5c5d-4879-8955-8f2126e57524 + - src: 5ffb4b0e-969e-4643-8187-536ce7130e9c + dst: 813f97c4-dffc-4f94-b3f6-66a93476a233 + - src: 5ffb4b0e-969e-4643-8187-536ce7130e9c + dst: fb69d25b-5c5d-4879-8955-8f2126e57524 diff --git a/test/test_node_tagging.py b/test/test_node_tagging.py new file mode 100644 index 00000000..bd37f1b4 --- /dev/null +++ b/test/test_node_tagging.py @@ -0,0 +1,62 @@ +import pytest +from psqlgraph import PsqlGraphDriver + +from gdcdatamodel.models import basic # noqa +from test.helpers import create_tables, truncate + + +@pytest.fixture(scope='module') +def bg(): + """Fixture for database driver""" + + cfg = { + 'host': 'localhost', + 'user': 'test', + 'password': 'test', + 'database': 'dev_models', + 'package_namespace': 'basic', + } + + g = PsqlGraphDriver(**cfg) + create_tables(g.engine, namespace="basic") + yield g + truncate(g.engine, namespace="basic") + + +@pytest.fixture(scope="module") +def create_samples(sample_data, bg): + with bg.session_scope() as s: + + version_2s = [] + for node in sample_data: + # delay adding version 2 + if node.node_id in ["a2b2d27a-6523-4ddd-8b2e-e94437a2aa23", "5ffb4b0e-969e-4643-8187-536ce7130e9c"]: + version_2s.append(node) + continue + s.add(node) + s.commit() + for v2 in version_2s: + s.add(v2) + yield + + with bg.session_scope(): + for n in sample_data: + bg.node_delete(n.node_id) + + +@pytest.mark.parametrize("node_id, tag, version", [ + ("be66197b-f6cc-4366-bded-365856ec4f63", "84044bd2-54a4-5837-b83d-f920eb97c18d", 1), + ("a2b2d27a-6523-4ddd-8b2e-e94437a2aa23", "84044bd2-54a4-5837-b83d-f920eb97c18d", 2), + ("813f97c4-dffc-4f94-b3f6-66a93476a233", "9a81bbad-b525-568c-b85d-d269a8bdc70a", 1), + ("6974c692-be47-4cb8-b8d6-9bd815983cd9", "55814b2f-fc23-5bed-9eab-c73c52c105df", 1), + ("5ffb4b0e-969e-4643-8187-536ce7130e9c", "55814b2f-fc23-5bed-9eab-c73c52c105df", 2), + ("c6a795f6-ee4a-4fcd-bfed-79348e07cd49", "8cc95392-5861-5524-8b98-a85e18d8294c", 1), + ("ed9aa864-1e40-4657-9378-7e3dc26551cc", "fddc5826-8853-5c1a-847d-5850d58ccb3e", 1), + ("fb69d25b-5c5d-4879-8955-8f2126e57524", "293d5dd3-117c-5a0a-8030-a428fdf2681b", 1), +]) +def test_1(create_samples, bg, node_id, tag, version): + + with bg.session_scope(): + node = bg.nodes().get(node_id) + assert node.tag == tag + assert node.version == version diff --git a/test/unit/__init__.py b/test/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/unit/test_tagging.py b/test/unit/test_tagging.py new file mode 100644 index 00000000..100a5185 --- /dev/null +++ b/test/unit/test_tagging.py @@ -0,0 +1,57 @@ +from gdcdatamodel.models import versioning as v +from gdcdatamodel.models import basic # noqa + + +EXPECTED_TAGS = { + "be66197b-f6cc-4366-bded-365856ec4f63": "84044bd2-54a4-5837-b83d-f920eb97c18d", + "a2b2d27a-6523-4ddd-8b2e-e94437a2aa23": "84044bd2-54a4-5837-b83d-f920eb97c18d", + "813f97c4-dffc-4f94-b3f6-66a93476a233": "9a81bbad-b525-568c-b85d-d269a8bdc70a", + "6974c692-be47-4cb8-b8d6-9bd815983cd9": "55814b2f-fc23-5bed-9eab-c73c52c105df", + "5ffb4b0e-969e-4643-8187-536ce7130e9c": "55814b2f-fc23-5bed-9eab-c73c52c105df", + "c6a795f6-ee4a-4fcd-bfed-79348e07cd49": "8cc95392-5861-5524-8b98-a85e18d8294c", + "ed9aa864-1e40-4657-9378-7e3dc26551cc": "fddc5826-8853-5c1a-847d-5850d58ccb3e", + "fb69d25b-5c5d-4879-8955-8f2126e57524": "293d5dd3-117c-5a0a-8030-a428fdf2681b", +} + + +def test_compute_tag(sample_data): + """Tests version tags are computed correctly per node""" + + for node in sample_data: + print("\n..........{}...........".format(node)) + v_tag = v.compute_tag(node) + assert v_tag == EXPECTED_TAGS[node.node_id], "invalid tag computed for {}".format(node.node_id) + + +def test_multi_parent(sample_data): + """Test version tag resolves to the same value independent of how the parents were attached""" + + portion = basic.Portion(node_id="b9b6fdb3-6c31-4ed3-9f8c-67d4eae72102", submitter_id="portion_2") + v_tag = v.compute_tag(portion) + assert v_tag == "5776f97a-a58b-5900-83da-43cbc7105796" + + sample = center = None + for node in sample_data: + if node.label == "center": + center = node + elif node.label == "sample": + sample = node + + if not all([center, sample]): + assert False + + portion.samples.append(sample) + portion.centers.append(center) + v_tag = v.compute_tag(portion) + assert v_tag == "a9a67fae-d916-5843-bdf3-b7db0b7a82a2" + + # unlink + portion.samples = [] + portion.centers = [] + v_tag = v.compute_tag(portion) + assert v_tag == "5776f97a-a58b-5900-83da-43cbc7105796" + + portion.centers.append(center) + portion.samples.append(sample) + v_tag = v.compute_tag(portion) + assert v_tag == "a9a67fae-d916-5843-bdf3-b7db0b7a82a2"