Skip to content

Commit

Permalink
feat(versioning): Add support for auto data versioning (#355)
Browse files Browse the repository at this point in the history
DEV-17: datamodels auto compute tag

Adds support for tagging nodes based on properties set in the dictionary. Adds the following sysan properties:
* version: version number for the given node
* tag: a UUID string shared by all versions of the same node (i.e., all nodes with the same tag are versions of each other)
* latest: True, if the given node is the latest
These values are computed from properties defined in the dictionary (currently only biodictionary defines those properties). The dictionary key that lists them is tagProperties.
  • Loading branch information
kulgan committed May 17, 2021
1 parent 7de4332 commit b7563d0
Show file tree
Hide file tree
Showing 16 changed files with 514 additions and 52 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
repos:
- repo: git@github.com:Yelp/detect-secrets
rev: v0.13.0
hooks:
Expand Down
4 changes: 2 additions & 2 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"files": "^.secrets.baseline$",
"lines": null
},
"generated_at": "2020-10-05T14:36:27Z",
"generated_at": "2021-04-05T20:11:28Z",
"plugins_used": [
{
"name": "AWSKeyDetector"
Expand Down Expand Up @@ -60,7 +60,7 @@
"hashed_secret": "5d0fa74acf95d1d6bebd0d37f76a94e77d604fd9",
"is_secret": false,
"is_verified": false,
"line_number": 73,
"line_number": 43,
"type": "Basic Auth Credentials"
}
],
Expand Down
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ install:
before_script:
- psql -U postgres -c "create user test with superuser password 'test';"
- psql -U postgres -c "create database automated_test with owner test;"
- psql -U postgres -c "create database dev_models with owner test;"

script:
- tox
Expand Down
55 changes: 43 additions & 12 deletions gdcdatamodel/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import os
import sys

from sqlalchemy.orm.attributes import flag_modified

try:
from functools import lru_cache
except ImportError:
Expand All @@ -38,7 +40,8 @@
qcreport,
released_data,
studyrule,
batch
batch,
versioning,
)

from sqlalchemy import (
Expand Down Expand Up @@ -353,6 +356,8 @@ def NodeFactory(_id, schema, node_cls=Node, package_namespace=None):
name = get_class_name_from_id(_id)
links = get_links(schema)

tag_props = schema.get("tagProperties")

@property
def node_id(self, value):
return self.node_id
Expand All @@ -361,6 +366,22 @@ def node_id(self, value):
def node_id(self, value):
self.node_id = value

@property
def tag_properties(self):
    """Return the ``tagProperties`` entry read from this node type's schema.

    The value is captured once from ``schema.get("tagProperties")`` when the
    node class is built, so every instance of the class returns the same object.
    """
    return tag_props

@property
def is_latest(self):
    """Report whether the system annotations flag this node as the latest version.

    Returns:
        the value stored under the ``latest`` key of ``_sysan``, or ``False``
        when that key is absent
    """
    annotations = self._sysan
    return annotations.get("latest", False)

@property
def version(self):
    """Return the version number recorded in the system annotations.

    Returns:
        the value stored under the ``version`` key of ``_sysan``, or ``None``
        when that key is absent
    """
    annotations = self._sysan
    return annotations.get("version")

@property
def tag(self):
    """Return the tag recorded in the system annotations.

    Returns:
        the value stored under the ``tag`` key of ``_sysan``, or ``None`` when
        that key is absent
    """
    annotations = self._sysan
    return annotations.get("tag")

# Pull the JSONB properties from the `properties` key
attributes = {
key: PropertyFactory(key, schema)
Expand All @@ -369,15 +390,16 @@ def node_id(self, value):
and key not in excluded_props
}

# Store for the programmer
#attributes['_dictionary'] = {
# 'category': schema.get('category'),
# 'title': schema.get('title'),
#}

skipped_dict_vals = [ '$schema', 'systemProperties',
'additionalProperties', 'links', 'properties',
'uniqueKeys', 'id' ]
skipped_dict_vals = [
'$schema',
'systemProperties',
'additionalProperties',
'links',
'properties',
'uniqueKeys',
'id' ,
'tagProperties'
]
attributes['_dictionary'] = {
key: schema[key] for key in schema if key not in skipped_dict_vals
}
Expand All @@ -404,6 +426,12 @@ def node_id(self, value):
related_cases_from_cache
)

if tag_props:
attributes["tag"] = tag
attributes["version"] = version
attributes["tag_properties"] = tag_properties
attributes["is_latest"] = is_latest

# _related_cases_from_parents: get ids of related cases from this
# nodes parents
attributes['_related_cases_from_parents'] = property(
Expand All @@ -424,6 +452,9 @@ def node_id(self, value):
cls_inject_versioned_nodes_lookup(cls)
cls_inject_secondary_keys(cls, schema)

if tag_props:
versioning.inject_set_tag_after_insert(cls)

node_cls.add_subclass(cls)
return cls

Expand Down Expand Up @@ -605,7 +636,7 @@ def parse_edge(src_label,
dst_label = dictionary.schema[dst_label]['id']
edge_name = ''.join(map(get_class_name_from_id, [
src_label, edge_label, dst_label]))

if edge_cls.is_subclass_loaded(name):
return '_{}_out'.format(edge_name)

Expand Down Expand Up @@ -640,7 +671,7 @@ def load_edges(dictionary, node_cls=Node, edge_cls=Edge, package_namespace=None)
for name, link in get_links(subschema).items():
edge_label = link['label']
edge_name = parse_edge(
src_label, name, edge_label, subschema, link,
src_label, name, edge_label, subschema, link,
dictionary=dictionary,
node_cls=node_cls,
edge_cls=edge_cls,
Expand Down
76 changes: 76 additions & 0 deletions gdcdatamodel/models/versioning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
import uuid

from sqlalchemy import and_, event, select

# Seed for the UUID namespace used when hashing tags. It can be overridden
# through the UUID_NAMESPACE_SEED environment variable; the default below is
# used otherwise. Changing the seed changes every tag computed afterwards.
UUID_NAMESPACE_SEED = os.getenv("UUID_NAMESPACE_SEED", "86bb916a-24c5-48e4-8a46-5ea73a379d47")
# Namespace handed to uuid.uuid5 in __generate_hash. Passing version=4 makes
# uuid.UUID overwrite the variant/version bits of the seed value.
UUID_NAMESPACE = uuid.UUID("urn:uuid:{}".format(UUID_NAMESPACE_SEED), version=4)


def __generate_hash(seed, label):
    """Derive a deterministic UUID string from a seed value and a label.

    Args:
        seed: value formatted with ``str.format`` into the hashed name
        label (str): label appended to the seed, separated by a dash
    Returns:
        str: string form of the version 5 UUID computed in ``UUID_NAMESPACE``
    """
    return str(uuid.uuid5(UUID_NAMESPACE, "{}-{}".format(seed, label)))


def compute_tag(node):
    """Compute the tag that identifies all versions of the given node.

    The tag is a hash over the node's tag property values followed by the
    sorted tags of the destination nodes of its outgoing edges. Edges labelled
    ``relates_to`` are ignored, and a destination without a tag has its tag
    computed recursively.

    Args:
        node (models.Node): node instance
    Returns:
        str: computed tag
    """
    own_keys = []
    for prop in node.tag_properties:
        if prop == "node_id":
            own_keys.append(node.node_id)
        else:
            own_keys.append(node.props[prop])

    parent_tags = []
    for edge in node.edges_out:
        if edge.label == "relates_to":
            continue
        parent_tags.append(edge.dst.tag or compute_tag(edge.dst))
    parent_tags.sort()

    return __generate_hash(own_keys + parent_tags, node.label)


def __get_tagged_version(node_id, table, tag, conn):
    """Work out the version number for a freshly inserted node.

    Every other row carrying the same tag is visited: its version feeds the
    running maximum and its ``latest`` flag is reset to False in the database.

    Args:
        node_id (str): current node_id
        table (sqlalchemy.Table): node table instance
        tag (str): currently computed tag
        conn (sqlalchemy.engine.Connection): currently active connection instance
    Returns:
        int: appropriate version number to use, 1 greater than the current max
    """
    same_tag = table.c._sysan["tag"].astext == tag
    other_node = table.c.node_id != node_id
    siblings = select([table]).where(and_(same_tag, other_node))

    highest = 0
    for row in conn.execute(siblings):
        highest = max(row._sysan.get("version", 0), highest)

        # the new node becomes the latest, so demote this older version
        row._sysan["latest"] = False
        demote = (
            table.update()
            .where(table.c.node_id == row.node_id)
            .values(_sysan=row._sysan)
        )
        conn.execute(demote)
    return highest + 1


def inject_set_tag_after_insert(cls):
    """Register an ``after_insert`` listener that tags and versions new nodes.

    The listener runs once a node of the given class has been inserted. It
    computes the node's tag, determines the next version number for that tag,
    marks the node as latest and writes the updated ``_sysan`` back to its row.

    Args:
        cls (class): node class type
    """

    @event.listens_for(cls, "after_insert")
    def set_node_tag(mapper, conn, node):
        node_table = node.__table__
        computed_tag = compute_tag(node)

        next_version = __get_tagged_version(
            node.node_id, node_table, computed_tag, conn
        )

        node._sysan["tag"] = computed_tag
        node._sysan["latest"] = True
        node._sysan["version"] = next_version

        # persist tag, latest flag and version on the inserted row
        persist = (
            node_table.update()
            .where(node_table.c.node_id == node.node_id)
            .values(_sysan=node._sysan)
        )
        conn.execute(persist)
3 changes: 2 additions & 1 deletion psql-users.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
psql -U postgres -c "create user test with superuser password 'test';"
psql -U postgres -c "create database automated_test with owner test;"
psql -U postgres -c "create database automated_test with owner test;"
psql -U postgres -c "create database dev_models with owner test;"
60 changes: 23 additions & 37 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,50 +8,20 @@
import random
import unittest
import uuid
import pkg_resources

import pytest
import yaml
from gdcdatamodel import models
from psqlgraph import PsqlGraphDriver, create_all, Node, Edge
from psqlgraph import PsqlGraphDriver, mocks
from sqlalchemy import create_engine


def create_tables(engine):
"""
create a table
"""
create_all(engine)
models.versioned_nodes.Base.metadata.create_all(engine)
models.submission.Base.metadata.create_all(engine)
models.redaction.Base.metadata.create_all(engine)
models.qcreport.Base.metadata.create_all(engine)
models.misc.Base.metadata.create_all(engine)
from test.helpers import truncate, create_tables
from test.models import BasicDictionary


def truncate(engine):
"""
Remove data from existing tables
"""
conn = engine.connect()
for table in Node.get_subclass_table_names():
if table != Node.__tablename__:
conn.execute('delete from {}'.format(table))
for table in Edge.get_subclass_table_names():
if table != Edge.__tablename__:
conn.execute('delete from {}'.format(table))

# Extend this list as needed
ng_models_metadata = [
models.versioned_nodes.Base.metadata,
models.submission.Base.metadata,
models.redaction.Base.metadata,
models.qcreport.Base.metadata,
models.misc.Base.metadata,
]

for meta in ng_models_metadata:
for table in meta.tables:
conn.execute("DELETE FROM {}".format(table))
conn.close()
models.load_dictionary(BasicDictionary, "basic")
from gdcdatamodel.models import basic # noqa


@pytest.fixture(scope='session')
Expand Down Expand Up @@ -155,3 +125,19 @@ def setUp(self):

def tearDown(self):
truncate(self.g.engine)


@pytest.fixture(scope="module")
def sample_data():
    """Build the sample graph described in ``schema/data/sample.yaml``.

    The YAML resource supplies ``nodes`` and ``edges`` entries, which are
    handed to a ``mocks.GraphFactory`` created for the ``basic`` models and
    ``BasicDictionary``.

    Returns:
        whatever ``GraphFactory.create_from_nodes_and_edges`` produces for the
        sample data
    """
    with pkg_resources.resource_stream(__name__, "schema/data/sample.yaml") as stream:
        graph_spec = yaml.safe_load(stream)

    factory = mocks.GraphFactory(basic, BasicDictionary)
    return factory.create_from_nodes_and_edges(
        nodes=graph_spec["nodes"],
        edges=graph_spec["edges"],
        unique_key="node_id",
        all_props=True,
    )
66 changes: 66 additions & 0 deletions test/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import psqlgraph
from psqlgraph import Node, Edge, create_all, ext

from gdcdatamodel import models


def truncate(engine, namespace=None):
    """Remove data from existing tables, leaving the tables themselves in place.

    Args:
        engine (sqlalchemy.engine.Engine): engine used to open the connection
        namespace (str): optional psqlgraph extension namespace. When given,
            only the node and edge tables registered under that namespace are
            emptied. When omitted, the default graph tables and the non-graph
            model tables are emptied.
    """
    abstract_node = psqlgraph.Node
    abstract_edge = psqlgraph.Edge
    if namespace:
        abstract_node = ext.get_abstract_node(namespace)
        abstract_edge = ext.get_abstract_edge(namespace)

    conn = engine.connect()
    # Close the connection even when a DELETE fails, so that a failing
    # teardown does not leak connections.
    try:
        for table in abstract_node.get_subclass_table_names():
            # the abstract base table is skipped
            if table != abstract_node.__tablename__:
                conn.execute('delete from {}'.format(table))
        for table in abstract_edge.get_subclass_table_names():
            if table != abstract_edge.__tablename__:
                conn.execute('delete from {}'.format(table))

        if not namespace:
            # add ng models only to main graph model
            truncate_ng_tables(conn)
    finally:
        conn.close()


def create_tables(engine, namespace=None):
    """Create the graph tables, plus the non-graph model tables for the main graph.

    Args:
        engine: engine passed through to ``create_all``
        namespace (str): optional psqlgraph extension namespace. When given,
            the ORM base of that namespace is used and the non-graph model
            tables are not created.
    """
    orm_base = ext.get_orm_base(namespace) if namespace else psqlgraph.base.ORMBase
    create_all(engine, orm_base)

    if namespace:
        return

    # add ng models only to main graph
    create_ng_tables(engine)


def create_ng_tables(engine):
    """Create the tables of the non-graph model modules.

    Args:
        engine: engine passed to each ``Base.metadata.create_all`` call
    """
    module_names = (
        "versioned_nodes",
        "submission",
        "redaction",
        "qcreport",
        "misc",
    )
    for module_name in module_names:
        getattr(models, module_name).Base.metadata.create_all(engine)


def truncate_ng_tables(conn):
    """Delete every row from the tables of the non-graph model modules.

    Args:
        conn: open connection on which the ``DELETE`` statements are executed
    """
    # Extend this list as needed
    metadata_objects = [
        models.versioned_nodes.Base.metadata,
        models.submission.Base.metadata,
        models.redaction.Base.metadata,
        models.qcreport.Base.metadata,
        models.misc.Base.metadata,
    ]

    table_names = (
        name for metadata in metadata_objects for name in metadata.tables
    )
    for name in table_names:
        conn.execute("DELETE FROM {}".format(name))
16 changes: 16 additions & 0 deletions test/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pkg_resources
import yaml


def _load(name):
    """Parse a YAML resource that ships alongside this module.

    Args:
        name (str): resource path relative to this module's package
    Returns:
        the object produced by ``yaml.safe_load`` for that resource
    """
    with pkg_resources.resource_stream(__name__, name) as stream:
        parsed = yaml.safe_load(stream)
    return parsed


class Dictionary:
    """Minimal dictionary object exposing a ``schema`` attribute loaded from YAML."""

    def __init__(self, name):
        """Load the YAML resource ``name`` and store the result as ``schema``."""
        loaded = _load(name)
        self.schema = loaded


# Dictionary instance built from the schema/basic.yaml resource at import time.
BasicDictionary = Dictionary("schema/basic.yaml")

0 comments on commit b7563d0

Please sign in to comment.