Skip to content

Commit

Permalink
Add basic support for downloading collections (closes #339)
Browse files Browse the repository at this point in the history
This commit implements `hca download-collection`. (See #351, #339.)
This implementation is not complete (see #356).
  • Loading branch information
natanlao committed Jul 16, 2019
1 parent aac72c3 commit 7847a6a
Show file tree
Hide file tree
Showing 3 changed files with 285 additions and 2 deletions.
94 changes: 93 additions & 1 deletion hca/dss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import hashlib
import os
import re
import tempfile
import time
import uuid
from io import open
Expand Down Expand Up @@ -80,7 +81,7 @@ class DSSClient(SwaggerClient):

def __init__(self, *args, **kwargs):
super(DSSClient, self).__init__(*args, **kwargs)
self.commands += [self.upload, self.download, self.download_manifest, self.create_version]
self.commands += [self.upload, self.download, self.download_manifest, self.create_version, self.download_collection]

def create_version(self):
"""
Expand Down Expand Up @@ -655,3 +656,94 @@ def _parse_manifest(cls, manifest):
delimiter = b'\t' if USING_PYTHON2 else '\t'
reader = csv.DictReader(f, delimiter=delimiter, quoting=csv.QUOTE_NONE)
return reader.fieldnames, list(reader)

def _serialize_col_to_manifest(self, uuid, replica, version):
"""
Given a collection UUID, uses GET `/collection/{uuid}` to
serialize the collection into a set of dicts that that can be
used to generate a manifest file.
Most of the heavy lifting is handled by
:meth:`DSSClient.download_manifest`.
:param uuid: uuid of the collection to serialize
:param replica: replica to query against
:param version: version of the specified collection
"""
errors = 0
rows = []
seen = []
col = self.get_collection(uuid=uuid, replica=replica, version=version)['contents']
while col:
obj = col.pop()
if obj['type'] == 'file':
# Currently cannot download files not associated with a
# bundle. This is a limitation of :meth:`download_manifest`
errors += 1
logger.warning("Failed to download file %s version %s",
obj['uuid'], obj['version'])
elif obj['type'] == 'collection':
if (obj['uuid'], obj['version']) in seen:
logger.info("Ignoring already-seen collection %s version %s",
obj['uuid'], obj['version'])
continue
seen.append((obj['uuid'], obj['version']))
col.extend(self.get_collection(uuid=obj['uuid'], replica=replica,
version=obj.get('version', ''))['contents'])
elif obj['type'] == 'bundle':
bundle = self._bundle_download_tasks(bundle_uuid=obj['uuid'],
replica=replica,
version=obj.get('version', ''))
rows.extend(({
'bundle_uuid': obj['uuid'],
'bundle_version': obj.get('version', None),
'file_name': f.name,
'file_sha256': f.sha256,
'file_uuid': f.uuid,
'file_size': f.size,
'file_version': f.version} for f, _ in bundle))
else:
errors += 1
logger.warning("Failed to download file %s version %s",
obj['uuid'], obj['version'])
if errors:
raise RuntimeError("%d download failure(s)..." % errors)
return rows

def download_collection(self, uuid, replica, version=None, download_dir=''):
"""
Download a bundle and save it to the local filesystem as a directory.
:param str uuid: The uuid of the collection to download
:param str replica: the replica to download from. The supported
replicas are: `aws` for Amazon Web Services, and `gcp` for
Google Cloud Platform. [aws, gcp]
:param str version: The version to download, else if not specified,
download the latest. The version is a timestamp of bundle creation
in RFC3339
:param str download_dir: The directory into which to download
Download a bundle and save it to the local filesystem as a directory.
"""
collection = self._serialize_col_to_manifest(uuid, replica, version)
# Explicitly declare mode `w` (default `w+b`) for Python 3 string compat
with tempfile.NamedTemporaryFile(mode='w') as manifest:
tsv = csv.DictWriter(manifest,
fieldnames=('bundle_uuid',
'bundle_version',
'file_name',
'file_sha256',
'file_uuid',
'file_version',
'file_size'),
delimiter=str('\t'), # cast for py2.7 compat
quoting=csv.QUOTE_NONE)
tsv.writeheader()
tsv.writerows(collection)
# Flushing the I/O buffer here is preferable to closing the file
# handle and deleting the temporary file later because within the
# context manager there is a guarantee that the temporary file
# will be deleted when we are done
manifest.flush()
self.download_manifest(manifest=manifest.name, replica=replica,
download_dir=download_dir, layout='bundle')
105 changes: 105 additions & 0 deletions test/integration/dss/test_dss_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@

from __future__ import absolute_import, division, print_function, unicode_literals

import contextlib
import filecmp
import json
import os
import sys
import unittest
import uuid
import tempfile
import shutil

Expand All @@ -16,8 +19,14 @@
import hca
import hca.cli
import hca.dss
import hca.util.exceptions
from hca.util.compat import USING_PYTHON2
from test import CapturingIO, reset_tweak_changes, TEST_DIR

if USING_PYTHON2:
import backports.tempfile
tempfile = backports.tempfile


class TestDssCLI(unittest.TestCase):
def test_post_search_cli(self):
Expand Down Expand Up @@ -102,6 +111,102 @@ def test_version_output(self):
print(stdout.captured())
self.assertTrue(stdout.captured())

@staticmethod
@contextlib.contextmanager
def _put_test_col(args, uuid=str(uuid.uuid4()), replica='aws'):
"""
Implements a context manager that PUTs a collection to the
data store using `hca dss put-collection` then deletes it
when done.
:param list args: arguments to pass to `hca dss put-collection`
:param str uuid: uuid of the collection
:param str replica: replica to use
:rtype: dict
:returns: put-collection response object
"""
base_args = ['dss', 'put-collection', '--replica', replica,
'--uuid', uuid]
with CapturingIO('stdout') as stdout:
hca.cli.main(args=base_args + args)
yield json.loads(stdout.captured())
base_args[1] = 'delete-collection'
with CapturingIO('stdout'):
hca.cli.main(args=base_args)

@staticmethod
@contextlib.contextmanager
def _put_test_bdl(dirpath=os.path.join(TEST_DIR, 'res', 'bundle'),
replica='aws',
staging_bucket='org-humancellatlas-dss-cli-test'):
"""
Implements a context manager that uploads a bundle to the data
store using `hca dss upload` then deletes it when done, if the
user has the requisite permissions.
:param str dirpath: path of the directory to upload (`--src-dir`)
:param str replica: replica to use (`--replica`)
:param str staging_bucket`: passed to `--staging-bucket`
:rtype: dict
:returns: upload response object
"""
put_args = ['dss', 'upload', '--src-dir', dirpath, '--replica',
replica, '--staging-bucket', staging_bucket]
with CapturingIO('stdout') as stdout:
hca.cli.main(args=put_args)
rv = json.loads(stdout.captured())
yield rv
del_args = ['dss', 'delete-bundle', '--uuid', rv['bundle_uuid'],
'--replica', replica, '--reason', 'tear down test bundle']
try:
with CapturingIO('stdout'):
hca.cli.main(args=del_args)
except hca.util.exceptions.SwaggerAPIException as e:
# Deleting bundles is a privilege, not a right
assert e.code == 403

def test_collection_download(self):
"""
Upload a bundle, add it to a collection, and try downloading
that collection.
If we download the lone bundle in the collection that we create,
the same data should be downloaded.
"""
with self._put_test_bdl() as bdl:
col_contents = {
'type': 'bundle',
'uuid': bdl['bundle_uuid'],
'version': bdl['version']
}
put_col_args = [
'--description', '"test collection"',
'--details', '{}',
'--version', bdl['version'],
'--contents', json.dumps(col_contents),
'--name', 'collection_test:%s' % bdl['bundle_uuid']
]
with self._put_test_col(put_col_args) as col, \
tempfile.TemporaryDirectory() as t1, \
tempfile.TemporaryDirectory() as t2:
dl_col_args = ['dss', 'download-collection', '--uuid', col['uuid'],
'--replica', 'aws', '--download-dir', t1]
hca.cli.main(args=dl_col_args)
dl_bdl_args = ['dss', 'download', '--bundle-uuid',
bdl['bundle_uuid'], '--replica', 'aws',
'--download-dir', t2]
hca.cli.main(args=dl_bdl_args)
# Bundle download and collection download share the same backend,
# so shallow check is sufficient.
diff = filecmp.dircmp(t1, t2)
# It would be more concise to say `self.assertFalse(diff.right_only
# or diff.left_only or ...)` but writing it out the long way will
# make troubleshooting a failure easier.
self.assertFalse(diff.right_only)
self.assertFalse(diff.left_only)
self.assertFalse(diff.funny_files)
self.assertFalse(diff.diff_files)


if __name__ == "__main__":
unittest.main()
88 changes: 87 additions & 1 deletion test/unit/test_dss_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@
import tempfile
import threading
import unittest
import uuid

import six
from mock import patch

from hca.util.compat import walk
from hca.util.compat import USING_PYTHON2, walk
from hca.dss import DSSClient

if USING_PYTHON2:
import backports.tempfile as tempfile

logging.basicConfig()


Expand Down Expand Up @@ -492,5 +496,87 @@ def test_bundle_json(self, mock_download_file, mock_get_bundle):
actual_hash = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(actual_hash, expected_hash)

@staticmethod
def _fake_get_collection(collections):
"""Used for mocking :meth:`hca.dss.DSSClient.get_collection`"""
def func(*args, **kwargs):
for collection in collections:
if collection['uuid'] == kwargs['uuid']:
return collection
return func

@staticmethod
def _generate_col_hierarchy(depth, child_uuid=None):
"""
Generate a list of psuedo-collections such that each
collection (except for the first) is a child of its
predecessor.
"""
# If 'child_uuid' is not provided, then it's the first call,
# which means that we generate the parent ID and the child ID
# If 'child_uuid' is provided, then it's not the first call,
# which means that parent_uuid = child_uuid, and we provide a
skel = {'uuid': child_uuid if child_uuid else str(uuid.uuid4()),
'version': '2018-09-17T161441.564206Z', # arbitrary
'description': 'foo',
'details': {},
'name': 'bar',
'contents': [{
'type': 'collection',
'uuid': str(uuid.uuid4()), # overwrite if necessary
'version': '2018-09-17T161441.564206Z'}]} # arbitrary
if depth == 1:
# Base case: we don't care about the new child uuid, leave
# generated uuid in place
return [skel]
child_uuid = str(uuid.uuid4())
skel['contents'][0]['uuid'] = child_uuid
return [skel] + TestDownload._generate_col_hierarchy(depth - 1, child_uuid)

def test_collection_download_self_nested(self):
"""
If a collection contains itself, download should ignore
"extra" requests to download that collection. If this isn't
working, execution will either (1) never terminate or (2)
result in a :exc:`RuntimeError` (see
:meth:`test_collection_download_nested`).
"""
# For what it's worth, I can't find a way to create this in the
# DSS, since I can't create a collection that contains another
# collection that doesn't yet exist. (And there is no way to
# update collections after creation.) That said, this purely
# hypothetical case is handled as it is specified in #339.
test_col = self._generate_col_hierarchy(1)[0]
test_col['contents'][0]['uuid'] = test_col['uuid']
test_col['contents'][0]['version'] = test_col['version']
mock_get_col = self._fake_get_collection([test_col])
with tempfile.TemporaryDirectory() as t:
with patch('hca.dss.DSSClient.get_collection', new=mock_get_col):
self.dss.download_collection(uuid=test_col['uuid'],
replica='aws', download_dir=t)

def test_collection_download_deep(self):
"""Test that we can download nested collections"""
test_cols = self._generate_col_hierarchy(4)
test_cols[-1]['contents'][0] = {
'type': 'file',
'uuid': 'foo',
'version': 'bar'
}
mock_get_col = self._fake_get_collection(test_cols)
with tempfile.TemporaryDirectory() as t:
# Currently, we can't download files not associated with a bundle.
# When that functionality is implemented, we don't need to catch
# this RuntimeError, which is nice. (Implementing this test
# with a simulated bundle download is too much work, and tests
# the same thing - that we can parse nested collections from the
# head and reach the tail.)
with self.assertRaises(RuntimeError) as e:
with patch('hca.dss.DSSClient.get_collection', new=mock_get_col):
self.dss.download_collection(uuid=test_cols[0]['uuid'],
replica='aws', download_dir=t)
self.assertIn("download failure", e.exception.args[0])


if __name__ == "__main__":
unittest.main()

0 comments on commit 7847a6a

Please sign in to comment.