From 5deb09e5b196bb85c3669ece5976ad8fc9b570e9 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Fri, 17 Jan 2020 10:03:04 -0500 Subject: [PATCH 1/3] post items and bitstreams --- dsaps/cli.py | 36 +++++++++++++++++++---- dsaps/models.py | 68 +++++++++++++++++++++++++++++++++++++------- tests/test_models.py | 65 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 146 insertions(+), 23 deletions(-) diff --git a/dsaps/cli.py b/dsaps/cli.py index 2f4a9f5..f2bb859 100644 --- a/dsaps/cli.py +++ b/dsaps/cli.py @@ -1,5 +1,8 @@ import datetime +import glob +import json import logging +import os import time import click @@ -69,15 +72,36 @@ def search(ctx, field, string, search_type): 'collection.') @click.option('-n', '--coll_name', prompt='Enter the name of the collection', help='The name of the collection to be created.') +@click.option('-m', '--metadata', prompt='Enter the path of the metadata file', + help='The path of the JSON file of metadata.') +@click.option('-f', '--file_path', prompt='Enter the path', + help='The path of the content, a URL or local drive path.') +@click.option('-t', '--file_type', prompt='Enter the file type', + help='The file type to be uploaded.') +@click.option('-i', '--ingest_type', prompt='Enter the type of ingest', + help='The type of ingest to perform: local, remote.', + type=click.Choice(['local', 'remote'])) @click.pass_context -def newcoll(ctx, comm_handle, coll_name): +def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type, + ingest_type): client = ctx.obj['client'] + start_time = ctx.obj['start_time'] + coll_metadata = json.load(open(metadata)) coll_id = client.post_coll_to_comm(comm_handle, coll_name) - logger.info(coll_id) - # STEPS TO ADD - # post items to collections - # post bistreams to item_links - # post prov notes + file_dict = {} + if ingest_type == 'local': + files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True) + for file in files: + file_name = os.path.basename(file).replace(f'.{file_type}', '') + file_dict[file_name] = file + elif ingest_type == 'remote': + file_dict = models.build_file_dict_remote(file_path, file_type, + file_dict) + items = client.post_items_to_coll(coll_id, coll_metadata, file_dict, + ingest_type) + for item in items: + logger.info(f'Item posted: {item}') + models.elapsed_time(start_time, 'Total runtime:') if __name__ == '__main__': diff --git a/dsaps/models.py b/dsaps/models.py index d399d56..1ac98da 100644 --- a/dsaps/models.py +++ b/dsaps/models.py @@ -1,6 +1,7 @@ import datetime from functools import partial import operator +import os import requests import time @@ -24,6 +25,7 @@ def __init__(self, url): logger.info('Initializing client') def authenticate(self, email, password): + """Authenticate user to DSpace API.""" header = self.header data = {'email': email, 'password': password} session = requests.post(f'{self.url}/login', headers=header, @@ -54,6 +56,7 @@ def get_record(self, uuid, rec_type): def filtered_item_search(self, key, string, query_type, selected_collections=''): + """Performs a search against the filtered items endpoint.""" offset = 0 items = '' item_links = [] @@ -63,10 +66,9 @@ def filtered_item_search(self, key, string, query_type, 'query_val[]': string, '&collSel[]': selected_collections, 'limit': 200, 'offset': offset} logger.info(params) - print(endpoint) response = requests.get(endpoint, headers=self.header, params=params, cookies=self.cookies) - print(f'Response url: {response.url}') + logger.info(f'Response url: {response.url}') response = response.json() items = response['items'] for item in items: @@ -75,6 +77,7 @@ def filtered_item_search(self, key, string, query_type, return item_links def post_coll_to_comm(self, comm_handle, coll_name): + """Posts a collection to a specified community.""" endpoint = f'{self.url}/handle/{comm_handle}' community = requests.get(endpoint, headers=self.header, cookies=self.cookies).json() @@ -83,7 +86,52 @@ def post_coll_to_comm(self, comm_handle, coll_name): endpoint2 = f'{self.url}/communities/{comm_id}/collections' coll_id = requests.post(endpoint2, headers=self.header, cookies=self.cookies, json=collection).json() - return coll_id['link'] + coll_id = coll_id['uuid'] + logger.info(f'Collection posted: {coll_id}') + return coll_id + + def post_items_to_coll(self, coll_id, coll_metadata, file_dict, + ingest_type): + """Posts items to a specified collection.""" + for item_metadata in coll_metadata: + file_exists = '' + for element in [e for e in item_metadata['metadata'] + if e['key'] == 'file_identifier']: + file_identifier = element['value'] + item_metadata['metadata'].remove(element) + for k in [e for e in file_dict if file_identifier in e]: + file_exists = True + if file_exists is True: + endpoint = f'{self.url}/collections/{coll_id}/items' + item_id = requests.post(endpoint, headers=self.header, + cookies=self.cookies, + json=item_metadata).json() + item_id = item_id['uuid'] + bit_ids = self.post_bitstreams_to_item(item_id, + file_identifier, + file_dict, ingest_type) + for bit_id in bit_ids: + logger.info(f'Bitstream posted: {bit_id}') + yield item_id + + def post_bitstreams_to_item(self, item_id, file_identifier, file_dict, + ingest_type): + """Posts bitstreams to a specified item.""" + for k, v in file_dict.items(): + if k.startswith(file_identifier): + bitstream = file_dict[k] + file_name = os.path.basename(bitstream) + if ingest_type == 'local': + data = open(bitstream, 'rb') + elif ingest_type == 'remote': + data = requests.get(bitstream) + endpoint = (f'{self.url}/items/{item_id}' + + f'/bitstreams?name={file_name}') + header_upload = {'accept': 'application/json'} + bit_id = requests.post(endpoint, headers=header_upload, + cookies=self.cookies, data=data).json() + bit_id = bit_id['uuid'] + yield bit_id def _pop_inst(self, class_type, rec_obj): """Populate class instance with data from record.""" @@ -100,6 +148,7 @@ def _pop_inst(self, class_type, rec_obj): return rec_obj def _build_uuid_list(self, rec_obj, children): + """Builds a list of the uuids for an object's children.""" child_list = [] for child in rec_obj[children]: child_list.append(child['uuid']) @@ -138,15 +187,14 @@ class MetadataEntry(BaseRecord): language = Field() -def build_file_list_remote(directory_url, file_extension): - """Build list of files in local directory.""" - file_list = {} +def build_file_dict_remote(directory_url, file_type, file_dict): + """Build list of files in a remote directory.""" response = requests.get(directory_url) links = html.fromstring(response.content).iterlinks() - for link in links: - if link[2].endswith(file_extension): - file_list[link[2]] = f'{directory_url}{link[2]}' - return file_list + for link in [l for l in links if l[2].endswith(file_type)]: + file_identifier = link[2].replace(f'.{file_type}', '') + file_dict[file_identifier] = f'{directory_url}{link[2]}' + return file_dict def elapsed_time(start_time, label): diff --git a/tests/test_models.py b/tests/test_models.py index ba41103..cd89292 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -14,6 +14,16 @@ def client(): return client +@pytest.fixture +def sample_content(tmp_path): + content = 'test' + dir = tmp_path / 'sub' + dir.mkdir() + sample_content = dir / '123.pdf' + sample_content.write_text(content) + return sample_content + + def test_authenticate(client): """Test authenticate function.""" with requests_mock.Mocker() as m: @@ -55,11 +65,12 @@ def test_filtered_item_search(client): def test_post_coll_to_comm(client): + """Test post_coll_to_comm function.""" with requests_mock.Mocker() as m: comm_handle = '1234' coll_name = 'Test Collection' json_object_1 = {'uuid': 'a1b2'} - json_object_2 = {'link': '5678'} + json_object_2 = {'uuid': '5678'} m.get('mock://example.com/handle/1234', json=json_object_1) m.post('mock://example.com/communities/a1b2/collections', json=json_object_2) @@ -67,6 +78,45 @@ def test_post_coll_to_comm(client): assert coll_id == '5678' +def test_post_items_to_coll(client, sample_content): + """Test post_items_to_coll function.""" + with requests_mock.Mocker() as m: + coll_metadata = [{"metadata": [ + {"key": "file_identifier", + "value": "123"}, + {"key": "dc.title", "value": + "Monitoring Works: Getting Teachers", + "language": "en_US"}]}] + coll_id = '789' + ingest_type = 'local' + file_dict = {'123': sample_content} + json_object_1 = {'uuid': 'a1b2'} + m.post('mock://example.com/collections/789/items', json=json_object_1) + url = 'mock://example.com/items/a1b2/bitstreams?name=123.pdf' + json_object_2 = {'uuid': 'c3d4'} + m.post(url, json=json_object_2) + item_ids = client.post_items_to_coll(coll_id, coll_metadata, file_dict, + ingest_type) + for item_id in item_ids: + assert 'a1b2' == item_id + + +def test_post_bitstreams_to_item(client, sample_content): + """Test post_bitstreams_to_item function.""" + with requests_mock.Mocker() as m: + item_id = 'a1b2' + ingest_type = 'local' + file_identifier = '123' + file_dict = {'123': sample_content} + json_object_1 = {'uuid': 'c3d4'} + url = 'mock://example.com/items/a1b2/bitstreams?name=123.pdf' + m.post(url, json=json_object_1) + bit_ids = client.post_bitstreams_to_item(item_id, file_identifier, + file_dict, ingest_type) + for bit_id in bit_ids: + assert 'c3d4' == bit_id + + def test__pop_inst(client): """Test _pop_inst function.""" class_type = models.Collection @@ -84,8 +134,8 @@ def test__build_uuid_list(client): assert '1234' in child_list -def test_build_file_list_remote(): - """Test build_file_list_remote function.""" +def test_build_file_dict_remote(): + """Test build_file_dict_remote function.""" content = '' content += 'Index of /pdf

Index of /' content += 'pdf

NameLast modified' @@ -93,8 +143,9 @@ def test_build_file_list_remote(): content += '2001-02-16 11:59 107K
' with requests_mock.Mocker() as m: directory_url = 'mock://test.com/pdfs/' - file_extension = 'pdf' + file_type = 'pdf' + file_dict = {} m.get(directory_url, text=content) - file_list = models.build_file_list_remote(directory_url, - file_extension) - assert '999.pdf' in file_list + file_list = models.build_file_dict_remote(directory_url, file_type, + file_dict) + assert '999' in file_list From 64313dffa6035aa49bddc77cb2f6158ebff8a894 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Mon, 3 Feb 2020 13:34:05 -0500 Subject: [PATCH 2/3] Update cli.py --- dsaps/cli.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/dsaps/cli.py b/dsaps/cli.py index f2bb859..988c355 100644 --- a/dsaps/cli.py +++ b/dsaps/cli.py @@ -86,21 +86,22 @@ def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type, ingest_type): client = ctx.obj['client'] start_time = ctx.obj['start_time'] - coll_metadata = json.load(open(metadata)) - coll_id = client.post_coll_to_comm(comm_handle, coll_name) - file_dict = {} - if ingest_type == 'local': - files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True) - for file in files: - file_name = os.path.basename(file).replace(f'.{file_type}', '') - file_dict[file_name] = file - elif ingest_type == 'remote': - file_dict = models.build_file_dict_remote(file_path, file_type, - file_dict) - items = client.post_items_to_coll(coll_id, coll_metadata, file_dict, - ingest_type) - for item in items: - logger.info(f'Item posted: {item}') + with open(metadata, encoding='UTF-8') as fp: + coll_metadata = json.load(fp) + coll_id = client.post_coll_to_comm(comm_handle, coll_name) + file_dict = {} + if ingest_type == 'local': + files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True) + for file in files: + file_name = os.path.splitext(file)[0][file.rindex('/') + 1:] + file_dict[file_name] = file + elif ingest_type == 'remote': + file_dict = models.build_file_dict_remote(file_path, file_type, + file_dict) + items = client.post_items_to_coll(coll_id, coll_metadata, file_dict, + ingest_type) + for item in items: + logger.info(f'Item posted: {item}') models.elapsed_time(start_time, 'Total runtime:') From d9d50e1d55a939e3e8491dbd62e0b121703fcaf2 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Wed, 5 Feb 2020 09:36:50 -0500 Subject: [PATCH 3/3] Update cli.py --- dsaps/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsaps/cli.py b/dsaps/cli.py index 988c355..0e30c12 100644 --- a/dsaps/cli.py +++ b/dsaps/cli.py @@ -93,7 +93,7 @@ def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type, if ingest_type == 'local': files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True) for file in files: - file_name = os.path.splitext(file)[0][file.rindex('/') + 1:] + file_name = os.path.splitext(os.path.basename(file))[0] file_dict[file_name] = file elif ingest_type == 'remote': file_dict = models.build_file_dict_remote(file_path, file_type,