diff --git a/dsaps/cli.py b/dsaps/cli.py index e045a55..ac79117 100644 --- a/dsaps/cli.py +++ b/dsaps/cli.py @@ -14,6 +14,14 @@ logger = structlog.get_logger() +def validate_path(ctx, param, value): + """Validates the formatting of the submitted path.""" + if value[-1] == '/': + return value + else: + raise click.BadParameter('Include / at the end of the path.') + + @click.group(chain=True) @click.option('--url', envvar='DSPACE_URL', required=True,) @click.option('-e', '--email', envvar='TEST_EMAIL', required=True, @@ -51,11 +59,13 @@ def main(ctx, url, email, password): @main.command() @click.option('-m', '--metadata-csv', required=True, - type=click.Path(exists=True), - help='The full path to the CSV file of metadata for the items.') -@click.option('--field-map', required=True, type=click.Path(exists=True), - help='Path to JSON field mapping file') -@click.option('-d', '--directory', required=True, + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help='The path to the CSV file of metadata for the items.') +@click.option('--field-map', required=True, + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help='The path to JSON field mapping file.') +@click.option('-d', '--content-directory', required=True, + type=click.Path(exists=True, dir_okay=True, file_okay=False), help='The full path to the content, either a directory of files ' 'or a URL for the storage location.') @click.option('-t', '--file-type', @@ -67,11 +77,11 @@ def main(ctx, url, email, password): help='The handle of the collection to which items are being ' 'added.', default=None) @click.pass_context -def additems(ctx, metadata_csv, field_map, directory, file_type, ingest_report, - collection_handle): +def additems(ctx, metadata_csv, field_map, content_directory, file_type, + ingest_report, collection_handle): """Adds items to a specified collection from a metadata CSV, a field mapping file, and a directory of files. 
May be run in conjunction with the - newcollection CLI commands.""" + newcollection CLI command.""" client = ctx.obj['client'] start_time = ctx.obj['start_time'] if 'collection_uuid' not in ctx.obj and collection_handle is None: @@ -87,7 +97,7 @@ def additems(ctx, metadata_csv, field_map, directory, file_type, ingest_report, mapping = json.load(jsonfile) collection = Collection.from_csv(metadata, mapping) for item in collection.items: - item.bitstreams_from_directory(directory, file_type) + item.bitstreams_from_directory(content_directory, file_type) collection.uuid = collection_uuid items = collection.post_items(client) if ingest_report: @@ -114,20 +124,38 @@ def newcollection(ctx, community_handle, collection_name): ctx.obj['collection_uuid'] = collection_uuid -# @main.command() -# @click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file', -# help='The path of the CSV file of metadata.') -# @click.option('-o', '--output_path', prompt='Enter the output path', -# default='', help='The path of the output files, include ' -# '/ at the end of the path') -# @click.option('-f', '--file_path', prompt='Enter the path', -# help='The path of the content, a URL or local drive path.' 
-# 'Include / at the end of a local drive path.') -# @click.option('-t', '--file_type', prompt='Enter the file type', -# help='The file type to be uploaded.') -# def reconcile(metadata_csv, file_path, file_type, output_path): -# workflows.reconcile_files_and_metadata(metadata_csv, output_path, -# file_path, file_type) +@main.command() +@click.option('-m', '--metadata-csv', required=True, + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help='The path of the CSV file of metadata.') +@click.option('-o', '--output-directory', + type=click.Path(exists=True, file_okay=False), + default=f'{os.getcwd()}/', callback=validate_path, + help='The path of the output files, include / at the end of the ' + 'path.') +@click.option('-d', '--content-directory', required=True, + help='The full path to the content, either a directory of files ' + 'or a URL for the storage location.') +@click.option('-t', '--file-type', + help='The file type to be uploaded, if limited to one file ' + 'type.', default='*') +def reconcile(metadata_csv, output_directory, content_directory, file_type): + """Runs a reconciliation of the specified files and metadata that produces + reports of files with no metadata, metadata with no files, metadata + matched to files, and an updated version of the metadata CSV with only + the records that have matching files.""" + file_ids = helpers.create_file_list(content_directory, file_type) + metadata_ids = helpers.create_metadata_id_list(metadata_csv) + metadata_matches = helpers.match_metadata_to_files(file_ids, metadata_ids) + file_matches = helpers.match_files_to_metadata(file_ids, metadata_ids) + no_files = set(metadata_ids) - set(metadata_matches) + no_metadata = set(file_ids) - set(file_matches) + helpers.create_csv_from_list(no_metadata, f'{output_directory}no_metadata') + helpers.create_csv_from_list(no_files, f'{output_directory}no_files') + helpers.create_csv_from_list(metadata_matches, + f'{output_directory}metadata_matches') + 
helpers.update_metadata_csv(metadata_csv, output_directory, + metadata_matches) if __name__ == '__main__': diff --git a/dsaps/helpers.py b/dsaps/helpers.py index 15dabc7..c8f4fbd 100644 --- a/dsaps/helpers.py +++ b/dsaps/helpers.py @@ -2,11 +2,6 @@ import glob import os -import structlog - - -logger = structlog.get_logger() - def create_csv_from_list(list_name, output): """Creates CSV file from list content.""" @@ -17,14 +12,11 @@ def create_csv_from_list(list_name, output): writer.writerow([item]) -def create_file_dict(file_path, file_type): - """Creates a dict of file IDs and file paths.""" +def create_file_list(file_path, file_type): + """Creates a list of file names.""" files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True) - file_dict = {} - for file in files: - file_name = os.path.splitext(os.path.basename(file))[0] - file_dict[file_name] = file - return file_dict + file_list = [os.path.basename(file) for file in files] + return file_list def create_ingest_report(items, file_name): @@ -43,37 +35,32 @@ def create_metadata_id_list(metadata_csv): metadata_ids = [] with open(metadata_csv) as csvfile: reader = csv.DictReader(csvfile) - for row in [r for r in reader if r['file_identifier'] != '']: - metadata_ids.append(row['file_identifier']) + metadata_ids = [row['file_identifier'] for row in reader + if row['file_identifier'] != ''] return metadata_ids -def match_files_to_metadata(file_dict, metadata_ids): +def match_files_to_metadata(file_list, metadata_ids): """Creates a list of files matched to metadata records.""" - file_matches = [] - for file_id, v in file_dict.items(): - for metadata_id in [m for m in metadata_ids - if file_id.startswith(m)]: - file_matches.append(file_id) + file_matches = [file_id for metadata_id in metadata_ids + for file_id in file_list + if file_id.startswith(metadata_id)] return file_matches -def match_metadata_to_files(file_dict, metadata_ids): +def match_metadata_to_files(file_list, metadata_ids): """Creates a list of 
metadata records matched to files.""" - metadata_matches = [] - for metadata_id in metadata_ids: - for file_id in [f for f in file_dict - if f.startswith(metadata_id)]: - metadata_matches.append(metadata_id) + metadata_matches = [metadata_id for f in file_list for metadata_id in + metadata_ids if f.startswith(metadata_id)] return metadata_matches -def update_metadata_csv(metadata_csv, output_path, metadata_matches): +def update_metadata_csv(metadata_csv, output_directory, metadata_matches): """Creates an updated CSV of metadata records with matching files.""" with open(metadata_csv) as csvfile: reader = csv.DictReader(csvfile) upd_md_file_name = f'updated-{os.path.basename(metadata_csv)}' - with open(f'{output_path}{upd_md_file_name}', 'w') as updated_csv: + with open(f'{output_directory}{upd_md_file_name}', 'w') as updated_csv: writer = csv.DictWriter(updated_csv, fieldnames=reader.fieldnames) writer.writeheader() for row in reader: diff --git a/dsaps/metadata.py b/dsaps/metadata.py deleted file mode 100644 index dd59fdf..0000000 --- a/dsaps/metadata.py +++ /dev/null @@ -1,78 +0,0 @@ -import csv - - -def create_json_metadata(metadata_csv, multiple_terms): - """Creates JSON metadata from a CSV.""" - with open(metadata_csv) as csvfile: - reader = csv.DictReader(csvfile) - metadata_group = [] - # WE SHOULD DISCUSS HOW TO HANDLE MAPPING DICT CHANGES - mapping_dict = {'file_identifier': ['file_identifier'], - 'dc.title': ['title', 'en_US'], - 'dc.relation.isversionof': ['uri'], - 'dc.contributor.author': ['authors', None, '|'] - } - for row in reader: - metadata_rec = [] - if multiple_terms == 'delimited': - metadata_rec = create_metadata_rec_delim(mapping_dict, row, - metadata_rec) - else: - metadata_rec = create_metadata_rec_num_col(row, metadata_rec) - item = {'metadata': metadata_rec} - metadata_group.append(item) - return metadata_group - - -def create_metadata_rec_delim(mapping_dict, row, metadata_rec): - """Uses a mapping dict to create a metadata record from a 
series of metadata - elements.""" - for k, v in mapping_dict.items(): - if len(v) == 3: - metadata_elems = metadata_elems_from_row(row, k, v[0], v[1], v[2]) - elif len(v) == 2: - metadata_elems = metadata_elems_from_row(row, k, v[0], v[1]) - else: - metadata_elems = metadata_elems_from_row(row, k, v[0]) - for metadata_elem in metadata_elems: - metadata_rec.append(metadata_elem) - return metadata_rec - - -def create_metadata_rec_num_col(row, metadata_rec): - """Uses a CSV that contains DC property column names and numbered columns - for multiple terms to create a metadata record from a series of metadata - elements.""" - for csv_key, csv_value in row.items(): - if csv_value is not None: - if csv_key[-1].isdigit(): - dc_key = csv_key[:-2] - else: - dc_key = csv_key - # THE FIELDS THAT SHOULDN'T RECEIVE A LANG TAG IS ALSO LIKELY - # CHANGE WITH THE MAPPING DICT - if dc_key not in ['dc.contributor.author', 'file_identifier', - 'dc.relation.isversionof', 'dc.date.issued']: - metadata_elems = metadata_elems_from_row(row, dc_key, csv_key, - 'en_US') - else: - metadata_elems = metadata_elems_from_row(row, dc_key, csv_key) - for metadata_elem in metadata_elems: - metadata_rec.append(metadata_elem) - return metadata_rec - - -def metadata_elems_from_row(row, key, field, language=None, delimiter=''): - """Create a metadata element from a CSV row.""" - metadata_elems = [] - if row[field] != '': - if delimiter: - values = row[field].split(delimiter) - else: - values = [row[field]] - for value in values: - metadata_elem = {'key': key, 'language': language, 'value': - value} - metadata_elems.append({k: v for k, v in metadata_elem.items() - if v is not None}) - return metadata_elems diff --git a/tests/conftest.py b/tests/conftest.py index c247b1f..fddeb8a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import pytest import requests_mock -from dsaps import metadata, models +from dsaps import models @pytest.fixture() @@ -41,14 +41,6 @@ def 
aspace_delimited_csv(): yield reader -@pytest.fixture() -def json_metadata_delim(): - json_metadata = metadata.create_json_metadata( - 'tests/fixtures/metadata_delim.csv', 'delimited' - ) - return json_metadata - - @pytest.fixture() def aspace_mapping(): with open('config/aspace_mapping.json') as f: @@ -63,14 +55,6 @@ def standard_mapping(): yield mapping -@pytest.fixture() -def json_metadata_num_col(): - json_metadata = metadata.create_json_metadata( - 'tests/fixtures/metadata_num_col.csv', 'num_columns' - ) - return json_metadata - - @pytest.fixture() def output_dir(tmp_path): output_dir = tmp_path / 'output' @@ -84,7 +68,7 @@ def runner(): @pytest.fixture(autouse=True) -def web_mock(input_dir): +def web_mock(): with requests_mock.Mocker() as m: cookies = {'JSESSIONID': '11111111'} m.post('mock://example.com/login', cookies=cookies) diff --git a/tests/test_cli.py b/tests/test_cli.py index f2fb973..937ba80 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -12,7 +12,7 @@ def test_additems(runner, input_dir): 'tests/fixtures/metadata_delim.csv', '--field-map', 'tests/fixtures/standard_mapping.json', - '--directory', input_dir, + '--content-directory', input_dir, '--file-type', 'pdf', '--collection-handle', '333.3333']) assert result.exit_code == 0 @@ -28,7 +28,7 @@ def test_additems(runner, input_dir): 'tests/fixtures/metadata_delim.csv', '--field-map', 'tests/fixtures/standard_mapping.json', - '--directory', input_dir, + '--content-directory', input_dir, '--file-type', 'pdf']) assert result.exit_code == 0 @@ -45,16 +45,17 @@ def test_newcollection(runner, input_dir): assert result.exit_code == 0 -# def test_reconcile(runner, input_dir, output_dir): -# """Test reconcile command.""" -# result = runner.invoke(main, -# ['--url', 'mock://example.com/', -# '--email', 'test@test.mock', -# '--password', '1234', -# 'reconcile', -# '--metadata_csv', 'tests/fixtures/metadata_delim.csv', -# '--file_path', input_dir, -# '--file_type', 'pdf', -# '--output_path', 
output_dir -# ]) -# assert result.exit_code == 0 +def test_reconcile(runner, input_dir, output_dir): + """Test reconcile command.""" + result = runner.invoke(main, + ['--url', 'mock://example.com/', + '--email', 'test@test.mock', + '--password', '1234', + 'reconcile', + '--metadata-csv', + 'tests/fixtures/metadata_delim.csv', + '--output-directory', output_dir, + '--content-directory', input_dir, + '--file-type', 'pdf' + ]) + assert result.exit_code == 0 diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 867fd38..f00f9f5 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -3,26 +3,6 @@ from dsaps import helpers from dsaps.models import Item -# from dsaps.helpers import files_from_location -# -# -# def test_file_list_from_location_with_file_type(input_dir): -# files = files_from_location(input_dir, 'pdf') -# assert 3 == len(files) -# assert {'name': 'test_01', 'path': f'{input_dir}test_01.pdf'} in files -# assert {'name': 'test_02', -# 'path': f'{input_dir}more_files/test_02.pdf'} in files -# -# -# def test_file_list_from_location_without_file_type(input_dir): -# files = files_from_location(input_dir) -# assert 4 == len(files) -# assert {'name': 'test_01', 'path': f'{input_dir}test_01.pdf'} in files -# assert {'name': 'test_02', -# 'path': f'{input_dir}more_files/test_02.pdf'} in files -# assert {'name': 'test_01', 'path': f'{input_dir}test_01.jpg'} in files -# - def test_create_csv_from_list(output_dir): """Test create_csv_from_list function.""" @@ -34,12 +14,11 @@ def test_create_csv_from_list(output_dir): assert row['id'] == '123' -def test_create_file_dict(input_dir): - """Test create_file_dict function.""" - file_dict = helpers.create_file_dict(input_dir, 'pdf') - assert file_dict['test_02'] == f'{input_dir}more_files/test_02.pdf' - assert file_dict['test_01'] == f'{input_dir}test_01.pdf' - assert file_dict['best_01'] == f'{input_dir}best_01.pdf' +def test_create_file_list(input_dir): + """Test create_file_list function.""" + 
file_list = helpers.create_file_list(input_dir, 'pdf') + for file_id in ['test_02.pdf', 'test_01.pdf', 'best_01.pdf']: + assert file_id in file_list def test_create_ingest_report(runner, output_dir): @@ -67,19 +46,18 @@ def test_create_metadata_id_list(input_dir): def test_match_files_to_metadata(): """Test match_files_to_metadata function.""" - file_dict = {'test_01': 'files/test_01.pdf'} + file_list = ['test_01.pdf'] metadata_ids = ['test', 'tast'] - file_matches = helpers.match_files_to_metadata(file_dict, metadata_ids) + file_matches = helpers.match_files_to_metadata(file_list, metadata_ids) assert len(file_matches) == 1 - assert 'test_01' in file_matches + assert 'test_01.pdf' in file_matches def test_match_metadata_to_files(): """Test match_metadata_to_files function.""" - file_dict = {'test_01': 'files/test_01.pdf', - 'tast_01': 'files/tast_01.pdf'} + file_list = ['test_01.pdf', 'tast_01.pdf'] metadata_ids = ['test'] - file_matches = helpers.match_metadata_to_files(file_dict, metadata_ids) + file_matches = helpers.match_metadata_to_files(file_list, metadata_ids) assert len(file_matches) == 1 assert 'test' in file_matches diff --git a/tests/test_metadata.py b/tests/test_metadata.py deleted file mode 100644 index 96aaefa..0000000 --- a/tests/test_metadata.py +++ /dev/null @@ -1,61 +0,0 @@ -import csv - -from dsaps import metadata - - -def test_create_json_metadata(input_dir, json_metadata_delim): - """Test create_json_metadata function.""" - md_group = metadata.create_json_metadata('tests/fixtures/metadata_delim.csv', - 'delimited') - assert md_group[0]['metadata'] == json_metadata_delim[0]['metadata'] - assert md_group[1]['metadata'] == json_metadata_delim[1]['metadata'] - - -def test_create_metadata_rec_delim(json_metadata_delim): - """Test create_metadata_rec function.""" - mapping_dict = {'file_identifier': ['file_identifier'], - 'dc.title': ['title', 'en_US'], - 'dc.relation.isversionof': ['uri'], - 'dc.contributor.author': ['authors', None, '|']} - with 
open('tests/fixtures/metadata_delim.csv') as csvfile: - reader = csv.DictReader(csvfile) - metadata_rec_1 = metadata.create_metadata_rec_delim(mapping_dict, - next(reader), []) - assert metadata_rec_1 == json_metadata_delim[0]['metadata'] - metadata_rec_2 = metadata.create_metadata_rec_delim(mapping_dict, - next(reader), []) - assert metadata_rec_2 == json_metadata_delim[1]['metadata'] - - -def test_create_metadata_rec_num_col(json_metadata_num_col): - """Test create_metadata_rec_num_col function.""" - with open('tests/fixtures/metadata_num_col.csv') as csvfile: - reader = csv.DictReader(csvfile) - metadata_rec = metadata.create_metadata_rec_num_col(next(reader), []) - assert metadata_rec == json_metadata_num_col[0]['metadata'] - - -def test_metadata_elems_from_row(): - """Test metadata_elems_from_row function.""" - row = {'title': 'Test title'} - metadata_elem = metadata.metadata_elems_from_row(row, 'dc.title', 'title', - 'en_US') - assert metadata_elem[0]['key'] == 'dc.title' - assert metadata_elem[0]['value'] == 'Test title' - assert metadata_elem[0]['language'] == 'en_US' - metadata_elem = metadata.metadata_elems_from_row(row, 'dc.title', 'title') - assert metadata_elem[0]['key'] == 'dc.title' - assert metadata_elem[0]['value'] == 'Test title' - assert 'language' not in metadata_elem[0] - row = {'title': ''} - metadata_elem = metadata.metadata_elems_from_row(row, 'dc.title', 'title') - assert metadata_elem == [] - row = {'title': 'Test title 1|Test title 2'} - metadata_elem = metadata.metadata_elems_from_row(row, 'dc.title', 'title', - 'en_US', '|') - assert metadata_elem[0]['key'] == 'dc.title' - assert metadata_elem[0]['value'] == 'Test title 1' - assert metadata_elem[0]['language'] == 'en_US' - assert metadata_elem[1]['key'] == 'dc.title' - assert metadata_elem[1]['value'] == 'Test title 2' - assert metadata_elem[1]['language'] == 'en_US'