Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ local/*
!data/.keep
.profile
*.csv
!tests/fixtures/*.csv
*.json
!config/*.json
!tests/fixtures/*.json
createItemMetadataFromCSV_*
*.txt
243 changes: 122 additions & 121 deletions Pipfile.lock

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions config/aspace_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"file_identifier": {
"csv_field_name": "file_identifier",
"language": null,
"delimiter": ""
},
"dc.title": {
"csv_field_name": "title",
"language": "en_US",
"delimiter": ""
},
"dc.relation.isversionof": {
"csv_field_name": "uri",
"language": null,
"delimiter": ""
}
}
172 changes: 79 additions & 93 deletions dsaps/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import csv
import datetime
import glob
import json
import logging
import os
Expand All @@ -9,18 +8,18 @@
import click
import structlog

from dsaps import models, workflows
from dsaps.models import Client, Collection
from dsaps import helpers

logger = structlog.get_logger()


@click.group()
@click.option('--url', envvar='DSPACE_URL')
@click.option('-e', '--email', prompt='Enter email',
@click.group(chain=True)
@click.option('--url', envvar='DSPACE_URL', required=True,)
@click.option('-e', '--email', envvar='TEST_EMAIL', required=True,
help='The email of the user for authentication.')
@click.option('-p', '--password', prompt='Enter password',
envvar='TEST_PASS', hide_input=True,
help='The password for authentication.')
@click.option('-p', '--password', envvar='TEST_PASS', required=True,
hide_input=True, help='The password for authentication.')
@click.pass_context
def main(ctx, url, email, password):
ctx.obj = {}
Expand All @@ -42,106 +41,93 @@ def main(ctx, url, email, password):
'w')],
level=logging.INFO)
logger.info('Application start')
client = models.Client(url)
client = Client(url)
client.authenticate(email, password)
start_time = time.time()
ctx.obj['client'] = client
ctx.obj['start_time'] = start_time
ctx.obj['log_suffix'] = log_suffix


@main.command()
@click.option('-c', '--comm_handle', prompt='Enter the community handle',
help='The handle of the community in which to create the ,'
'collection.')
@click.option('-n', '--coll_name', prompt='Enter the name of the collection',
help='The name of the collection to be created.')
@click.option('-m', '--metadata', prompt='Enter the path of the metadata file',
help='The path of the JSON file of metadata.')
@click.option('-f', '--file_path', prompt='Enter the path',
help='The path of the content, a URL or local drive path.')
@click.option('-t', '--file_type', prompt='Enter the file type',
help='The file type to be uploaded.')
@click.option('-i', '--ingest_type', prompt='Enter the type of ingest',
help='The type of ingest to perform: local, remote.',
type=click.Choice(['local', 'remote']))
@click.option('-m', '--metadata-csv', required=True,
type=click.Path(exists=True),
help='The full path to the CSV file of metadata for the items.')
@click.option('--field-map', required=True, type=click.Path(exists=True),
help='Path to JSON field mapping file')
@click.option('-d', '--directory', required=True,
help='The full path to the content, either a directory of files '
'or a URL for the storage location.')
@click.option('-t', '--file-type',
help='The file type to be uploaded, if limited to one file '
'type.', default='*')
@click.option('-r', '--ingest-report', is_flag=True,
help='Create ingest report for updating other systems.')
@click.option('-c', '--collection-handle',
help='The handle of the collection to which items are being '
'added.', default=None)
@click.pass_context
def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type,
ingest_type):
def additems(ctx, metadata_csv, field_map, directory, file_type, ingest_report,
collection_handle):
"""Adds items to a specified collection from a metadata CSV, a field
mapping file, and a directory of files. May be run in conjunction with the
newcollection CLI commands."""
client = ctx.obj['client']
start_time = ctx.obj['start_time']
with open(metadata, encoding='UTF-8') as fp:
coll_metadata = json.load(fp)
coll_id = client.post_coll_to_comm(comm_handle, coll_name)
file_dict = {}
if ingest_type == 'local':
files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
for file in files:
file_name = os.path.splitext(os.path.basename(file))[0]
file_dict[file_name] = file
elif ingest_type == 'remote':
file_dict = models.build_file_dict_remote(file_path, file_type,
file_dict)
items = client.post_items_to_coll(coll_id, coll_metadata, file_dict,
ingest_type)
for item in items:
logger.info(f'Item posted: {item}')
models.elapsed_time(start_time, 'Total runtime:')
if 'collection_uuid' not in ctx.obj and collection_handle is None:
raise click.UsageError('collection_handle option must be used or '
'additems must be run after newcollection '
'command.')
elif 'collection_uuid' in ctx.obj:
collection_uuid = ctx.obj['collection_uuid']
else:
collection_uuid = client.get_uuid_from_handle(collection_handle)
with open(metadata_csv, 'r') as csvfile, open(field_map, 'r') as jsonfile:
metadata = csv.DictReader(csvfile)
mapping = json.load(jsonfile)
collection = Collection.from_csv(metadata, mapping)
for item in collection.items:
item.bitstreams_from_directory(directory, file_type)
collection.uuid = collection_uuid
items = collection.post_items(client)
if ingest_report:
report_name = metadata_csv.replace('.csv', '-ingest.csv')
helpers.create_ingest_report(items, report_name)
elapsed_time = datetime.timedelta(seconds=time.time() - start_time)
logger.info(f'Total runtime : {elapsed_time}')


@main.command()
@click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
help='The path of the CSV file of metadata.')
@click.option('-o', '--output_path', prompt='Enter the output path',
default='', help='The path of the output files, include '
'/ at the end of the path')
@click.option('-f', '--file_path', prompt='Enter the path',
help='The path of the content, a URL or local drive path.'
'Include / at the end of a local drive path.')
@click.option('-t', '--file_type', prompt='Enter the file type',
help='The file type to be uploaded.')
def reconcile(metadata_csv, file_path, file_type, output_path):
workflows.reconcile_files_and_metadata(metadata_csv, output_path,
file_path, file_type)
@click.option('-c', '--community-handle', required=True,
help='The handle of the community in which to create the ,'
'collection.')
@click.option('-n', '--collection-name', required=True,
help='The name of the collection to be created.')
@click.pass_context
def newcollection(ctx, community_handle, collection_name):
"""Posts a new collection to a specified community. Used in conjunction
with the additems CLI command to populate the new collection with
items."""
client = ctx.obj['client']
collection_uuid = client.post_coll_to_comm(community_handle,
collection_name)
ctx.obj['collection_uuid'] = collection_uuid


@main.command()
@click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
help='The path of the CSV file of metadata.')
def metadatajson(metadata_csv):
with open(metadata_csv) as csvfile:
reader = csv.DictReader(csvfile)
metadata_group = []
mapping_dict = {'fileIdentifier': ['file_identifier'],
'dc.contributor.author': ['author name - direct'],
'dc.contributor.advisor': ['supervisor(s)'],
'dc.date.issued': ['pub date'],
'dc.description.abstract': ['Abstract', 'en_US'],
'dc.title': ['Title', 'en_US'],
'dc.relation.ispartofseries': ['file_identifier']}
for row in reader:
metadata_rec = []
metadata_rec = models.create_metadata_rec(mapping_dict, row,
metadata_rec)
metadata_rec.append({'key': 'dc.format.mimetype', 'language':
'en_US', 'value': 'application/pdf'})
metadata_rec.append({'key': 'dc.language.iso', 'language':
'en_US', 'value': 'en_US'})
metadata_rec.append({'key': 'dc.publisher', 'language': 'en_US',
'value': 'Massachusetts Institute of '
'Technology. Laboratory for Computer'
'Science'})
metadata_rec.append({'key': 'dc.rights', 'language': 'en_US',
'value': 'Educational use permitted'})
metadata_rec.append({'key': 'dc.rights.uri', 'language': 'en_US',
'value': 'http://rightsstatements.org/vocab/'
'InC-EDU/1.0/'})
metadata_rec.append({'key': 'dc.type', 'language': 'en_US',
'value': 'Technical Report'})
item = {'metadata': metadata_rec}
metadata_group.append(item)
file_name = os.path.splitext(os.path.basename(metadata_csv))[0]
with open(f'{file_name}.json', 'w') as f:
json.dump(metadata_group, f)
# @main.command()
# @click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
# help='The path of the CSV file of metadata.')
# @click.option('-o', '--output_path', prompt='Enter the output path',
# default='', help='The path of the output files, include '
# '/ at the end of the path')
# @click.option('-f', '--file_path', prompt='Enter the path',
# help='The path of the content, a URL or local drive path.'
# 'Include / at the end of a local drive path.')
# @click.option('-t', '--file_type', prompt='Enter the file type',
# help='The file type to be uploaded.')
# def reconcile(metadata_csv, file_path, file_type, output_path):
# workflows.reconcile_files_and_metadata(metadata_csv, output_path,
# file_path, file_type)


if __name__ == '__main__':
Expand Down
60 changes: 31 additions & 29 deletions dsaps/workflows.py → dsaps/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,49 @@
import glob
import os

from dsaps import models
import structlog


logger = structlog.get_logger()


def create_csv_from_list(list_name, output):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's go ahead and delete all the helper functions that aren't being used anymore. We always have old code to go back and look at if we need the logic.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's left are functions related to the reconcile command which I will clean up in the next PR as I refactor that process

"""Creates CSV file from list content."""
with open(f'{output}.csv', 'w') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['id'])
for item in list_name:
writer.writerow([item])


def create_file_dict(file_path, file_type):
"""Creates a dict of file IDs and file paths."""
if file_path.startswith('http'):
file_dict = models.build_file_dict_remote(file_path, file_type, {})
else:
files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
file_dict = {}
for file in files:
file_name = os.path.splitext(os.path.basename(file))[0]
file_dict[file_name] = file
files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
file_dict = {}
for file in files:
file_name = os.path.splitext(os.path.basename(file))[0]
file_dict[file_name] = file
return file_dict


def create_ingest_report(items, file_name):
"""Creates ingest report of other systems' identifiers with a newly created
DSpace handle."""
with open(f'{file_name}', 'w') as writecsv:
writer = csv.writer(writecsv)
writer.writerow(['uri', 'link'])
for item in items:
writer.writerow([item.source_system_identifier]
+ [f'https://hdl.handle.net/{item.handle}'])


def create_metadata_id_list(metadata_csv):
"""Creates a list of IDs from a metadata CSV"""
metadata_ids = []
with open(metadata_csv) as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
value = row['file_identifier']
metadata_ids.append(value)
for row in [r for r in reader if r['file_identifier'] != '']:
metadata_ids.append(row['file_identifier'])
return metadata_ids


Expand All @@ -49,23 +68,6 @@ def match_metadata_to_files(file_dict, metadata_ids):
return metadata_matches


def reconcile_files_and_metadata(metadata_csv, output_path, file_path,
file_type):
"""Runs a reconciliation of files and metadata."""
file_dict = create_file_dict(file_path, file_type)
file_ids = file_dict.keys()
metadata_ids = create_metadata_id_list(metadata_csv)
metadata_matches = match_metadata_to_files(file_dict, metadata_ids)
file_matches = match_files_to_metadata(file_dict, metadata_ids)
no_files = set(metadata_ids) - set(metadata_matches)
no_metadata = set(file_ids) - set(file_matches)
models.create_csv_from_list(no_metadata, f'{output_path}no_metadata')
models.create_csv_from_list(no_files, f'{output_path}no_files')
models.create_csv_from_list(metadata_matches,
f'{output_path}metadata_matches')
update_metadata_csv(metadata_csv, output_path, metadata_matches)


def update_metadata_csv(metadata_csv, output_path, metadata_matches):
"""Creates an updated CSV of metadata records with matching files."""
with open(metadata_csv) as csvfile:
Expand Down
Loading