From 6445bd97b36c8a4ced1d1dea44ac10b3a1ccba33 Mon Sep 17 00:00:00 2001
From: Jordan <21129425+ItIsJordan@users.noreply.github.com>
Date: Wed, 14 Feb 2024 13:34:00 +0000
Subject: [PATCH] Refactor data table routes

Refactors the route that serves data tables at /data/ so that loading
the data table (YAML file) is separated from the header query code.

Implements a second route that returns only the table data.
---
 hepdata/modules/records/utils/common.py        | 55 +++++++++++---
 .../records/utils/data_processing_utils.py     | 72 ++++++++++++-------
 hepdata/modules/records/views.py               | 54 +++++++-------
 3 files changed, 118 insertions(+), 63 deletions(-)

diff --git a/hepdata/modules/records/utils/common.py b/hepdata/modules/records/utils/common.py
index be6634a0..47d9448d 100644
--- a/hepdata/modules/records/utils/common.py
+++ b/hepdata/modules/records/utils/common.py
@@ -21,6 +21,8 @@
 # In applying this license, CERN does not
 # waive the privileges and immunities granted to it by virtue of its status
 # as an Intergovernmental Organization or submit itself to any jurisdiction.
+import yaml
+from yaml import CBaseLoader as Loader
 from invenio_db import db
 from invenio_pidstore.errors import PIDDoesNotExistError
 from invenio_pidstore.resolver import Resolver
@@ -30,7 +32,7 @@
 from hepdata.config import HISTFACTORY_FILE_TYPE, SIZE_LOAD_CHECK_THRESHOLD
 from hepdata.ext.opensearch.api import get_record
 
-from hepdata.modules.submission.models import HEPSubmission, License
+from hepdata.modules.submission.models import HEPSubmission, License, DataSubmission, DataResource
 
 FILE_TYPES = {
     "py": "Python",
@@ -252,21 +254,52 @@ def record_exists(*args, **kwargs):
     count = HEPSubmission.query.filter_by(**kwargs).count()
     return count > 0
 
+def load_table_data(recid, version):
+    """
+    Loads a specific data table's YAML file contents.
+
+    :param recid: The data submission recid used for the query
+    :param version: The data version to select
+    :return table_contents: A dict containing the table data
+    """
+
+    datasub_query = DataSubmission.query.filter_by(id=recid, version=version)
+    table_contents = {}
+    if datasub_query.count() > 0:
+        datasub_record = datasub_query.one()
+        data_query = db.session.query(DataResource).filter(
+            DataResource.id == datasub_record.data_file)
+
+        if data_query.count() > 0:
+            data_record = data_query.one()
+            file_location = data_record.file_location
+
+            attempts = 0
+            while True:
+                try:
+                    with open(file_location, 'r') as table_file:
+                        table_contents = yaml.load(table_file, Loader=Loader)
+                except (FileNotFoundError, PermissionError):
+                    pass
+                # allow multiple attempts to read the file in case of temporary
+                # disk problems, but always count the attempt so an empty or
+                # unparsable file cannot cause an infinite loop
+                attempts += 1
+                if table_contents or attempts > 5:
+                    break
+
+    return table_contents
+
 def file_size_check(file_location, load_all):
     """
     Decides if a file breaks the maximum size threshold
     for immediate loading on the records page.
 
-    :param file_location: Location of the data file on disk
-    :param load_all: If the check should be run
-    :return bool: Pass or fail
+    :param file_location: Location of the data file on disk
+    :param load_all: Whether to skip the check (1 loads regardless of size)
+    :return dict: The file size and whether it passed the threshold check
     """
-    size_check = { "status": True, "size": os.path.getsize(file_location) }
-    # We do the check only if told to, otherwise we just pass as true
-    if load_all == 0:
-        size_check["status"] = size_check["size"] <= SIZE_LOAD_CHECK_THRESHOLD
-    return size_check
+    size = os.path.getsize(file_location)
+    # Only enforce the threshold when a full load has not been requested
+    status = True if load_all == 1 else size <= SIZE_LOAD_CHECK_THRESHOLD
+    return {"size": size, "status": status}
 
 def generate_licence_data_by_id(licence_id):
     """
diff --git a/hepdata/modules/records/utils/data_processing_utils.py b/hepdata/modules/records/utils/data_processing_utils.py
index fe8a517d..7875c483 100644
--- a/hepdata/modules/records/utils/data_processing_utils.py
+++ b/hepdata/modules/records/utils/data_processing_utils.py
@@ -199,37 +199,19 @@ def process_dependent_variables(group_count, record, table_contents,
         group_count += 1
 
 
-def generate_table_structure(table_contents):
+def generate_table_data(table_contents):
     """
-    Creates a renderable structure from the table structure we've defined.
+    Creates a renderable data table structure.
 
-    :param table_contents:
-    :return: a dictionary encompassing the qualifiers, headers and values
+    :param table_contents: The loaded table data to process
+    :return: A dictionary containing the table's qualifiers, headers and values
     """
-
-    record = {"name": table_contents["name"], "doi": table_contents["doi"],
-              "location": table_contents["location"],
-              "table_license": table_contents["table_license"],
-              "related_tables" : table_contents["related_tables"],
-              "related_to_this" : table_contents["related_to_this"],
-              "qualifiers": {},
-              "qualifier_order": [], "headers": [],
-              "review": table_contents["review"],
-              "associated_files": table_contents["associated_files"],
-              "keywords": {},
-              "values": [],
-              "size": table_contents["size"]}
-
-    record["description"] = sanitize_html(table_contents["title"])
-
-    # add in keywords
-    if table_contents['keywords'] is not None:
-        for keyword in table_contents['keywords']:
-            if keyword.name not in record['keywords']:
-                record['keywords'][keyword.name] = []
-
-            if keyword.value not in record['keywords'][keyword.name]:
-                record['keywords'][keyword.name].append(keyword.value)
+    record = {
+        "qualifier_order": [],
+        "headers": [],
+        "values": [],
+        "qualifiers": {}
+    }
 
     tmp_values = {}
     x_axes = OrderedDict()
@@ -264,6 +246,42 @@ def generate_table_structure(table_contents):
     return record
 
+
+def generate_table_headers(table_contents):
+    """
+    Prepares the table header data for rendering.
+
+    :param table_contents: The loaded table data to process
+    :return: A dictionary containing the table metadata, description and keywords
+    """
+
+    record = {
+        "name": table_contents["name"],
+        "doi": table_contents["doi"],
+        "location": table_contents["location"],
+        "table_license": table_contents["table_license"],
+        "related_tables": table_contents["related_tables"],
+        "related_to_this": table_contents["related_to_this"],
+        "review": table_contents["review"],
+        "associated_files": table_contents["associated_files"],
+        "keywords": {},
+        "size": table_contents["size"],
+        "size_check": table_contents["size_check"]
+    }
+
+    record["description"] = sanitize_html(table_contents["title"])
+
+    # add in keywords
+    if table_contents['keywords'] is not None:
+        for keyword in table_contents['keywords']:
+            if keyword.name not in record['keywords']:
+                record['keywords'][keyword.name] = []
+
+            if keyword.value not in record['keywords'][keyword.name]:
+                record['keywords'][keyword.name].append(keyword.value)
+
+    return record
+
+
 def str_presenter(dumper, data):
     if "\n" in data:
         return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
diff --git a/hepdata/modules/records/views.py b/hepdata/modules/records/views.py
index 5c6e8edd..5c257f13 100644
--- a/hepdata/modules/records/views.py
+++ b/hepdata/modules/records/views.py
@@ -55,9 +55,9 @@
 from hepdata.modules.submission.models import HEPSubmission, DataSubmission, \
     DataResource, DataReview, Message, Question
 from hepdata.modules.records.utils.common import get_record_by_id, \
-    default_time, IMAGE_TYPES, decode_string, file_size_check, generate_licence_data_by_id
+    default_time, IMAGE_TYPES, decode_string, file_size_check, generate_licence_data_by_id, load_table_data
 from hepdata.modules.records.utils.data_processing_utils import \
-    generate_table_structure, process_ctx
+    generate_table_headers, process_ctx, generate_table_data
 from hepdata.modules.records.utils.submission import create_data_review, \
     get_or_create_hepsubmission
 from hepdata.modules.submission.api import get_latest_hepsubmission
@@ -289,8 +289,23 @@ def get_latest():
     return jsonify(result)
 
 
-@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/<int:load_all>', methods=['GET'])
-def get_table_details(recid, data_recid, version, load_all=1):
+@blueprint.route('/data/tabledata/<int:data_recid>/<int:version>', methods=['GET'])
+def get_table_data(data_recid, version):
+    """
+    Gets only the table data for a specific data recid and version.
+
+    :param data_recid: The data recid used for retrieval
+    :param version: The data version to retrieve
+    :return: JSON containing the table's qualifiers, headers and values
+    """
+    # Load the table's YAML contents and render just the data portion
+    table_contents = load_table_data(data_recid, version)
+    return jsonify(generate_table_data(table_contents))
+
+
+@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/', defaults={'load_all': 1})
+@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/<int:load_all>')
+def get_table_details(recid, data_recid, version, load_all):
     """
     Get the table details of a given datasubmission.
 
@@ -312,28 +327,8 @@ def get_table_details(recid, data_recid, version, load_all=1):
     if data_query.count() > 0:
         data_record = data_query.one()
         file_location = data_record.file_location
-        load_fail = True
 
-        # Perform filesize check, returns the status and size of the file
         size_check = file_size_check(file_location, load_all)
-        if size_check["status"]:
-            attempts = 0
-            while True:
-                try:
-                    with open(file_location, 'r') as table_file:
-                        table_contents = yaml.load(table_file, Loader=Loader)
-                    if table_contents:
-                        load_fail = False
-
-                except (FileNotFoundError, PermissionError) as e:
-                    attempts += 1
-                # allow multiple attempts to read file in case of temporary disk problems
-                if (table_contents and table_contents is not None) or attempts > 5:
-                    break
-        if load_fail:
-            # TODO - Needs to be initialised for later
-            table_contents["dependent_variables"] = []
-            table_contents["independent_variables"] = []
 
         table_contents["name"] = datasub_record.name
         table_contents["title"] = datasub_record.description
@@ -344,6 +339,7 @@ def get_table_details(recid, data_recid, version, load_all=1):
         table_contents["doi"] = datasub_record.doi
         table_contents["location"] = datasub_record.location_in_publication
         table_contents["size"] = size_check["size"]
+        table_contents["size_check"] = size_check["status"]
 
         # we create a map of files mainly to accommodate the use of thumbnails for images where possible.
         tmp_assoc_files = {}
@@ -397,7 +393,15 @@ def get_table_details(recid, data_recid, version, load_all=1):
     # x and y headers (should not require a colspan)
     # values, that also encompass the errors
 
-    return jsonify(generate_table_structure(table_contents))
+    fixed_table = generate_table_headers(table_contents)
+
+    # If the size check passed, or a full load was requested, include the data now
+    if size_check["status"] or load_all == 1:
+        table_data = generate_table_data(load_table_data(data_recid, version))
+        # Combine the header and data dictionaries
+        fixed_table = {**fixed_table, **table_data}
+
+    return jsonify(fixed_table)
 
 
 @blueprint.route('/coordinator/view/<int:recid>', methods=['GET', ])
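
For reviewers: a minimal sketch of how the split routes might be exercised from a client. None of this is part of the patch; the base URL, the numeric identifiers, and the assumption that the blueprint is mounted at the application root (prepend any url_prefix your instance uses) are all placeholders.

    # Hypothetical walkthrough of the two-step loading flow introduced above.
    import requests

    BASE = "http://localhost:5000"          # assumed local dev server
    RECID, DATA_RECID, VERSION = 1, 57, 1   # hypothetical identifiers

    # Step 1: request the table with load_all=0, so file_size_check() only
    # passes files under SIZE_LOAD_CHECK_THRESHOLD; larger tables come back
    # as headers/metadata with size_check == False and no "values" key.
    table = requests.get(f"{BASE}/data/{RECID}/{DATA_RECID}/{VERSION}/0").json()

    # Step 2: if the data was deferred, fetch it from the new tabledata route
    # and merge the two dictionaries, mirroring get_table_details().
    if not table.get("size_check"):
        data = requests.get(f"{BASE}/data/tabledata/{DATA_RECID}/{VERSION}").json()
        table = {**table, **data}

    print(table["name"], len(table.get("values", [])))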