Refactor data table routes
Refactors the /data/ route for retrieving data tables, separating the data table (YAML file) loading from the header query code. Implements a second route that returns only the table data.
ItIsJordan committed Feb 14, 2024
1 parent 882d6a9 commit 6445bd9
Showing 3 changed files with 118 additions and 63 deletions.
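
In terms of URL rules (relative to the records blueprint in views.py; its mount prefix is not shown in this diff), the split leaves two endpoints:

    /data/<recid>/<data_recid>/<version>/<load_all>   ->  table headers, plus the values when the file is small enough or load_all == 1
    /data/tabledata/<data_recid>/<version>            ->  table values only (new)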
55 changes: 44 additions & 11 deletions hepdata/modules/records/utils/common.py
@@ -21,6 +21,8 @@
 # In applying this license, CERN does not
 # waive the privileges and immunities granted to it by virtue of its status
 # as an Intergovernmental Organization or submit itself to any jurisdiction.
+import yaml
+from yaml import CBaseLoader as Loader
 from invenio_db import db
 from invenio_pidstore.errors import PIDDoesNotExistError
 from invenio_pidstore.resolver import Resolver
@@ -30,7 +32,7 @@

 from hepdata.config import HISTFACTORY_FILE_TYPE, SIZE_LOAD_CHECK_THRESHOLD
 from hepdata.ext.opensearch.api import get_record
-from hepdata.modules.submission.models import HEPSubmission, License
+from hepdata.modules.submission.models import HEPSubmission, License, DataSubmission, DataResource
 
 FILE_TYPES = {
     "py": "Python",
@@ -252,21 +254,52 @@ def record_exists(*args, **kwargs):
     count = HEPSubmission.query.filter_by(**kwargs).count()
     return count > 0
 
+def load_table_data(recid, version):
+    """
+    Loads a specific data file's YAML file data.
+    :param recid: The recid used for the query
+    :param version: The data version to select
+    :return table_contents: A dict containing the table data
+    """
+
+    datasub_query = DataSubmission.query.filter_by(id=recid, version=version)
+    table_contents = {}
+    if datasub_query.count() > 0:
+        datasub_record = datasub_query.one()
+        data_query = db.session.query(DataResource).filter(
+            DataResource.id == datasub_record.data_file)
+
+        if data_query.count() > 0:
+            data_record = data_query.one()
+            file_location = data_record.file_location
+
+            attempts = 0
+            while True:
+                try:
+                    with open(file_location, 'r') as table_file:
+                        table_contents = yaml.load(table_file, Loader=Loader)
+                except (FileNotFoundError, PermissionError) as e:
+                    attempts += 1
+                # allow multiple attempts to read file in case of temporary disk problems
+                if (table_contents and table_contents is not None) or attempts > 5:
+                    break
+
+    return table_contents
+
+
 def file_size_check(file_location, load_all):
     """
-        Decides if a file breaks the maximum size threshold
-        for immediate loading on the records page.
+    Decides if a file breaks the maximum size threshold
+    for immediate loading on the records page.
 
-        :param file_location: Location of the data file on disk
-        :param load_all: If the check should be run
-        :return bool: Pass or fail
+    :param file_location: Location of the data file on disk
+    :param load_all: If the check should be run
+    :return bool: Pass or fail
     """
-    size_check = { "status": True, "size": os.path.getsize(file_location) }
-    # We do the check only if told to, otherwise we just pass as true
-    if load_all == 0:
-        size_check["status"] = size_check["size"] <= SIZE_LOAD_CHECK_THRESHOLD
-    return size_check
+    size = os.path.getsize(file_location)
+    status = True if load_all == 1 else size <= SIZE_LOAD_CHECK_THRESHOLD
+    return { "size": size, "status": status}

 def generate_licence_data_by_id(licence_id):
     """
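
A minimal sketch of how the two reworked helpers compose, assuming a Flask application context and an existing DataSubmission row (the recid, version and file path below are placeholders, not values from this commit):

    from hepdata.modules.records.utils.common import file_size_check, load_table_data

    # Placeholder identifiers for an existing data table.
    table_contents = load_table_data(recid=5678, version=1)  # returns {} when no row matches

    # With load_all == 0 the size threshold applies; load_all == 1 always passes
    # but still reports the file size.
    check = file_size_check("/path/to/data1.yaml", load_all=0)
    if check["status"]:
        print("Within threshold: %d bytes" % check["size"])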
72 changes: 45 additions & 27 deletions hepdata/modules/records/utils/data_processing_utils.py
@@ -199,37 +199,19 @@ def process_dependent_variables(group_count, record, table_contents,
         group_count += 1
 
 
-def generate_table_structure(table_contents):
+def generate_table_data(table_contents):
     """
-    Creates a renderable structure from the table structure we've defined.
+    Creates a renderable data table structure.
 
     :param table_contents:
-    :return: a dictionary encompassing the qualifiers, headers and values
+    :return: A dictionary containing the table headers/values
     """
 
-    record = {"name": table_contents["name"], "doi": table_contents["doi"],
-              "location": table_contents["location"],
-              "table_license": table_contents["table_license"],
-              "related_tables" : table_contents["related_tables"],
-              "related_to_this" : table_contents["related_to_this"],
-              "qualifiers": {},
-              "qualifier_order": [], "headers": [],
-              "review": table_contents["review"],
-              "associated_files": table_contents["associated_files"],
-              "keywords": {},
-              "values": [],
-              "size": table_contents["size"]}
-
-    record["description"] = sanitize_html(table_contents["title"])
-
-    # add in keywords
-    if table_contents['keywords'] is not None:
-        for keyword in table_contents['keywords']:
-            if keyword.name not in record['keywords']:
-                record['keywords'][keyword.name] = []
-
-            if keyword.value not in record['keywords'][keyword.name]:
-                record['keywords'][keyword.name].append(keyword.value)
+    record = {
+        "qualifier_order": [],
+        "headers": [],
+        "values": [],
+        "qualifiers": {}
+    }
 
     tmp_values = {}
     x_axes = OrderedDict()
@@ -264,6 +246,42 @@
     return record
 
 
+def generate_table_headers(table_contents):
+    """
+    Prepares the table header data for rendering.
+
+    :param table_contents:
+    :return: A dictionary containing the table header and metadata fields
+    """
+
+    record = {
+        "name": table_contents["name"],
+        "doi": table_contents["doi"],
+        "location": table_contents["location"],
+        "table_license": table_contents["table_license"],
+        "related_tables" : table_contents["related_tables"],
+        "related_to_this" : table_contents["related_to_this"],
+        "review": table_contents["review"],
+        "associated_files": table_contents["associated_files"],
+        "keywords": {},
+        "size": table_contents["size"],
+        "size_check": table_contents["size_check"]
+    }
+
+    record["description"] = sanitize_html(table_contents["title"])
+
+    # add in keywords
+    if table_contents['keywords'] is not None:
+        for keyword in table_contents['keywords']:
+            if keyword.name not in record['keywords']:
+                record['keywords'][keyword.name] = []
+
+            if keyword.value not in record['keywords'][keyword.name]:
+                record['keywords'][keyword.name].append(keyword.value)
+
+    return record
+
+
 def str_presenter(dumper, data):
     if "\n" in data:
         return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
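
Taken together, the old generate_table_structure output can be reproduced by merging the results of the two new helpers, mirroring the merge performed in views.py below; a sketch, assuming table_contents has already been populated as in get_table_details (so that "size_check" is set):

    from hepdata.modules.records.utils.data_processing_utils import (
        generate_table_data, generate_table_headers)

    # Headers/metadata and values are now built separately and combined,
    # equivalent to the old generate_table_structure(table_contents).
    record = {**generate_table_headers(table_contents),
              **generate_table_data(table_contents)}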
54 changes: 29 additions & 25 deletions hepdata/modules/records/views.py
@@ -55,9 +55,9 @@
 from hepdata.modules.submission.models import HEPSubmission, DataSubmission, \
     DataResource, DataReview, Message, Question
 from hepdata.modules.records.utils.common import get_record_by_id, \
-    default_time, IMAGE_TYPES, decode_string, file_size_check, generate_licence_data_by_id
+    default_time, IMAGE_TYPES, decode_string, file_size_check, generate_licence_data_by_id, load_table_data
 from hepdata.modules.records.utils.data_processing_utils import \
-    generate_table_structure, process_ctx
+    generate_table_headers, process_ctx, generate_table_data
 from hepdata.modules.records.utils.submission import create_data_review, \
     get_or_create_hepsubmission
 from hepdata.modules.submission.api import get_latest_hepsubmission
@@ -289,8 +289,23 @@ def get_latest():
     return jsonify(result)
 
 
-@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/<int:load_all>', methods=['GET'])
-def get_table_details(recid, data_recid, version, load_all=1):
+@blueprint.route('/data/tabledata/<int:data_recid>/<int:version>', methods=['GET'])
+def get_table_data(data_recid, version):
+    """
+    Gets only the table data for a specific data recid/version.
+    :param data_recid: The data recid used for retrieval
+    :param version: The data version to retrieve
+    :return: JSON response containing the table data
+    """
+    # Run the function to load table data and return
+    table_contents = load_table_data(data_recid, version)
+    return jsonify(generate_table_data(table_contents))
+
+
+@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/', defaults={'load_all': 1})
+@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/<int:load_all>')
+def get_table_details(recid, data_recid, version, load_all):
     """
     Get the table details of a given datasubmission.
@@ -312,28 +327,8 @@ def get_table_details(recid, data_recid, version, load_all=1):
         if data_query.count() > 0:
             data_record = data_query.one()
             file_location = data_record.file_location
-            load_fail = True
-
             # Perform filesize check, returns the status and size of the file
             size_check = file_size_check(file_location, load_all)
-            if size_check["status"]:
-                attempts = 0
-                while True:
-                    try:
-                        with open(file_location, 'r') as table_file:
-                            table_contents = yaml.load(table_file, Loader=Loader)
-                        if table_contents:
-                            load_fail = False
-
-                    except (FileNotFoundError, PermissionError) as e:
-                        attempts += 1
-                    # allow multiple attempts to read file in case of temporary disk problems
-                    if (table_contents and table_contents is not None) or attempts > 5:
-                        break
-            if load_fail:
-                # TODO - Needs to be initialised for later
-                table_contents["dependent_variables"] = []
-                table_contents["independent_variables"] = []
 
             table_contents["name"] = datasub_record.name
             table_contents["title"] = datasub_record.description
@@ -344,6 +339,7 @@ def get_table_details(recid, data_recid, version, load_all):
table_contents["doi"] = datasub_record.doi
table_contents["location"] = datasub_record.location_in_publication
table_contents["size"] = size_check["size"]
table_contents["size_check"] = size_check["status"]

# we create a map of files mainly to accommodate the use of thumbnails for images where possible.
tmp_assoc_files = {}
Expand Down Expand Up @@ -397,7 +393,15 @@ def get_table_details(recid, data_recid, version, load_all=1):
     # x and y headers (should not require a colspan)
     # values, that also encompass the errors
 
-    return jsonify(generate_table_structure(table_contents))
+    fixed_table = generate_table_headers(table_contents)
+
+    # If the size is below the threshold, we just pass the table contents now
+    if size_check["status"] or load_all == 1:
+        table_data = generate_table_data(load_table_data(data_recid, version))
+        # Combine the dictionaries if required
+        fixed_table = {**fixed_table, **table_data}
+
+    return jsonify(fixed_table)


 @blueprint.route('/coordinator/view/<int:recid>', methods=['GET', ])
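
From a client's point of view, oversized tables can now be fetched in two steps; a hedged sketch using requests, assuming the records blueprint is mounted at /record (the prefix is not part of this diff) and placeholder record IDs:

    import requests

    BASE = "https://www.hepdata.net/record"  # assumed mount point, not shown in this diff

    # Step 1: headers/metadata; load_all=0 lets the size threshold apply,
    # so "values" is omitted for oversized tables.
    meta = requests.get(f"{BASE}/data/1234/5678/1/0").json()

    # Step 2: if the values were withheld, fetch them from the new data-only
    # route and merge, matching the server-side {**fixed_table, **table_data}.
    if "values" not in meta:
        data = requests.get(f"{BASE}/data/tabledata/5678/1").json()
        table = {**meta, **data}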
