Refactor data table routes
Refactors the /data/ route for retrieving data tables, separating the data table (YAML file) loading from the header query code. Implements a second route that returns only the table data.
ItIsJordan committed Feb 14, 2024
1 parent 882d6a9 commit 6445bd9
Showing 3 changed files with 118 additions and 63 deletions.
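
In terms of URL rules (relative to the records blueprint in views.py; its mount prefix is not shown in this diff), the split leaves two endpoints:

    /data/<recid>/<data_recid>/<version>/<load_all>   ->  table headers, plus the values when the file is small enough or load_all == 1
    /data/tabledata/<data_recid>/<version>            ->  table values only (new)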
55 changes: 44 additions & 11 deletions hepdata/modules/records/utils/common.py
@@ -21,6 +21,8 @@
 # In applying this license, CERN does not
 # waive the privileges and immunities granted to it by virtue of its status
 # as an Intergovernmental Organization or submit itself to any jurisdiction.
+import yaml
+from yaml import CBaseLoader as Loader
 from invenio_db import db
 from invenio_pidstore.errors import PIDDoesNotExistError
 from invenio_pidstore.resolver import Resolver
@@ -30,7 +32,7 @@

 from hepdata.config import HISTFACTORY_FILE_TYPE, SIZE_LOAD_CHECK_THRESHOLD
 from hepdata.ext.opensearch.api import get_record
-from hepdata.modules.submission.models import HEPSubmission, License
+from hepdata.modules.submission.models import HEPSubmission, License, DataSubmission, DataResource
 
 FILE_TYPES = {
     "py": "Python",
@@ -252,21 +254,52 @@ def record_exists(*args, **kwargs):
     count = HEPSubmission.query.filter_by(**kwargs).count()
     return count > 0
 
+def load_table_data(recid, version):
+    """
+    Loads a specific data file's YAML file data.
+    :param recid: The recid used for the query
+    :param version: The data version to select
+    :return table_contents: A dict containing the table data
+    """
+
+    datasub_query = DataSubmission.query.filter_by(id=recid, version=version)
+    table_contents = {}
+    if datasub_query.count() > 0:
+        datasub_record = datasub_query.one()
+        data_query = db.session.query(DataResource).filter(
+            DataResource.id == datasub_record.data_file)
+
+        if data_query.count() > 0:
+            data_record = data_query.one()
+            file_location = data_record.file_location
+
+            attempts = 0
+            while True:
+                try:
+                    with open(file_location, 'r') as table_file:
+                        table_contents = yaml.load(table_file, Loader=Loader)
+                except (FileNotFoundError, PermissionError) as e:
+                    attempts += 1
+                # allow multiple attempts to read file in case of temporary disk problems
+                if (table_contents and table_contents is not None) or attempts > 5:
+                    break
+
+    return table_contents
+
+
 def file_size_check(file_location, load_all):
     """
-        Decides if a file breaks the maximum size threshold
-        for immediate loading on the records page.
+    Decides if a file breaks the maximum size threshold
+    for immediate loading on the records page.
 
-        :param file_location: Location of the data file on disk
-        :param load_all: If the check should be run
-        :return bool: Pass or fail
+    :param file_location: Location of the data file on disk
+    :param load_all: If the check should be run
+    :return bool: Pass or fail
     """
-    size_check = { "status": True, "size": os.path.getsize(file_location) }
-    # We do the check only if told to, otherwise we just pass as true
-    if load_all == 0:
-        size_check["status"] = size_check["size"] <= SIZE_LOAD_CHECK_THRESHOLD
-    return size_check
+    size = os.path.getsize(file_location)
+    status = True if load_all == 1 else size <= SIZE_LOAD_CHECK_THRESHOLD
+    return { "size": size, "status": status}

 def generate_licence_data_by_id(licence_id):
     """
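
A minimal sketch of how the two reworked helpers compose, assuming a Flask application context and an existing DataSubmission row (the recid, version and file path below are placeholders, not values from this commit):

    from hepdata.modules.records.utils.common import file_size_check, load_table_data

    # Placeholder identifiers for an existing data table.
    table_contents = load_table_data(recid=5678, version=1)  # returns {} when no row matches

    # With load_all == 0 the size threshold applies; load_all == 1 always passes
    # but still reports the file size.
    check = file_size_check("/path/to/data1.yaml", load_all=0)
    if check["status"]:
        print("Within threshold: %d bytes" % check["size"])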
72 changes: 45 additions & 27 deletions hepdata/modules/records/utils/data_processing_utils.py
@@ -199,37 +199,19 @@ def process_dependent_variables(group_count, record, table_contents,
         group_count += 1
 
 
-def generate_table_structure(table_contents):
+def generate_table_data(table_contents):
     """
-    Creates a renderable structure from the table structure we've defined.
+    Creates a renderable data table structure.
 
     :param table_contents:
-    :return: a dictionary encompassing the qualifiers, headers and values
+    :return: A dictionary containing the table headers/values
     """
 
-    record = {"name": table_contents["name"], "doi": table_contents["doi"],
-              "location": table_contents["location"],
-              "table_license": table_contents["table_license"],
-              "related_tables" : table_contents["related_tables"],
-              "related_to_this" : table_contents["related_to_this"],
-              "qualifiers": {},
-              "qualifier_order": [], "headers": [],
-              "review": table_contents["review"],
-              "associated_files": table_contents["associated_files"],
-              "keywords": {},
-              "values": [],
-              "size": table_contents["size"]}
-
-    record["description"] = sanitize_html(table_contents["title"])
-
-    # add in keywords
-    if table_contents['keywords'] is not None:
-        for keyword in table_contents['keywords']:
-            if keyword.name not in record['keywords']:
-                record['keywords'][keyword.name] = []
-
-            if keyword.value not in record['keywords'][keyword.name]:
-                record['keywords'][keyword.name].append(keyword.value)
+    record = {
+        "qualifier_order": [],
+        "headers": [],
+        "values": [],
+        "qualifiers": {}
+    }
 
     tmp_values = {}
     x_axes = OrderedDict()
@@ -264,6 +246,42 @@
     return record
 
 
+def generate_table_headers(table_contents):
+    """
+    Prepares the table header data for rendering.
+
+    :param table_contents:
+    :return: A dictionary containing the table header and metadata fields
+    """
+
+    record = {
+        "name": table_contents["name"],
+        "doi": table_contents["doi"],
+        "location": table_contents["location"],
+        "table_license": table_contents["table_license"],
+        "related_tables" : table_contents["related_tables"],
+        "related_to_this" : table_contents["related_to_this"],
+        "review": table_contents["review"],
+        "associated_files": table_contents["associated_files"],
+        "keywords": {},
+        "size": table_contents["size"],
+        "size_check": table_contents["size_check"]
+    }
+
+    record["description"] = sanitize_html(table_contents["title"])
+
+    # add in keywords
+    if table_contents['keywords'] is not None:
+        for keyword in table_contents['keywords']:
+            if keyword.name not in record['keywords']:
+                record['keywords'][keyword.name] = []
+
+            if keyword.value not in record['keywords'][keyword.name]:
+                record['keywords'][keyword.name].append(keyword.value)
+
+    return record
+
+
 def str_presenter(dumper, data):
     if "\n" in data:
         return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
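
Taken together, the old generate_table_structure output can be reproduced by merging the results of the two new helpers, mirroring the merge performed in views.py below; a sketch, assuming table_contents has already been populated as in get_table_details (so that "size_check" is set):

    from hepdata.modules.records.utils.data_processing_utils import (
        generate_table_data, generate_table_headers)

    # Headers/metadata and values are now built separately and combined,
    # equivalent to the old generate_table_structure(table_contents).
    record = {**generate_table_headers(table_contents),
              **generate_table_data(table_contents)}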
54 changes: 29 additions & 25 deletions hepdata/modules/records/views.py
@@ -55,9 +55,9 @@
 from hepdata.modules.submission.models import HEPSubmission, DataSubmission, \
     DataResource, DataReview, Message, Question
 from hepdata.modules.records.utils.common import get_record_by_id, \
-    default_time, IMAGE_TYPES, decode_string, file_size_check, generate_licence_data_by_id
+    default_time, IMAGE_TYPES, decode_string, file_size_check, generate_licence_data_by_id, load_table_data
 from hepdata.modules.records.utils.data_processing_utils import \
-    generate_table_structure, process_ctx
+    generate_table_headers, process_ctx, generate_table_data
 from hepdata.modules.records.utils.submission import create_data_review, \
     get_or_create_hepsubmission
 from hepdata.modules.submission.api import get_latest_hepsubmission
@@ -289,8 +289,23 @@ def get_latest():
     return jsonify(result)
 
 
-@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/<int:load_all>', methods=['GET'])
-def get_table_details(recid, data_recid, version, load_all=1):
+@blueprint.route('/data/tabledata/<int:data_recid>/<int:version>', methods=['GET'])
+def get_table_data(data_recid, version):
+    """
+    Gets only the table data for a specific data recid/version.
+    :param data_recid: The data recid used for retrieval
+    :param version: The data version to retrieve
+    :return: JSON response containing the table data
+    """
+    # Run the function to load table data and return
+    table_contents = load_table_data(data_recid, version)
+    return jsonify(generate_table_data(table_contents))
+
+
+@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/', defaults={'load_all': 1})
+@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/<int:load_all>')
+def get_table_details(recid, data_recid, version, load_all):
     """
     Get the table details of a given datasubmission.
@@ -312,28 +327,8 @@ def get_table_details(recid, data_recid, version, load_all=1):
         if data_query.count() > 0:
             data_record = data_query.one()
             file_location = data_record.file_location
-            load_fail = True
-
             # Perform filesize check, returns the status and size of the file
             size_check = file_size_check(file_location, load_all)
-            if size_check["status"]:
-                attempts = 0
-                while True:
-                    try:
-                        with open(file_location, 'r') as table_file:
-                            table_contents = yaml.load(table_file, Loader=Loader)
-                        if table_contents:
-                            load_fail = False
-
-                    except (FileNotFoundError, PermissionError) as e:
-                        attempts += 1
-                    # allow multiple attempts to read file in case of temporary disk problems
-                    if (table_contents and table_contents is not None) or attempts > 5:
-                        break
-            if load_fail:
-                # TODO - Needs to be initialised for later
-                table_contents["dependent_variables"] = []
-                table_contents["independent_variables"] = []
 
             table_contents["name"] = datasub_record.name
             table_contents["title"] = datasub_record.description
@@ -344,6 +339,7 @@ def get_table_details(recid, data_recid, version, load_all):
table_contents["doi"] = datasub_record.doi
table_contents["location"] = datasub_record.location_in_publication
table_contents["size"] = size_check["size"]
table_contents["size_check"] = size_check["status"]

# we create a map of files mainly to accommodate the use of thumbnails for images where possible.
tmp_assoc_files = {}
Expand Down Expand Up @@ -397,7 +393,15 @@ def get_table_details(recid, data_recid, version, load_all=1):
     # x and y headers (should not require a colspan)
     # values, that also encompass the errors
 
-    return jsonify(generate_table_structure(table_contents))
+    fixed_table = generate_table_headers(table_contents)
+
+    # If the size is below the threshold, we just pass the table contents now
+    if size_check["status"] or load_all == 1:
+        table_data = generate_table_data(load_table_data(data_recid, version))
+        # Combine the dictionaries if required
+        fixed_table = {**fixed_table, **table_data}
+
+    return jsonify(fixed_table)


 @blueprint.route('/coordinator/view/<int:recid>', methods=['GET', ])
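
From a client's point of view, oversized tables can now be fetched in two steps; a hedged sketch using requests, assuming the records blueprint is mounted at /record (the prefix is not part of this diff) and placeholder record IDs:

    import requests

    BASE = "https://www.hepdata.net/record"  # assumed mount point, not shown in this diff

    # Step 1: headers/metadata; load_all=0 lets the size threshold apply,
    # so "values" is omitted for oversized tables.
    meta = requests.get(f"{BASE}/data/1234/5678/1/0").json()

    # Step 2: if the values were withheld, fetch them from the new data-only
    # route and merge, matching the server-side {**fixed_table, **table_data}.
    if "values" not in meta:
        data = requests.get(f"{BASE}/data/tabledata/5678/1").json()
        table = {**meta, **data}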
