Skip to content

Commit

Permalink
Consolidate methods for filtering filepaths by dates
Browse files Browse the repository at this point in the history
  • Loading branch information
jonavellecuerdo committed Apr 16, 2024
1 parent 7896046 commit b567de6
Showing 1 changed file with 20 additions and 32 deletions.
52 changes: 20 additions & 32 deletions harvester/harvest/alma.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""harvester.harvest.alma"""

# ruff: noqa: TRY003, EM101
import datetime
import glob
import logging
import re
Expand Down Expand Up @@ -141,44 +140,33 @@ def _filter_filepaths_by_dates(self, filepaths: list[str]) -> list[str]:
"""Filter list of XML files by date.
Given a list of XML files, the method will retrieve the date (YYYY-MM-DD)
from the filepath and determine whether the date is valid by checking
if the date falls within the specified 'from_date' and/or 'until_date'
arguments passed to the harvester.
from the filepath and check whether the date falls within the specified
'from_date' and/or 'until_date' arguments passed to the harvester.
Note: If neither 'from_date' nor 'until_date' is provided, the original list
of XML files will be returned. For full harvests, 'from_date' is required
and an error is raised (by the full_harvest_get_source_records() method)
if it is not provided or cannot be derived.
Example filepath: alma-2024-03-01-daily-extracted-records-to-index_19.xml
- run_date=2024-03-01
"""
filtered_filepaths = []
for filepath in filepaths:
if _filepath_date := self._get_date_from_filepath(filepath):
filepath_date = convert_to_utc(date_parser(_filepath_date))
if self._validate_filepath_date(filepath_date):
filtered_filepaths.append(filepath)
return filtered_filepaths

def _validate_filepath_date(self, date: datetime.datetime) -> bool:
"""Check if a given date is valid.
if filepath_date_string := self._get_date_from_filepath(filepath):
filepath_date = convert_to_utc(date_parser(filepath_date_string))

# include where filepath date meets harvester from/until date criteria
if (
self.from_datetime_object is None
or filepath_date >= self.from_datetime_object
) and (
self.until_datetime_object is None
or filepath_date < self.until_datetime_object
):
filtered_filepaths.append(filepath)

A date is valid if it meets at least one of the following conditions:
* Date is (a) on or after MITAlmaHarvester.from_date AND (b) before
MITAlmaHarvester.until_date if both 'from_date' and 'until_date' are specified.
* Date is on or after MITAlmaHarvester.from_date if only 'from_date' is specified.
* Date is before MITAlmaHarvester.until_date if only 'until_date' is specified.
"""
if (self.from_datetime_object and self.until_datetime_object) and (
date >= self.from_datetime_object and date < self.until_datetime_object
):
return True
if (self.from_datetime_object and self.until_datetime_object is None) and (
date >= self.from_datetime_object
):
return True
if (self.until_datetime_object and self.from_datetime_object is None) and (
date < self.until_datetime_object
):
return True
return False
return filtered_filepaths

def _list_xml_files(self) -> list[str]:
"""Retrieve list of XML files from S3 or local filesystem."""
Expand Down

0 comments on commit b567de6

Please sign in to comment.