Skip to content

Commit

Permalink
Enumerate unzipped files (#10842)
Browse files Browse the repository at this point in the history
* Keep all uploaded zip content accessible

iterdir() is platform dependent, that is the order of the returned items
may be different on different platforms. In cases where a zip file
contains multiple base_file candidates it will be overridden by the last
one found (which varies on different platforms).

Also, different files with the same extension (file1.csv, file2.csv) will not
be accessible from file_paths as they get overridden, too.

The fix enumerates all files to make them accessible from file_paths.

* Sorts files during unzip

Ensures that unpacked content is sorted before getting handled

* Resolve minor issues

* Ensure index on extensions found multiple times
  • Loading branch information
ridoo committed Oct 17, 2023
1 parent 47a9aa6 commit 9ca2f13
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 8 deletions.
31 changes: 24 additions & 7 deletions geonode/storage/data_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,17 +205,34 @@ def _unzip(self, zip_name: str) -> Mapping:
at the end the zip is deleted
"""
zip_file = self.file_paths["base_file"]
the_zip = zipfile.ZipFile(zip_file, allowZip64=True)
the_zip.extractall(self.temporary_folder)
with zipfile.ZipFile(zip_file, allowZip64=True) as the_zip:
the_zip.extractall(self.temporary_folder)

available_choices = get_allowed_extensions()
not_main_files = ["xml", "sld", "zip", "kmz"]
base_file_choices = [x for x in available_choices if x not in not_main_files]
for _file in Path(self.temporary_folder).iterdir():
if any([_file.name.endswith(_ext) for _ext in base_file_choices]):
self.file_paths["base_file"] = Path(str(_file))
elif not zipfile.is_zipfile(str(_file)):
sorted_files = sorted(Path(self.temporary_folder).iterdir())
for _file in sorted_files:
if not zipfile.is_zipfile(str(_file)):
if any([_file.name.endswith(_ext) for _ext in base_file_choices]):
self.file_paths["base_file"] = Path(str(_file))
ext = _file.name.split(".")[-1]
self.file_paths[f"{ext}_file"] = Path(str(_file))
if f"{ext}_file" in self.file_paths:
existing = self.file_paths[f"{ext}_file"]
self.file_paths[f"{ext}_file"] = [
Path(str(_file)),
*(existing if isinstance(existing, list) else [existing]),
]
else:
self.file_paths[f"{ext}_file"] = Path(str(_file))

tmp = self.file_paths.copy()
for key, value in self.file_paths.items():
if isinstance(value, list):
for index, file_path in enumerate(value):
n = f"{key}_{index}" if index > 0 else key
tmp[n] = file_path
self.file_paths = tmp

# remiving the zip file
os.remove(zip_name)
Expand Down
16 changes: 15 additions & 1 deletion geonode/storage/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,21 @@ def test_zip_file_should_correctly_recognize_main_extension_with_csv(self):

self.assertIsNotNone(storage_manager.data_retriever.temporary_folder)
_files = storage_manager.get_retrieved_paths()
self.assertTrue("example.csv" in _files.get("base_file"))
# Selected base_file is not defined in case of multiple csv files
self.assertTrue(_files.get("base_file").endswith(".csv"))

def test_zip_file_should_correctly_index_file_extensions(self):
# reinitiate the storage manager with the zip file
storage_manager = self.sut(
remote_files={"base_file": os.path.join(f"{self.project_root}", "tests/data/example.zip")}
)
storage_manager.clone_remote_files()

self.assertIsNotNone(storage_manager.data_retriever.temporary_folder)
_files = storage_manager.get_retrieved_paths()
self.assertIsNotNone(_files.get("csv_file"))
# extensions found more than once get indexed
self.assertIsNotNone(_files.get("csv_file_1"))

@override_settings(
SUPPORTED_DATASET_FILE_TYPES=[
Expand Down
Binary file modified geonode/storage/tests/data/example.zip
Binary file not shown.

0 comments on commit 9ca2f13

Please sign in to comment.