Add ObjectStoreDataset, ObjectStoreDatasetManager.
robertbartel committed Mar 16, 2022
1 parent 37c034a commit 08bbfe8
Showing 1 changed file with 369 additions and 0 deletions.
369 changes: 369 additions & 0 deletions python/lib/modeldata/dmod/modeldata/data/object_store_dataset.py
@@ -0,0 +1,369 @@
from .meta_data import DataCategory, DataDomain, DataFormat, TimeRange
from .dataset import Dataset, DatasetManager
from datetime import datetime
from minio import Minio
from minio.api import ObjectWriteResult
from pathlib import Path
from typing import Any, Dict, List, Optional
from uuid import UUID


class ObjectStoreDataset(Dataset):

_OBJECT_NAME_SEPARATOR = "___"
""" Separator for individual parts (e.g., corresponding to directories) of an object name. """

@classmethod
def additional_init_param_deserialized(cls, json_obj: dict) -> Dict[str, Any]:
"""
Deserialize any other params needed for this type's init function, returning in a map for ``kwargs`` use.

The main ::method:`factory_init_from_deserialized_json` class method for the base ::class:`Dataset` type handles
a large amount of the work for deserialization. However, subtypes could have additional params they require
in their ::method:`__init__`. This function should do this deserialization work for any subtype, and return a
deserialized dictionary. The keys should be the names of the relevant ::method:`__init__` parameters.

In the event a type's ::method:`__init__` method takes no additional params beyond the base type, its
implementation of this function should return an empty dictionary.

Any types with an init that does not have one or more of the params of the base type's init should fully
override ::method:`factory_init_from_deserialized_json`.

Parameters
----------
json_obj : dict
    The serialized form of the object that is a subtype of ::class:`Dataset`.

Returns
-------
Dict[str, Any]
    A dictionary of ``kwargs`` for those init params and values beyond what the base type uses.
"""
# TODO: update docstring a bit for this once finalized nothing is needed
return dict()

def __init__(self, name: str, category: DataCategory, data_domain: DataDomain, access_location: str,
uuid: Optional[UUID] = None, manager: Optional[DatasetManager] = None,
manager_uuid: Optional[UUID] = None, is_read_only: bool = True, expires: Optional[datetime] = None,
derived_from: Optional[str] = None, derivations: Optional[List[str]] = None,
created_on: Optional[datetime] = None, last_updated: Optional[datetime] = None):
super(ObjectStoreDataset, self).__init__(name, category, data_domain, access_location, uuid, manager,
manager_uuid, is_read_only, expires, derived_from, derivations,
created_on, last_updated)
# TODO: remove this if there ends up being no use for it

def add_file(self, file: Path, add_relative_to: Optional[Path] = None) -> bool:
"""
Parameters
----------
file
add_relative_to
Returns
-------
bool
Whether the file was added successfully
"""
bucket_root = file.parent if add_relative_to is None else add_relative_to
return self.manager.add_data(dataset_name=self.name, file=file, bucket_root=bucket_root)
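# Hypothetical usage sketch (the ``dataset`` variable and all paths are illustrative only): for a local file
# ``/tmp/dataset_1/subdir_a/file_a_1`` with ``/tmp/dataset_1`` as the dataset root, a call such as
#   dataset.add_file(Path('/tmp/dataset_1/subdir_a/file_a_1'), add_relative_to=Path('/tmp/dataset_1'))
# delegates to the manager's ``add_data`` with ``bucket_root=/tmp/dataset_1``, so the uploaded object's name
# encodes the ``subdir_a`` structure.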

def add_files(self, directory: Path, add_relative_to: Optional[Path] = None) -> bool:
"""
Add all files in the given directory to this existing object store.
Parameters
----------
files : List[Path]
The files to be added
"""
bucket_root = directory if add_relative_to is None else add_relative_to
return self.manager.add_data(dataset_name=self.name, directory=directory, bucket_root=bucket_root)

@property
def files(self) -> List[str]:
"""
List of files in this dataset, relative to dataset root.

Returns
-------
List[str]
    List of files in this dataset, relative to dataset root.
"""
return self.manager.list_files(self.name, bucket_name=self.name)
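# Hypothetical dataset-level view (illustrative names only): after adding the example ``dataset_1`` tree shown
# in the ::method:`_push_file` docstring below, ``dataset.files`` would be expected to yield decoded paths such
# as ``['file_1', 'file_2', 'subdir_a/file_a_1', 'subdir_a/file_a_2', 'subdir_b/file_b_1']``.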


class ObjectStoreDatasetManager(DatasetManager):
"""
Implementation specifically for ::class:`ObjectStore` instances.
"""

_OBJECT_NAME_SEPARATOR = "___"
""" Separator for individual parts (e.g., corresponding to directories) of an object name. """

def __init__(self, minio_host_str: str, access_key: Optional[str] = None, secret_key: Optional[str] = None,
datasets: Optional[Dict[str, Dataset]] = None):
super(ObjectStoreDatasetManager, self).__init__(datasets)
# TODO: add checks to ensure all datasets passed to this type are ObjectStoreDataset
self._minio_host_str = minio_host_str
self._client = Minio(minio_host_str, access_key=access_key, secret_key=secret_key)

def _decode_object_name_to_file_path(self, object_name: str) -> str:
"""
Reverse the object name encoding of nested file names.

This essentially just decodes the encoding performed by ::method:`_push_file`.

Parameters
----------
object_name : str
    The name of the object storing some previously uploaded file data.

Returns
-------
str
    The decoded file name that reflects any subdirectory structure.

See Also
--------
::method:`_push_file`
"""
return "/".join(object_name.split(self._OBJECT_NAME_SEPARATOR))

# TODO: add stuff for threading
def _push_file(self, bucket_name: str, file: Path, bucket_root: Path, do_checks: bool = True) -> ObjectWriteResult:
"""
Push a file to a bucket.

Buckets simulate subdirectories by encoding relative directory structure into object names. This relative
structure is based on a bucket "root" directory, corresponding to the directory used to create this dataset.
E.g., perhaps there exists a ``dataset_1/`` directory, which needs to be uploaded to a dataset, with
structure:

    dataset_1/
    dataset_1/file_1
    dataset_1/file_2
    dataset_1/subdir_a/
    dataset_1/subdir_a/file_a_1
    dataset_1/subdir_a/file_a_2
    dataset_1/subdir_b/
    dataset_1/subdir_b/file_b_1

Assume ``dataset_1/`` is the "root", and there already exists a ``file_1`` object in the bucket for
``dataset_1/file_1``. When it is time for this method to push ``dataset_1/subdir_a/file_a_1``, the file is
associated with an object name that encodes the subdirectory structure: ``subdir_a___file_a_1``. The separator
comes from the class variable ::attribute:`_OBJECT_NAME_SEPARATOR`.

Parameters
----------
bucket_name : str
    The name of the existing bucket to push to.
file : Path
    The path to a non-directory file to push to the bucket.
bucket_root : Path
    The directory level that corresponds to the bucket's root level, for object naming purposes.
do_checks : bool
    Whether to do sanity checks on the local file and bucket root (``True`` by default).

Returns
-------
ObjectWriteResult
"""
if do_checks:
if not file.exists():
raise RuntimeError("Cannot push non-existing file {} to bucket {}".format(str(file), bucket_name))
elif not file.is_file():
raise RuntimeError("Cannot push non-regular file {} to bucket {}".format(str(file), bucket_name))
elif not file.is_relative_to(bucket_root):
msg = "Cannot push {} to bucket {} when provided bad or non-relative bucket root {}"
raise RuntimeError(msg.format(str(file), bucket_name, str(bucket_root)))
file_rel_root = file.relative_to(bucket_root)
object_name = self._OBJECT_NAME_SEPARATOR.join(file_rel_root.parts)
return self._client.fput_object(bucket_name=bucket_name, object_name=object_name, file_path=str(file))

# TODO: might need to add the threading stuff in this function when ready to add it
def _push_files(self, bucket_name: str, dir_path: Path, recursive: bool = True, bucket_root: Optional[Path] = None,
do_checks: bool = True):
"""
Push the file contents of the given directory to the provided bucket.

Parameters
----------
bucket_name : str
    The name of the existing bucket to push to.
dir_path : Path
    The path of the existing directory containing files to push to the bucket.
recursive : bool
    Whether contents of nested directories, their inner directory contents, etc., should all be pushed, as
    opposed to only regular files that are immediate children of ``dir_path`` (``True`` by default).
bucket_root : Optional[Path]
    The directory level that corresponds to the bucket's root level, for object naming purposes.
do_checks : bool
    Whether to do sanity checks on the local files and bucket root (``True`` by default).
"""
if do_checks:
if not dir_path.exists():
msg = "Cannot push files from non-existing directory {} to bucket {}"
raise RuntimeError(msg.format(str(dir_path), bucket_name))
elif not dir_path.is_dir():
msg = "Cannot push directory contents from non-directory {} to bucket {}"
raise RuntimeError(msg.format(str(dir_path), bucket_name))
if bucket_root is None:
bucket_root = dir_path
# First take care of the immediate (non-directory) child files
for file in [f for f in dir_path.iterdir() if f.is_file()]:
self._push_file(bucket_name, file, bucket_root, do_checks=False)
if recursive:
for directory in [d for d in dir_path.iterdir() if d.is_dir()]:
self._push_files(bucket_name, directory, recursive, bucket_root, do_checks=False)
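# Illustration (not executed): pushing the example ``dataset_1`` tree from the ::method:`_push_file` docstring
# with ``dataset_1`` as the bucket root produces objects named
#   file_1, file_2, subdir_a___file_a_1, subdir_a___file_a_2, subdir_b___file_b_1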

def add_data(self, dataset_name: str, **kwargs) -> bool:
"""
Add data in some format to the dataset.

Implementations are responsible for managing (via their use of ``kwargs``) the way data is accepted, what
kinds of data can be accepted, and whether the data that has been passed is supported.

Parameters
----------
dataset_name : str
    The dataset to which to add data.
kwargs
    Implementation-specific params for representing the data and details of how it should be added.

Keyword Args
------------
file : Path
    When present, path to a file to be added (either ``file`` or ``directory`` must be present).
directory : Path
    When present, path to a directory of files to be added.
bucket_root : Path
    The directory level that corresponds to the bucket's root level, for object naming purposes (defaults to
    the parent of ``file`` if used, or to ``directory`` itself if used).

Returns
-------
bool
    Whether the data was added successfully.
"""
if dataset_name not in self.datasets:
return False
elif 'file' in kwargs:
bucket_root = kwargs['bucket_root'] if 'bucket_root' in kwargs else kwargs['file'].parent
result = self._push_file(bucket_name=dataset_name, file=kwargs['file'], bucket_root=bucket_root)
# TODO: test
return isinstance(result.object_name, str)
elif 'directory' in kwargs:
bucket_root = kwargs['bucket_root'] if 'bucket_root' in kwargs else kwargs['directory']
self._push_files(bucket_name=dataset_name, dir_path=kwargs['directory'], bucket_root=bucket_root)
# TODO: probably need something better than just always returning True if this gets executed
return True
else:
return False
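# Hypothetical usage sketch (names and paths are illustrative only); both forms assume a dataset called
# ``dataset_1`` is already known to this manager:
#   manager.add_data(dataset_name='dataset_1', file=Path('/tmp/dataset_1/file_1'),
#                    bucket_root=Path('/tmp/dataset_1'))
#   manager.add_data(dataset_name='dataset_1', directory=Path('/tmp/dataset_1'))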

def create(self, name: str, category: DataCategory, data_format: DataFormat, is_read_only: bool,
files_dir: Optional[Path] = None, time_range: Optional[TimeRange] = None,
expect_bucket_exists: bool = False, recurse_dir: bool = True, **kwargs) -> ObjectStoreDataset:
"""
Create a new ::class:`ObjectStoreDataset` instance.

Parameters
----------
name : str
    The name for the new dataset.
category : DataCategory
    The data category for the new dataset.
data_format : DataFormat
    The data format for the new dataset.
is_read_only : bool
    Whether the new dataset is read-only.
files_dir : Optional[Path]
    Optional path to a directory containing initial data for the dataset (essential for read-only datasets).
time_range : Optional[TimeRange]
    Optional time range over which the created dataset has data.
expect_bucket_exists : bool
    Whether it is expected that the associated object store bucket already exists (default: ``False``).
recurse_dir : bool
    Whether subdirectories under ``files_dir`` and their contents should also be added as initial data.
kwargs
    Implementation-specific args.

Returns
-------
ObjectStoreDataset
    A newly created dataset instance ready for use.
"""
if name in self.datasets:
raise RuntimeError("Cannot create new dataset with name {}: name already in use".format(name))
does_bucket_exist = self._client.bucket_exists(name)
if not expect_bucket_exists and does_bucket_exist:
raise RuntimeError("Unexpected existing bucket when creating dataset {}".format(name))
elif expect_bucket_exists and not does_bucket_exist:
raise RuntimeError("Expected bucket to exist when creating dataset {}".format(name))
elif not expect_bucket_exists:
self._client.make_bucket(name)

if files_dir is not None:
if not files_dir.is_dir():
raise RuntimeError("Invalid non-directory path {} passed for dataset files directory".format(files_dir))
else:
self._push_files(bucket_name=name, dir_path=files_dir, recursive=recurse_dir)

if is_read_only and len(list(self._client.list_objects(name, recursive=True))) == 0:
msg = "Attempting to create read-only dataset {} without supplying it with any initial data"
raise RuntimeError(msg.format(name))

created_on = datetime.now()
return ObjectStoreDataset(name=name, category=category, data_format=data_format, manager=self,
access_location=self._minio_host_str, is_read_only=is_read_only,
created_on=created_on, last_updated=created_on)
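# Hypothetical usage sketch (the ``manager`` variable, names, paths, and the specific DataCategory/DataFormat
# members are assumptions for illustration, not taken from this commit):
#   dataset = manager.create(name='dataset_1', category=DataCategory.FORCING, data_format=DataFormat.AORC_CSV,
#                            is_read_only=False, files_dir=Path('/tmp/dataset_1'))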

def list_buckets(self) -> List[str]:
"""
List currently existing object store buckets.

Returns
-------
List[str]
    A list of the names of currently existing object store buckets.
"""
return [bucket.name for bucket in self._client.list_buckets()]

def list_files(self, dataset_name: str, **kwargs) -> List[str]:
"""
List the files in the dataset of the provided name, with names decoded relative to dataset "root".

Parameters
----------
dataset_name : str
    The dataset name.

Returns
-------
List[str]
    List of files in dataset of the provided name, relative to dataset root.

See Also
--------
::method:`_push_file`
::method:`_decode_object_name_to_file_path`
"""
if dataset_name not in self.datasets:
raise RuntimeError("Unrecognized dataset name {} given to request to list files".format(dataset_name))
objects = self._client.list_objects(dataset_name, recursive=True)
return [self._decode_object_name_to_file_path(object_name=obj.object_name) for obj in objects]

def get_bucket_creation_times(self) -> Dict[str, datetime]:
"""
Get a dictionary of the creation times for existing buckets, keyed by bucket name.

Returns
-------
Dict[str, datetime]
    A dictionary of the creation times for existing buckets, keyed by bucket name.
"""
values = dict()
for bucket in self._client.list_buckets():
values[bucket.name] = bucket.creation_date
return values
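

# Minimal, self-contained connectivity sketch. Everything below is illustrative: the host, the credentials,
# and the expectation that a MinIO instance is reachable are assumptions, not part of this commit's tested
# usage.
if __name__ == "__main__":
    # Connect to a (hypothetical) MinIO endpoint with placeholder credentials.
    example_manager = ObjectStoreDatasetManager("minio.example.com:9000",
                                                access_key="example-access-key",
                                                secret_key="example-secret-key")
    # List whatever buckets already exist and when each was created.
    for bucket in example_manager.list_buckets():
        print(bucket)
    for bucket, created in example_manager.get_bucket_creation_times().items():
        print("{} created {}".format(bucket, created))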
