-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
root
committed
Feb 22, 2023
1 parent
4ca3968
commit 0f38fbc
Showing
9 changed files
with
7,420 additions
and
41 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Module for classes managing api access to remote msx server.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
"""Datasets Class used to access remote datasets and metadata. | ||
Datasets represent remote data and their versions. Data is organized by `dataset name` | ||
and versions. Versions are created automatically, and associating versions is done by | ||
reusing existing `dataset names`. | ||
For example, if uploading file `nlp_train.csv` today, the msx server will store that | ||
dataset under the dataset name `nlp_train`. If I then upload another, newer file also named | ||
`nlp_train.csv`, the msx server will automatically associate them, and store the second | ||
dataset as `version 2`. | ||
The same applies to dataframes, passed in with a name, instead of a file path. | ||
Classes: | ||
Datasets | ||
""" | ||
import io | ||
import logging | ||
import pathlib | ||
from typing import Any, Callable, Optional | ||
|
||
import pandas as pd | ||
import requests | ||
from requests_toolbelt.multipart.encoder import ( | ||
MultipartEncoder, | ||
MultipartEncoderMonitor, | ||
) | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Datasets:
    """
    Main class for managing datasets on remote msx servers.

    Args:
        client (:obj:`MsxClient`): the client used to perform remote api requests.
        test_local (bool, optional): use a localhost address for the msx server.

    Attributes:
        client (:obj:`MsxClient`): the client used to perform remote api requests.
        test_local (bool, optional): use a localhost address for the msx server.
    """

    def __init__(self, client, test_local: bool = False):
        """Create a new Datasets class."""
        self.client = client
        self.test_local = test_local

    def __get_df_size(self, df: pd.DataFrame):
        """Return df's in-memory size in bytes, or None if it cannot be computed."""
        try:
            return df.memory_usage(index=True).sum()
        except Exception:
            # Size is informational only; never let a failure here abort an upload.
            return None

    def __get_mime_type(self, ext: str = ".csv") -> str:
        """Map a file extension to the MIME type sent with the multipart upload."""
        mime_types = {
            ".csv": "text/csv",
            ".parquet": "application/parquet",
            # BUGFIX: ".json" previously returned "application/parquet"
            # (copy/paste error).
            ".json": "application/json",
        }
        return mime_types.get(ext, "text/plain")

    def __get_pd_read_func(self, ext: str = ".csv"):
        """Return the pandas read_* function matching ext.

        Unknown extensions fall back to read_csv; callers validate ext against
        the allow-list before reaching this point.
        """
        readers = {
            ".csv": pd.read_csv,
            ".parquet": pd.read_parquet,
            ".json": pd.read_json,
        }
        return readers.get(ext, pd.read_csv)

    def __convert_df_to_bytes(self, df: pd.DataFrame, ext: str = ".csv") -> io.BytesIO:
        """Serialize df into an in-memory buffer in the given format, rewound to 0."""
        if ext == ".parquet":
            data = io.BytesIO(df.to_parquet(index=False))
        elif ext == ".json":
            # BUGFIX: pandas raises ValueError for to_json(index=False) with the
            # default orient; use orient="records" to omit the index instead.
            data = io.BytesIO(df.to_json(orient="records").encode("utf-8"))
        else:
            # ".csv" and any unexpected extension fall back to CSV output.
            data = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
        data.seek(0)
        return data

    def add(
        self,
        path_or_name: str,
        df: Optional[pd.DataFrame] = None,
        target: Optional[str] = None,
        store_s3: bool = False,
        df_read_args: Optional[dict[str, Any]] = None,
        callback: "Optional[Callable[[MultipartEncoderMonitor], None]]" = None,
        **kwargs,
    ):
        """
        Add a dataset to the connected msx server.

        If df is None, then the first arg must be a path to a dataset (file on
        disk, or soon a location that pandas can parse, such as s3); otherwise
        it is used as the `dataset name`.

        Args:
            path_or_name (str): Either a dataset name (if df is provided), or the path
                to a file on disk.
            df (:obj:`pandas.DataFrame`): A pandas dataframe.
            target (str, optional): The target (column) of the data that will be used
                when training. If no target is provided, then the last column will be
                used.
            store_s3 (bool, optional): Data can be stored in the isolated msx
                environment, or it can be stored in an accessible (secure) S3 bucket
                that every msx server includes.
            df_read_args (dict[str, Any], optional): If df is not defined, then
                optionally pass in pandas read_* kwargs.
            callback (callable, optional): Upload-progress callback; defaults to the
                module's logging-based monitor.
            **kwargs: If kwargs are provided, they will be serialized to dict[str, str]
                and passed to the upload server as is. This is useful because it allows
                passing additional fields to any pipelines or triggers configured to
                run after upload.

        Returns
        -------
        The server's JSON response: { path: str, **kwargs }

        Raises
        ------
        ValueError: if df is None and path_or_name has no extension, or its
            extension is not in the client's allowed read extensions.
        """
        filename = path_or_name

        # READ
        if df is None:
            # attempt reading path_or_name as a path on disk
            path = pathlib.Path(path_or_name)
            filename = path.name
            path_ext = path.suffix

            # BUGFIX: check for a missing extension first; previously the
            # allow-list check ran first, making this more specific error
            # message unreachable.
            if path_ext == "":
                raise ValueError("Path extension could not be determined.")

            allowed_ext = self.client.config.allowed_read_exts
            if path_ext not in allowed_ext:
                raise ValueError(f"Could not read path type {path_ext}")

            # read path with the matching pandas reader
            read_func = self.__get_pd_read_func(ext=path_ext)
            df = read_func(path_or_name, **(df_read_args or {}))

        # WRITE

        # for now using `.csv` for everything write related
        write_ext = ".csv"

        callback = callback or default_monitor

        target = target or "default"
        # kwargs is always a dict (possibly empty) for a **kwargs parameter, so
        # the previous `is not None` guard was redundant. Each extra value is
        # serialized to a text/plain multipart field.
        extra = {
            k: (str(v), io.BytesIO(bytes(str(v), "utf-8")), "text/plain")
            for k, v in kwargs.items()
        }

        # BUGFIX: interpolate the computed filename; previously a constant
        # string was used here, leaving the `path.name` assignment above dead.
        # Prefix selects the storage backend on the server side.
        filename = f"/s3/{filename}" if store_s3 else f"/datasets/{filename}"

        data = self.__convert_df_to_bytes(df, ext=write_ext)

        e = MultipartEncoder(
            fields={
                **extra,
                "file": (filename, data, self.__get_mime_type(write_ext)),
                "target": (target, io.BytesIO(bytes(target, "utf-8")), "text/plain"),
            }
        )
        m = MultipartEncoderMonitor(e, callback)

        if self.test_local:
            url = "http://localhost:8080/upload"
        else:
            url = f"{self.client.base_url}/upload"

        auth_headers = self.client.get_auth_headers()
        auth_headers = self.client.add_org_header(headers=auth_headers)

        res = requests.post(
            url, data=m, headers={**auth_headers, "Content-type": e.content_type}
        )

        return res.json()
|
||
|
||
def default_monitor(monitor: "MultipartEncoderMonitor") -> None:
    """Default upload-progress callback for a MultipartEncoderMonitor.

    Logs the total number of bytes uploaded so far at DEBUG level.
    (Fixes docstring typo "MultipartEncodeMonitor".)
    """
    # Lazy %-style args avoid formatting the message when DEBUG is disabled
    # (previously an eager f-string).
    logger.debug("Bytes read: %s", monitor.bytes_read)
Oops, something went wrong.