Closes #47 - updated datatype handling - Decimal, datetime.date, datetime.time (#70)

* add Decimal handler and generator functions; clean up random_dataframe() arguments and add decimal_column/date_column/time_column
* add datetime.date and datetime.time generators and handlers
* check for and handle decimals and datetime.dates by default
* return gpd.GeoSeries instead of GeometryArray
* add boolean series generator option
* add datatype imports with new directory structure
* ignore flake8 C901 - "too complex"
* add datatype compatibility helpers
* add optional with_ipython_display argument to prevent calling IPython.display() on an object that goes through handle_format()
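
For a quick feel for the new surface area, here is a minimal sketch exercising the new generator flags. The flag names come from the bullets above; the `num_rows` argument and the default values are assumptions, not confirmed by this commit:

import dx

# hypothetical call; decimal_column/date_column/time_column are the new flags
df = dx.random_dataframe(
    num_rows=5,           # assumed argument name
    decimal_column=True,  # decimal.Decimal values
    date_column=True,     # datetime.date values
    time_column=True,     # datetime.time values
)
print(df.dtypes)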
shouples committed Oct 13, 2022
1 parent 0e01d8d commit 816a165
Showing 20 changed files with 1,005 additions and 523 deletions.
2 changes: 2 additions & 0 deletions setup.cfg
@@ -19,6 +19,8 @@ select =
# docstrings must be triple-quoted, via flake8-docstrings
D300
ignore =
# "Too complex"
C901,
# Extra space in brackets
E20,
E203,
1 change: 1 addition & 0 deletions src/dx/__init__.py
@@ -1,4 +1,5 @@
from .comms import *
from .datatypes import *
from .dx import *
from .formatters import *
from .loggers import *
7 changes: 7 additions & 0 deletions src/dx/datatypes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .compatibility import *
from .date_time import *
from .geometry import *
from .main import *
from .misc import *
from .numeric import *
from .text import *
167 changes: 167 additions & 0 deletions src/dx/datatypes/compatibility.py
@@ -0,0 +1,167 @@
import traceback
from typing import Any

import pandas as pd
from pandas.io.json import build_table_schema

from dx.settings import get_settings

settings = get_settings()


def test_compatibility(value: Any, as_dataframe: bool = True) -> dict:
"""
Convenience function to test the compatibility of a given object
with the different steps involved with the dx display modes.
- pandas.io.json.build_table_schema (https://github.com/pandas-dev/pandas/blob/main/pandas/io/json/_table_schema.py)
- jupyter_client.jsonutil.json_clean (https://github.com/jupyter/jupyter_client/blob/main/jupyter_client/jsonutil.py)
- duckdb conn.register
- final dx output type
"""
result = {}
result.update(test_build_table_schema(value))
result.update(test_json_clean(value))
result.update(test_db_write(value))
result.update(test_dx_handling(value))
if as_dataframe:
return pd.DataFrame(result).transpose()
return result


def test_build_table_schema(value: Any, as_dataframe: bool = False) -> dict:
"""
Convenience function to test the compatibility of a given object
with the pandas.io.json.build_table_schema function, which
is called to set up the initial column schema during dx formatting.
"""
df = pd.DataFrame({"test": [value]})
result = {}

try:
schema = build_table_schema(df, index=False)
fields = schema["fields"]
field_type = [
field_schema["type"] for field_schema in fields if field_schema["name"] == "test"
][0]
result["pandas.io.json.build_table_schema"] = {
"success": True,
"type": field_type,
}
except Exception as e:
result["pandas.io.json.build_table_schema"] = {
"error": str(e),
"success": False,
"traceback": traceback.format_exc(),
}

if as_dataframe:
return pd.DataFrame(result).transpose()
return result


def test_json_clean(value: Any, as_dataframe: bool = False) -> dict:
"""
Convenience function to test the compatibility of a given object
with the jupyter_client.jsonutil.json_clean function, which
is called during IPython.display after dx formatting.
"""
df = pd.DataFrame({"test": [value]})
result = {}

try:
from jupyter_client.jsonutil import json_clean

clean_json = json_clean(df.to_dict("records"))
clean_json_value = clean_json[0]["test"]
result["jupyter_client.jsonutil.json_clean"] = {
"success": True,
"type": type(clean_json_value),
"value": clean_json_value,
}
except Exception as e:
result["jupyter_client.jsonutil.json_clean"] = {
"error": str(e),
"success": False,
"traceback": traceback.format_exc(),
}

if as_dataframe:
return pd.DataFrame(result).transpose()
return result


def test_db_write(value: Any, as_dataframe: bool = False) -> dict:
"""
Convenience function to test the compatibility of a given object
inside a pandas DataFrame during registration with a duckdb connection,
which is used during Datalink-enabled dataframe tracking for
push-down filtering.
"""
from dx.utils.tracking import get_db_connection  # deferred to avoid a circular import

df = pd.DataFrame({"test": [value]})
result = {}

db_connection = get_db_connection()
try:
db_connection.register("test", df)
db_df = db_connection.execute("SELECT * FROM test").df()
db_df_value = db_df.iloc[0]["test"]
result["duckdb.conn.register"] = {
"type": type(db_df_value),
"success": True,
"value": db_df_value,
}
except Exception as e:
result["duckdb.conn.register"] = {
"error": str(e),
"success": False,
"traceback": traceback.format_exc(),
}

if as_dataframe:
return pd.DataFrame(result).transpose()
return result


def test_dx_handling(value: Any, as_dataframe: bool = False) -> dict:
"""
Convenience function to test the compatibility of a given object
inside a pandas DataFrame through the entire dx formatting
and data type handling process.
"""
from dx.formatters.main import handle_format  # deferred to avoid a circular import

df = pd.DataFrame({"test": [value]})
result = {}

try:
payload, _ = handle_format(df, with_ipython_display=False)

if settings.DISPLAY_MODE == "simple":
dx_value = payload[settings.MEDIA_TYPE]["data"][0]["test"]
if settings.DISPLAY_MODE == "enhanced":
dx_value = payload[settings.MEDIA_TYPE]["data"][0][0]

dx_schema_fields = payload[settings.MEDIA_TYPE]["schema"]["fields"]
# only two fields are expected here by default (`index` and `test`),
# since we run the entire formatting process, which doesn't offer
# an option to exclude `index` from the schema
dx_schema_type = [field["type"] for field in dx_schema_fields if field["name"] == "test"][0]

result["dx.handle_format"] = {
"type": type(dx_value),
"success": True,
"value": dx_value,
"schema_type": dx_schema_type,
}
except Exception as e:
result["dx.handle_format"] = {
"error": str(e),
"success": False,
"traceback": traceback.format_exc(),
}

if as_dataframe:
return pd.DataFrame(result).transpose()
return result
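
As a usage sketch (not part of the commit itself), the top-level helper can be pointed at any scalar to see where it succeeds or fails across the four steps:

from decimal import Decimal
from dx.datatypes.compatibility import test_compatibility

# one row per step: build_table_schema, json_clean, duckdb registration,
# and dx.handle_format, each with success/type/value (or error/traceback)
report = test_compatibility(Decimal("1.25"))
print(report["success"])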
36 changes: 36 additions & 0 deletions src/dx/utils/date_time.py → src/dx/datatypes/date_time.py
@@ -21,6 +21,24 @@ def generate_datetime_series(num_rows: int) -> pd.Series:
)


def generate_date_series(num_rows: int) -> pd.Series:
return pd.Series(
[
(pd.Timestamp("now") + pd.Timedelta(f"{np.random.randint(-1000, 1000)} hours")).date()
for _ in range(num_rows)
]
)


def generate_time_series(num_rows: int) -> pd.Series:
return pd.Series(
[
(pd.Timestamp("now") + pd.Timedelta(f"{np.random.randint(-1000, 1000)} hours")).time()
for _ in range(num_rows)
]
)


def generate_time_period_series(num_rows: int) -> pd.Series:
return pd.Series(
[
@@ -70,6 +88,24 @@ def handle_time_delta_series(s: pd.Series) -> pd.Series:
return s


def handle_date_series(s: pd.Series) -> pd.Series:
types = (datetime.date,)
if any(isinstance(v, types) for v in s.dropna().head().values):
logger.debug(
f"series `{s.name}` has datetime.date values; converting with pd.to_datetime()"
)
s = pd.to_datetime(s)
return s


def handle_time_series(s: pd.Series) -> pd.Series:
types = (datetime.time,)
if any(isinstance(v, types) for v in s.dropna().head().values):
logger.debug(f"series `{s.name}` has datetime.time values; converting to string")
s = s.astype(str)
return s


def is_datetime_series(s: pd.Series) -> bool:
if str(s.dtype) in ("int", "float", "bool", "category", "period", "interval"):
return False
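
A short sketch of what the two new handlers above do to incoming series (behavior read off the code; the dates and times are illustrative):

import datetime
import pandas as pd
from dx.datatypes.date_time import handle_date_series, handle_time_series

dates = pd.Series([datetime.date(2022, 10, 13), None], name="d")
times = pd.Series([datetime.time(9, 30), datetime.time(17, 0)], name="t")

print(handle_date_series(dates).dtype)  # datetime64[ns], via pd.to_datetime()
print(handle_time_series(times).dtype)  # object; values coerced to strings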
2 changes: 1 addition & 1 deletion src/dx/utils/geometry.py → src/dx/datatypes/geometry.py
@@ -25,7 +25,7 @@ def generate_latlon_series(num_rows: int):

lats = [random.randint(-90, 89) + np.random.rand() for _ in range(num_rows)]
lons = [random.randint(-180, 179) + np.random.rand() for _ in range(num_rows)]
-    return gpd.points_from_xy(lons, lats)
+    return gpd.GeoSeries(gpd.points_from_xy(lons, lats))


def generate_filled_geojson_series(
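
For context on the GeoSeries change, the type difference is visible outside of dx as well (illustrative coordinates):

import geopandas as gpd

pts = gpd.points_from_xy([0.5, 1.5], [10.0, 20.0])
print(type(pts).__name__)                 # GeometryArray
print(type(gpd.GeoSeries(pts)).__name__)  # GeoSeries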