From 816a165a7f27d599c3171de97181cb95e74d6593 Mon Sep 17 00:00:00 2001
From: Dave Shoup <dave.shoup@gmail.com>
Date: Thu, 13 Oct 2022 17:01:53 -0400
Subject: [PATCH] Closes #47 - updated datatype handling - Decimal,
 datetime.date, datetime.time (#70)

* add Decimal handler and generator functions; clean up random_dataframe() arguments and add decimal_column/date_column/time_column
* add datetime.date and datetime.time generators and handlers
* check for and handle decimals and datetime.dates by default
* return gpd.GeoSeries instead of GeometryArray
* add boolean series generator option
* add datatype imports with new directory structure
* ignore flake8 C901 - "too complex"
* add datatype compatibility helpers
* add optional with_ipython_display argument to prevent calling IPython.display() on an object that goes through handle_format()
---
 setup.cfg                                |   2 +
 src/dx/__init__.py                       |   1 +
 src/dx/datatypes/__init__.py             |   7 +
 src/dx/datatypes/compatibility.py        | 167 ++++++++++
 src/dx/{utils => datatypes}/date_time.py |  36 +++
 src/dx/{utils => datatypes}/geometry.py  |   2 +-
 src/dx/datatypes/main.py                 | 177 +++++++++++
 src/dx/datatypes/misc.py                 | 140 ++++++++
 src/dx/datatypes/numeric.py              |  43 +++
 src/dx/datatypes/text.py                 |  31 ++
 src/dx/formatters/main.py                |  39 ++-
 src/dx/utils/__init__.py                 |   3 -
 src/dx/utils/datatypes.py                | 387 -----------------------
 src/dx/utils/formatting.py               |  93 +++++-
 tests/conftest.py                        |   2 +-
 tests/test_dataresource.py               |   2 +-
 tests/test_datatype_handling.py          |   2 +-
 tests/test_datatypes.py                  | 339 +++++++++++++-------
 tests/test_dx.py                         |   2 +-
 tests/test_formatting.py                 |  53 +++-
 20 files changed, 1005 insertions(+), 523 deletions(-)
 create mode 100644 src/dx/datatypes/__init__.py
 create mode 100644 src/dx/datatypes/compatibility.py
 rename src/dx/{utils => datatypes}/date_time.py (69%)
 rename src/dx/{utils => datatypes}/geometry.py (98%)
 create mode 100644 src/dx/datatypes/main.py
 create mode 100644 src/dx/datatypes/misc.py
 create mode 100644 src/dx/datatypes/numeric.py
 create mode 100644 src/dx/datatypes/text.py
 delete mode 100644 src/dx/utils/datatypes.py

diff --git a/setup.cfg b/setup.cfg
index 501a9624..f1f9c9b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,6 +19,8 @@ select =
     # docstrings must be triple-quoted, via flake8-docstrings
     D300
 ignore =
+    # "Too complex"
+    C901,
     # Extra space in brackets
     E20,
     E203,
diff --git a/src/dx/__init__.py b/src/dx/__init__.py
index 342b7ecd..cf5a5a73 100644
--- a/src/dx/__init__.py
+++ b/src/dx/__init__.py
@@ -1,4 +1,5 @@
 from .comms import *
+from .datatypes import *
 from .dx import *
 from .formatters import *
 from .loggers import *
diff --git a/src/dx/datatypes/__init__.py b/src/dx/datatypes/__init__.py
new file mode 100644
index 00000000..4b2cd633
--- /dev/null
+++ b/src/dx/datatypes/__init__.py
@@ -0,0 +1,7 @@
+from .compatibility import *
+from .date_time import *
+from .geometry import *
+from .main import *
+from .misc import *
+from .numeric import *
+from .text import *
diff --git a/src/dx/datatypes/compatibility.py b/src/dx/datatypes/compatibility.py
new file mode 100644
index 00000000..6ef53a84
--- /dev/null
+++ b/src/dx/datatypes/compatibility.py
@@ -0,0 +1,167 @@
+import traceback
+from typing import Any
+
+import pandas as pd
+from pandas.io.json import build_table_schema
+
+from dx.settings import get_settings
+
+settings = get_settings()
+
+
+def test_compatibility(value: Any, as_dataframe: bool = True) -> dict:
+    """
+    Run `value` through every dx compatibility probe and merge their reports:
+    - pandas.io.json.build_table_schema (https://github.com/pandas-dev/pandas/blob/main/pandas/io/json/_table_schema.py)
+    - jupyter_client.jsonutil.json_clean (https://github.com/jupyter/jupyter_client/blob/main/jupyter_client/jsonutil.py)
+    - duckdb conn.register
+    - final dx output type
+    Returns the merged dict, or its transposed DataFrame when `as_dataframe` is set.
+    """
+    report = {}
+    probes = (test_build_table_schema, test_json_clean, test_db_write, test_dx_handling)
+    for probe in probes:
+        # each probe returns a single-key dict named after the step it exercised
+        report.update(probe(value))
+    if not as_dataframe:
+        return report
+    return pd.DataFrame(report).transpose()
+
+
+def test_build_table_schema(value: Any, as_dataframe: bool = False) -> dict:
+    """
+    Check whether a one-cell dataframe holding `value` survives
+    pandas.io.json.build_table_schema, the call that sets up the
+    initial column schema during dx formatting.
+    """
+    step = "pandas.io.json.build_table_schema"
+    frame = pd.DataFrame({"test": [value]})
+
+    try:
+        table_fields = build_table_schema(frame, index=False)["fields"]
+        # first schema entry describing the "test" column
+        matching_types = [f["type"] for f in table_fields if f["name"] == "test"]
+        outcome = {
+            "success": True,
+            "type": matching_types[0],
+        }
+    except Exception as e:
+        outcome = {
+            "error": str(e),
+            "success": False,
+            "traceback": traceback.format_exc(),
+        }
+    # keyed by the step name so the report can be merged with other steps
+    result = {step: outcome}
+
+    if not as_dataframe:
+        return result
+    return pd.DataFrame(result).transpose()
+
+
+def test_json_clean(value: Any, as_dataframe: bool = False) -> dict:
+    """
+    Check whether a one-cell dataframe holding `value` survives
+    jupyter_client.jsonutil.json_clean, which is called during
+    IPython.display after dx formatting.
+    """
+    step = "jupyter_client.jsonutil.json_clean"
+    frame = pd.DataFrame({"test": [value]})
+
+    try:
+        # imported inside the try so a missing jupyter_client is reported as a failed step
+        from jupyter_client.jsonutil import json_clean
+        cleaned = json_clean(frame.to_dict("records"))[0]["test"]
+        outcome = {
+            "success": True,
+            "type": type(cleaned),
+            "value": cleaned,
+        }
+    except Exception as e:
+        outcome = {
+            "error": str(e),
+            "success": False,
+            "traceback": traceback.format_exc(),
+        }
+    result = {step: outcome}
+
+    if not as_dataframe:
+        return result
+    return pd.DataFrame(result).transpose()
+
+
+def test_db_write(value: Any, as_dataframe: bool = False) -> dict:
+    """
+    Check whether a one-cell dataframe holding `value` can be registered
+    with the duckdb connection and read back with a SELECT; that
+    connection is used during Datalink-enabled dataframe tracking for
+    push-down filtering.
+    """
+    from dx.utils.tracking import get_db_connection  # circular import
+
+    step = "duckdb.conn.register"
+    frame = pd.DataFrame({"test": [value]})
+    conn = get_db_connection()
+
+    try:
+        conn.register("test", frame)
+        round_tripped = conn.execute("SELECT * FROM test").df().iloc[0]["test"]
+        outcome = {
+            "type": type(round_tripped),
+            "success": True,
+            "value": round_tripped,
+        }
+    except Exception as e:
+        outcome = {
+            "error": str(e),
+            "success": False,
+            "traceback": traceback.format_exc(),
+        }
+    result = {step: outcome}
+
+    if not as_dataframe:
+        return result
+    return pd.DataFrame(result).transpose()
+
+
+def test_dx_handling(value: Any, as_dataframe: bool = False) -> dict:
+    """
+    Test `value` inside a pandas DataFrame through the entire dx formatting and
+    data type handling process; a display mode other than "simple"/"enhanced"
+    is reported as a failed step with an explicit error message.
+    """
+    from dx.formatters.main import handle_format  # circular import
+
+    df = pd.DataFrame({"test": [value]})
+    result = {}
+
+    try:
+        payload, _ = handle_format(df, with_ipython_display=False)
+        media = payload[settings.MEDIA_TYPE]
+        if settings.DISPLAY_MODE == "simple":
+            dx_value = media["data"][0]["test"]
+        elif settings.DISPLAY_MODE == "enhanced":
+            dx_value = media["data"][0][0]
+        else:
+            raise ValueError(f"unsupported display mode: {settings.DISPLAY_MODE}")
+
+        # by default the schema holds two fields, `index` and `test`; the full formatting
+        # process has no switch to drop `index`, so the `test` field is picked by name
+        dx_schema_type = [f["type"] for f in media["schema"]["fields"] if f["name"] == "test"][0]
+
+        result["dx.handle_format"] = {
+            "type": type(dx_value),
+            "success": True,
+            "value": dx_value,
+            "schema_type": dx_schema_type,
+        }
+    except Exception as e:
+        result["dx.handle_format"] = {
+            "error": str(e),
+            "success": False,
+            "traceback": traceback.format_exc(),
+        }
+
+    if as_dataframe:
+        return pd.DataFrame(result).transpose()
+    return result
diff --git a/src/dx/utils/date_time.py b/src/dx/datatypes/date_time.py
similarity index 69%
rename from src/dx/utils/date_time.py
rename to src/dx/datatypes/date_time.py
index 4038ba27..73e119c1 100644
--- a/src/dx/utils/date_time.py
+++ b/src/dx/datatypes/date_time.py
@@ -21,6 +21,24 @@ def generate_datetime_series(num_rows: int) -> pd.Series:
     )
 
 
+def generate_date_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` datetime.date values, each within 1000 hours of the current time."""
+    dates = []
+    for _ in range(num_rows):
+        offset = pd.Timedelta(f"{np.random.randint(-1000, 1000)} hours")
+        dates.append((pd.Timestamp("now") + offset).date())
+    return pd.Series(dates)
+
+
+def generate_time_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` datetime.time values taken from timestamps around the current time."""
+    times = []
+    for _ in range(num_rows):
+        offset = pd.Timedelta(f"{np.random.randint(-1000, 1000)} hours")
+        times.append((pd.Timestamp("now") + offset).time())
+    return pd.Series(times)
+
+
 def generate_time_period_series(num_rows: int) -> pd.Series:
     return pd.Series(
         [
@@ -70,6 +88,24 @@ def handle_time_delta_series(s: pd.Series) -> pd.Series:
     return s
 
 
+def handle_date_series(s: pd.Series) -> pd.Series:
+    """Convert `s` with pd.to_datetime() if its first non-null values include a datetime.date."""
+    sample = s.dropna().head().values
+    if not any(isinstance(value, datetime.date) for value in sample):
+        return s
+    message = f"series `{s.name}` has datetime.date values; converting with pd.to_datetime()"
+    logger.debug(message)
+    return pd.to_datetime(s)
+
+
+def handle_time_series(s: pd.Series) -> pd.Series:
+    """Cast `s` to strings if any of its first non-null values is a datetime.time."""
+    if not any(isinstance(value, datetime.time) for value in s.dropna().head().values):
+        return s
+    logger.debug(f"series `{s.name}` has datetime.time values; converting to string")
+    return s.astype(str)
+
+
 def is_datetime_series(s: pd.Series) -> bool:
     if str(s.dtype) in ("int", "float", "bool", "category", "period", "interval"):
         return False
diff --git a/src/dx/utils/geometry.py b/src/dx/datatypes/geometry.py
similarity index 98%
rename from src/dx/utils/geometry.py
rename to src/dx/datatypes/geometry.py
index 83d65631..324c27e3 100644
--- a/src/dx/utils/geometry.py
+++ b/src/dx/datatypes/geometry.py
@@ -25,7 +25,7 @@ def generate_latlon_series(num_rows: int):
 
     lats = [random.randint(-90, 89) + np.random.rand() for _ in range(num_rows)]
     lons = [random.randint(-180, 179) + np.random.rand() for _ in range(num_rows)]
-    return gpd.points_from_xy(lons, lats)
+    return gpd.GeoSeries(gpd.points_from_xy(lons, lats))
 
 
 def generate_filled_geojson_series(
diff --git a/src/dx/datatypes/main.py b/src/dx/datatypes/main.py
new file mode 100644
index 00000000..00379f9c
--- /dev/null
+++ b/src/dx/datatypes/main.py
@@ -0,0 +1,177 @@
+import numpy as np
+import pandas as pd
+import structlog
+
+from dx.datatypes import date_time, geometry, misc, numeric, text
+
+logger = structlog.get_logger(__name__)
+
+# this is primarily used for testing to match the optional
+# data types used for random dataframe generation,
+# and should match the keyword arguments available in `random_dataframe()`
+DX_DATATYPES = {
+    "dtype_column": True,
+    "integer_column": True,
+    "float_column": True,
+    "bool_column": False,
+    "decimal_column": False,
+    "datetime_column": True,
+    "date_column": False,
+    "time_column": False,
+    "time_delta_column": False,
+    "time_period_column": False,
+    "time_interval_column": False,
+    "text_column": False,
+    "keyword_column": True,
+    "dict_column": False,
+    "list_column": False,
+    "nested_tabular_column": False,
+    "latlon_point_column": False,
+    "filled_geojson_column": False,
+    "exterior_geojson_column": False,
+    "bytes_column": True,
+    "ipv4_address_column": False,
+    "ipv6_address_column": False,
+    "complex_number_column": False,
+}
+# specifically used for pytest.mark.parametrize ordering
+SORTED_DX_DATATYPES = sorted(list(DX_DATATYPES.keys()))
+
+
+def quick_random_dataframe(
+    num_rows: int = 5,
+    num_cols: int = 2,
+    dtype: str = "float",
+    factor: float = 1.0,
+) -> pd.DataFrame:
+    """
+    Build a `num_rows` x `num_cols` dataframe of `np.random.rand` values scaled by
+    `factor` (0.0-1.0 by default) and cast to `dtype`.
+    """
+    scaled = np.random.rand(num_rows, num_cols) * factor
+    # errors="ignore" hands back the uncast frame when the cast to `dtype` fails
+    return pd.DataFrame(scaled).astype(dtype, errors="ignore")
+
+
+def random_dataframe(
+    num_rows: int = 5,
+    dtype_column: bool = True,
+    integer_column: bool = True,
+    float_column: bool = True,
+    bool_column: bool = False,
+    decimal_column: bool = False,
+    datetime_column: bool = True,
+    date_column: bool = False,
+    time_column: bool = False,
+    time_delta_column: bool = False,
+    time_period_column: bool = False,
+    time_interval_column: bool = False,
+    text_column: bool = False,
+    keyword_column: bool = True,
+    dict_column: bool = False,
+    list_column: bool = False,
+    nested_tabular_column: bool = False,
+    latlon_point_column: bool = False,
+    filled_geojson_column: bool = False,
+    exterior_geojson_column: bool = False,
+    bytes_column: bool = True,
+    ipv4_address_column: bool = False,
+    ipv6_address_column: bool = False,
+    complex_number_column: bool = False,
+):  # noqa: C901
+    """
+    Convenience function to generate a dataframe of `num_rows` length with one
+    mixed-type column per enabled `*_column` flag, each named after its flag.
+    """
+    df = pd.DataFrame(index=list(range(num_rows)))
+
+    if dtype_column:
+        df["dtype_column"] = misc.generate_dtype_series(num_rows)
+
+    if bool_column:
+        df["bool_column"] = misc.generate_boolean_series(num_rows)
+
+    # numeric columns
+    if integer_column:
+        df["integer_column"] = numeric.generate_integer_series(num_rows)
+
+    if float_column:
+        df["float_column"] = numeric.generate_float_series(num_rows)
+
+    if decimal_column:
+        df["decimal_column"] = numeric.generate_decimal_series(num_rows)
+
+    if complex_number_column:
+        df["complex_number_column"] = numeric.generate_complex_number_series(num_rows)
+
+    # date/time columns
+    if datetime_column:
+        df["datetime_column"] = date_time.generate_datetime_series(num_rows)
+
+    if date_column:
+        df["date_column"] = date_time.generate_date_series(num_rows)
+
+    if time_column:
+        df["time_column"] = date_time.generate_time_series(num_rows)
+
+    if time_delta_column:
+        df["time_delta_column"] = date_time.generate_time_delta_series(num_rows)
+
+    if time_period_column:
+        df["time_period_column"] = date_time.generate_time_period_series(num_rows)
+
+    if time_interval_column:
+        df["time_interval_column"] = date_time.generate_time_interval_series(num_rows)
+
+    # string columns
+    if text_column:
+        df["text_column"] = text.generate_text_series(num_rows)
+
+    if keyword_column:
+        df["keyword_column"] = text.generate_keyword_series(num_rows)
+
+    # container columns
+    if dict_column:
+        df["dict_column"] = misc.generate_dict_series(num_rows)
+
+    if list_column:
+        df["list_column"] = misc.generate_list_series(num_rows)
+
+    if nested_tabular_column:
+        df["nested_tabular_column"] = generate_nested_tabular_series(
+            num_rows,
+            float_column=True,
+            keyword_column=True,
+        )
+
+    # geopandas/shapely columns
+    if latlon_point_column:
+        df["latlon_point_column"] = geometry.generate_latlon_series(num_rows)
+
+    if filled_geojson_column:
+        df["filled_geojson_column"] = geometry.generate_filled_geojson_series(num_rows)
+
+    if exterior_geojson_column:
+        df["exterior_geojson_column"] = geometry.generate_exterior_bounds_geojson_series(num_rows)
+
+    # extras
+    if bytes_column:
+        df["bytes_column"] = misc.generate_bytes_series(num_rows)
+
+    if ipv4_address_column:
+        df["ipv4_address_column"] = misc.generate_ipv4_series(num_rows)
+
+    if ipv6_address_column:
+        df["ipv6_address_column"] = misc.generate_ipv6_series(num_rows)
+
+    return df
+
+
+# not adding this to datatypes/misc.py due to circular import
+def generate_nested_tabular_series(num_rows: int, num_nested_rows: int = 5, **kwargs) -> pd.Series:
+    """Return `num_rows` cells, each the records of a `num_nested_rows`-row random dataframe."""
+    cells = []
+    for _ in range(num_rows):
+        nested = random_dataframe(num_rows=num_nested_rows, **kwargs)
+        cells.append(nested.to_dict("records"))
+    return pd.Series(cells)
diff --git a/src/dx/datatypes/misc.py b/src/dx/datatypes/misc.py
new file mode 100644
index 00000000..fdbd22f8
--- /dev/null
+++ b/src/dx/datatypes/misc.py
@@ -0,0 +1,140 @@
+import ipaddress
+import json
+import random
+
+import numpy as np
+import pandas as pd
+import structlog
+
+logger = structlog.get_logger(__name__)
+
+
+### Generator helper functions ###
+def generate_boolean_series(num_rows: int) -> pd.Series:
+    return pd.Series([random.choice([True, False]) for _ in range(num_rows)])
+
+
+def generate_dtype_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` randomly chosen builtin type objects."""
+    choices = [float, int, str, bool, set, tuple, dict, list]
+    return pd.Series([random.choice(choices) for _ in range(num_rows)])
+
+
+def generate_dict_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` small dicts holding a fruit name, an int in 0-10 and a bool."""
+    fruits = ["apple", "banana", "orange", "pear"]
+    rows = []
+    for _ in range(num_rows):
+        row = {}
+        row["nested_property"] = random.choice(fruits)
+        row["nested_other_property"] = random.randint(0, 10)
+        row["nested_bool"] = random.choice([True, False])
+        rows.append(row)
+    return pd.Series(rows)
+
+
+def generate_list_series(num_rows: int) -> pd.Series:
+    return pd.Series([[random.randint(0, 5) for _ in range(5)] for _ in range(num_rows)])
+
+
+def generate_bytes_series(num_rows: int, n_bytes: int = 10) -> pd.Series:
+    return pd.Series([np.random.bytes(n_bytes) for _ in range(num_rows)])
+
+
+def generate_ipv4_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` random ipaddress.IPv4Address values."""
+    dotted_quads = (
+        ".".join([str(random.randint(0, 255)) for _ in range(4)]) for _ in range(num_rows)
+    )
+    return pd.Series([ipaddress.ip_address(quad) for quad in dotted_quads])
+
+
+def generate_ipv6_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` random ipaddress.IPv6Address values."""
+    addresses = []
+    for _ in range(num_rows):
+        # eight 16-bit groups rendered as lowercase hex without the "0x" prefix
+        groups = [format(random.randint(0, 65_535), "x") for _ in range(8)]
+        addresses.append(ipaddress.ip_address(":".join(groups)))
+    return pd.Series(addresses)
+
+
+### Handler helper functions ###
+def handle_dict_series(s: pd.Series) -> pd.Series:
+    """JSON-encode the dict values of `s` if any of its first non-null values is a dict."""
+    if not any(isinstance(value, dict) for value in s.dropna().head().values):
+        return s
+    logger.debug(f"series `{s.name}` has dicts; converting to json string")
+    return s.apply(lambda cell: json.dumps(cell) if isinstance(cell, dict) else cell)
+
+
+def handle_dtype_series(s: pd.Series):
+    """
+    Cast `s` to strings if any of its first non-null values is a type or numpy dtype.
+    """
+    sample = s.dropna().head().values
+    if not any(isinstance(value, (type, np.dtype)) for value in sample):
+        return s
+    logger.debug(f"series `{s.name}` has types; converting to strings")
+    return s.astype(str)
+
+
+def handle_interval_series(s: pd.Series) -> pd.Series:
+    """Turn pd.Interval values into [left, right] lists if one is among the first values."""
+    if not any(isinstance(value, pd.Interval) for value in s.dropna().head().values):
+        return s
+    logger.debug(f"series `{s.name}` has intervals; converting to left/right")
+    return s.apply(lambda iv: [iv.left, iv.right] if isinstance(iv, pd.Interval) else iv)
+
+
+def handle_ip_address_series(s: pd.Series) -> pd.Series:
+    address_types = (ipaddress.IPv4Address, ipaddress.IPv6Address)
+    if not any(isinstance(value, address_types) for value in s.dropna().head().values):
+        return s  # no address among the first non-null values, nothing to convert
+    logger.debug(f"series `{s.name}` has ip addresses; converting to strings")
+    return s.astype(str)  # every value is stringified, not only the addresses
+
+
+def handle_sequence_series(s: pd.Series) -> pd.Series:
+    types = (list, tuple, set, np.ndarray)  # only these get joined; other values pass through
+    if is_sequence_series(s):
+        logger.debug(f"series `{s.name}` has sequences; converting to comma-separated string")
+        s = s.apply(lambda x: ", ".join([str(val) for val in x]) if isinstance(x, types) else x)
+    return s
+
+
+def handle_unk_type_series(s: pd.Series) -> pd.Series:
+    if is_json_serializable(s):
+        return s  # the sampled values already serialize to JSON, nothing to convert
+    logger.debug(f"series `{s.name}` has non-JSON-serializable types; converting to string")
+    return s.astype(str)
+
+
+### Type checking helper functions ###
+def is_sequence_series(s: pd.Series) -> bool:
+    """
+    Report whether an object-dtype series has a list/tuple/set/ndarray among its
+    first non-null values (only `s.dropna().head()` is inspected).
+    """
+    sequence_types = (list, tuple, set, np.ndarray)
+    if str(s.dtype) == "object":
+        leading_values = s.dropna().head().values
+        return any(isinstance(value, sequence_types) for value in leading_values)
+    return False
+
+
+def is_json_serializable(s: pd.Series) -> bool:
+    """
+    Report whether the first non-null values of `s` can be passed to json.dumps.
+    """
+    try:
+        json.dumps(s.dropna().head().values.tolist())
+    except (TypeError, OverflowError, UnicodeDecodeError):
+        # the serialization failures we normally expect
+        return False
+    except ValueError as ve:
+        # other ValueErrors, e.g. "Series index must be unique for orient='index'", are logged
+        logger.debug(ve)
+        return False
+    else:
+        return True
diff --git a/src/dx/datatypes/numeric.py b/src/dx/datatypes/numeric.py
new file mode 100644
index 00000000..788d73c1
--- /dev/null
+++ b/src/dx/datatypes/numeric.py
@@ -0,0 +1,43 @@
+from decimal import Decimal
+
+import numpy as np
+import pandas as pd
+import structlog
+
+logger = structlog.get_logger(__name__)
+
+
+### Generator helper functions ###
+def generate_integer_series(num_rows: int) -> pd.Series:
+    return pd.Series([np.random.randint(-100, 100) for _ in range(num_rows)])
+
+
+def generate_float_series(num_rows: int) -> pd.Series:
+    return pd.Series([np.random.rand() for _ in range(num_rows)])
+
+
+def generate_decimal_series(num_rows: int) -> pd.Series:
+    return pd.Series([Decimal(np.random.rand()) for _ in range(num_rows)])
+
+
+def generate_complex_number_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` complex numbers with real and imaginary parts from np.random.rand()."""
+    parts = ((np.random.rand(), np.random.rand()) for _ in range(num_rows))
+    return pd.Series([complex(real=real_part, imag=imag_part) for real_part, imag_part in parts])
+
+
+### Handler helper functions ###
+def handle_complex_number_series(s: pd.Series) -> pd.Series:
+    types = (complex, np.complexfloating)  # the `np.complex` alias was removed in NumPy 1.24
+    if any(isinstance(v, types) for v in s.dropna().head().values):
+        logger.debug(f"series `{s.name}` has complex numbers; converting to real/imag string")
+        s = s.apply(lambda x: f"{x.real}+{x.imag}j" if isinstance(x, types) else x)
+    return s
+
+
+def handle_decimal_series(s: pd.Series) -> pd.Series:
+    """Cast `s` to float if any of its first non-null values is a decimal.Decimal."""
+    if not any(isinstance(value, Decimal) for value in s.dropna().head().values):
+        return s
+    logger.debug(f"series `{s.name}` has Decimals; converting to float")
+    return s.astype(float)
diff --git a/src/dx/datatypes/text.py b/src/dx/datatypes/text.py
new file mode 100644
index 00000000..c40271fe
--- /dev/null
+++ b/src/dx/datatypes/text.py
@@ -0,0 +1,31 @@
+import random
+import string
+
+import numpy as np
+import pandas as pd
+import structlog
+
+try:
+    from faker import Faker
+
+    fake = Faker()
+    FAKER_INSTALLED = True
+except ImportError:
+    FAKER_INSTALLED = False
+
+
+logger = structlog.get_logger(__name__)
+
+
+def generate_text_series(num_rows: int) -> pd.Series:
+    """Return `num_rows` Faker texts, or the scalar NaN (with a warning) if faker is missing."""
+    if FAKER_INSTALLED:
+        return pd.Series([fake.text() for _ in range(num_rows)])
+    logger.warning("faker is not installed, skipping text_column")
+    return np.nan
+
+
+def generate_keyword_series(num_rows: int, num_letters: int = 2) -> pd.Series:
+    """Return `num_rows` strings of `num_letters` distinct random uppercase ASCII letters."""
+    letters = string.ascii_uppercase
+    return pd.Series(["".join(random.sample(letters, num_letters)) for _ in range(num_rows)])
diff --git a/src/dx/formatters/main.py b/src/dx/formatters/main.py
index 20078265..4aa26d36 100644
--- a/src/dx/formatters/main.py
+++ b/src/dx/formatters/main.py
@@ -12,8 +12,12 @@
 from dx.sampling import get_df_dimensions, sample_if_too_big
 from dx.settings import settings
 from dx.types import DXDisplayMode
-from dx.utils.datatypes import to_dataframe
-from dx.utils.formatting import generate_metadata, is_default_index, normalize_index_and_columns
+from dx.utils.formatting import (
+    generate_metadata,
+    is_default_index,
+    normalize_index_and_columns,
+    to_dataframe,
+)
 from dx.utils.tracking import DXDF_CACHE, SUBSET_TO_DISPLAY_ID, DXDataFrame, get_db_connection
 
 logger = structlog.get_logger(__name__)
@@ -36,6 +40,7 @@ def datalink_processing(
     df: pd.DataFrame,
     default_index_used: bool,
     ipython_shell: Optional[InteractiveShell] = None,
+    with_ipython_display: bool = True,
 ):
     dxdf = DXDataFrame(df)
     logger.debug(f"{dxdf=}")
@@ -52,6 +57,7 @@ def datalink_processing(
         update=parent_display_id,
         display_id=dxdf.display_id,
         has_default_index=default_index_used,
+        with_ipython_display=with_ipython_display,
     )
 
     # this needs to happen after sending to the frontend
@@ -65,6 +71,7 @@ def datalink_processing(
 
 def handle_format(
     obj,
+    with_ipython_display: bool = True,
     ipython_shell: Optional[InteractiveShell] = None,
 ):
     ipython = ipython_shell or get_ipython()
@@ -81,6 +88,7 @@ def handle_format(
         payload, metadata = format_output(
             obj,
             has_default_index=default_index_used,
+            with_ipython_display=with_ipython_display,
         )
         return payload, metadata
 
@@ -89,11 +97,16 @@ def handle_format(
             obj,
             default_index_used,
             ipython_shell=ipython,
+            with_ipython_display=with_ipython_display,
         )
     except Exception as e:
         logger.debug(f"Error in datalink_processing: {e}")
         # fall back to default processing
-        payload, metadata = format_output(obj, has_default_index=default_index_used)
+        payload, metadata = format_output(
+            obj,
+            has_default_index=default_index_used,
+            with_ipython_display=with_ipython_display,
+        )
 
     return payload, metadata
 
@@ -148,6 +161,7 @@ def format_output(
     update: bool = False,
     display_id: Optional[str] = None,
     has_default_index: bool = True,
+    with_ipython_display: bool = True,
 ) -> tuple:
     display_id = display_id or str(uuid.uuid4())
 
@@ -169,14 +183,15 @@ def format_output(
     metadata = {settings.MEDIA_TYPE: metadata}
 
     # this needs to happen so we can update by display_id as needed
-    with pd.option_context("html.table_schema", settings.HTML_TABLE_SCHEMA):
-        logger.debug(f"displaying {settings.MEDIA_TYPE} payload in {display_id=}")
-        ipydisplay(
-            payload,
-            raw=True,
-            metadata=metadata,
-            display_id=display_id,
-            update=update,
-        )
+    if with_ipython_display:
+        with pd.option_context("html.table_schema", settings.HTML_TABLE_SCHEMA):
+            logger.debug(f"displaying {settings.MEDIA_TYPE} payload in {display_id=}")
+            ipydisplay(
+                payload,
+                raw=True,
+                metadata=metadata,
+                display_id=display_id,
+                update=update,
+            )
 
     return (payload, metadata)
diff --git a/src/dx/utils/__init__.py b/src/dx/utils/__init__.py
index 8123c46a..aa200cde 100644
--- a/src/dx/utils/__init__.py
+++ b/src/dx/utils/__init__.py
@@ -1,5 +1,2 @@
-from .datatypes import *
-from .date_time import *
 from .formatting import *
-from .geometry import *
 from .tracking import *
diff --git a/src/dx/utils/datatypes.py b/src/dx/utils/datatypes.py
deleted file mode 100644
index 7cfaafbc..00000000
--- a/src/dx/utils/datatypes.py
+++ /dev/null
@@ -1,387 +0,0 @@
-import ipaddress
-import json
-import random
-import string
-
-import numpy as np
-import pandas as pd
-import structlog
-
-from dx.utils import date_time, geometry
-
-try:
-    from faker import Faker
-
-    fake = Faker()
-    FAKER_INSTALLED = True
-except ImportError:
-    FAKER_INSTALLED = False
-
-
-logger = structlog.get_logger(__name__)
-
-
-DX_DATATYPES = {
-    "dtype_column": True,
-    "integer_column": True,
-    "float_column": True,
-    "datetime_column": True,
-    "time_delta_column": False,
-    "time_period_column": False,
-    "time_interval_column": False,
-    "text_column": False,
-    "keyword_column": True,
-    "dict_column": False,
-    "list_column": False,
-    "nested_tabular_column": False,
-    "latlon_point_column": False,
-    "filled_geojson_column": False,
-    "exterior_geojson_column": False,
-    "bytes_column": True,
-    "ipv4_address_column": False,
-    "ipv6_address_column": False,
-    "complex_number_column": False,
-}
-SORTED_DX_DATATYPES = sorted(list(DX_DATATYPES.keys()))
-
-
-def generate_integer_series(num_rows: int) -> pd.Series:
-    return pd.Series([np.random.randint(-100, 100) for _ in range(num_rows)])
-
-
-def generate_float_series(num_rows: int) -> pd.Series:
-    return pd.Series([np.random.rand() for _ in range(num_rows)])
-
-
-def generate_complex_number_series(num_rows: int) -> pd.Series:
-    return pd.Series(
-        [complex(real=np.random.rand(), imag=np.random.rand()) for _ in range(num_rows)]
-    )
-
-
-def generate_dtype_series(num_rows: int) -> pd.Series:
-    return pd.Series(
-        [random.choice([float, int, str, bool, set, tuple, dict, list]) for _ in range(num_rows)]
-    )
-
-
-def generate_text_series(num_rows: int) -> pd.Series:
-    if not FAKER_INSTALLED:
-        logger.warning("faker is not installed, skipping text_column")
-        return np.nan
-
-    return pd.Series([fake.text() for _ in range(num_rows)])
-
-
-def generate_keyword_series(num_rows: int, num_letters: int = 2) -> pd.Series:
-    return pd.Series(
-        ["".join(random.sample(string.ascii_uppercase, num_letters)) for _ in range(num_rows)]
-    )
-
-
-def generate_dict_series(num_rows: int) -> pd.Series:
-    return pd.Series(
-        [
-            {
-                "nested_property": random.choice(["apple", "banana", "orange", "pear"]),
-                "nested_other_property": random.randint(0, 10),
-                "nested_bool": random.choice([True, False]),
-            }
-            for _ in range(num_rows)
-        ]
-    )
-
-
-def generate_list_series(num_rows: int) -> pd.Series:
-    return pd.Series([[random.randint(0, 5) for _ in range(5)] for _ in range(num_rows)])
-
-
-def generate_bytes_series(num_rows: int, n_bytes: int = 10) -> pd.Series:
-    return pd.Series([np.random.bytes(n_bytes) for _ in range(num_rows)])
-
-
-def generate_nested_tabular_series(num_rows: int, num_nested_rows: int = 5, **kwargs) -> pd.Series:
-    return pd.Series(
-        [
-            random_dataframe(num_rows=num_nested_rows, **kwargs).to_dict("records")
-            for _ in range(num_rows)
-        ]
-    )
-
-
-def generate_ipv4_series(num_rows: int) -> pd.Series:
-    def random_ipv4():
-        address_str = ".".join(str(random.randint(0, 255)) for _ in range(4))
-        return ipaddress.ip_address(address_str)
-
-    return pd.Series([random_ipv4() for _ in range(num_rows)])
-
-
-def generate_ipv6_series(num_rows: int) -> pd.Series:
-    def random_ipv6():
-        address_str = ":".join(
-            str(hex(random.randint(0, 65_535))).replace("0x", "") for _ in range(8)
-        )
-        return ipaddress.ip_address(address_str)
-
-    return pd.Series([random_ipv6() for _ in range(num_rows)])
-
-
-def handle_complex_number_series(s: pd.Series) -> pd.Series:
-    types = (complex, np.complex)
-    if any(isinstance(v, types) for v in s.dropna().head().values):
-        logger.debug(f"series `{s.name}` has complex numbers; converting to real/imag string")
-        s = s.apply(lambda x: f"{x.real}+{x.imag}j" if isinstance(x, types) else x)
-    return s
-
-
-def handle_dict_series(s: pd.Series) -> pd.Series:
-    types = dict
-    if any(isinstance(v, types) for v in s.dropna().head().values):
-        logger.debug(f"series `{s.name}` has dicts; converting to json string")
-        s = s.apply(lambda x: json.dumps(x) if isinstance(x, types) else x)
-    return s
-
-
-def handle_dtype_series(s: pd.Series):
-    """
-    Casts dtypes as strings.
-    """
-    types = (type, np.dtype)
-    if any(isinstance(v, types) for v in s.dropna().head().values):
-        logger.debug(f"series `{s.name}` has types; converting to strings")
-        s = s.astype(str)
-    return s
-
-
-def handle_interval_series(s: pd.Series) -> pd.Series:
-    types = pd.Interval
-    if any(isinstance(v, types) for v in s.dropna().head().values):
-        logger.debug(f"series `{s.name}` has intervals; converting to left/right")
-        s = s.apply(lambda x: [x.left, x.right] if isinstance(x, types) else x)
-    return s
-
-
-def handle_ip_address_series(s: pd.Series) -> pd.Series:
-    types = (ipaddress.IPv4Address, ipaddress.IPv6Address)
-    if any(isinstance(v, types) for v in s.dropna().head().values):
-        logger.debug(f"series `{s.name}` has ip addresses; converting to strings")
-        s = s.astype(str)
-    return s
-
-
-def handle_sequence_series(s: pd.Series) -> pd.Series:
-    types = (list, tuple, set, np.ndarray)
-    if is_sequence_series(s):
-        logger.debug(f"series `{s.name}` has sequences; converting to comma-separated string")
-        s = s.apply(lambda x: ", ".join([str(val) for val in x] if isinstance(x, types) else x))
-    return s
-
-
-def is_sequence_series(s: pd.Series) -> bool:
-    """
-    Returns True if the series has any list/tuple/set/array values.
-    """
-    if str(s.dtype) != "object":
-        return False
-
-    if any(isinstance(v, (list, tuple, set, np.ndarray)) for v in s.dropna().head().values):
-        return True
-    return False
-
-
-def handle_unk_type_series(s: pd.Series) -> pd.Series:
-    if not is_json_serializable(s):
-        logger.debug(f"series `{s.name}` has non-JSON-serializable types; converting to string")
-        s = s.astype(str)
-    return s
-
-
-def is_json_serializable(s: pd.Series) -> bool:
-    """
-    Returns True if the object can be serialized to JSON.
-    """
-    try:
-        _ = json.dumps(s.dropna().head().values.tolist())
-        return True
-    except (TypeError, OverflowError, UnicodeDecodeError):
-        # these are the main serialization errors we expect
-        return False
-    except ValueError as ve:
-        # ...but we may get here if we have a series with duplicate index values
-        # "ValueError: Series index must be unique for orient='index'"
-        logger.debug(ve)
-        return False
-
-
-def has_numeric_strings(s: pd.Series) -> bool:
-    if not str(s.dtype) == "object":
-        return False
-    for v in s.dropna().head().values:
-        if str(v).isnumeric() or str(v).isdigit() or str(v).isdecimal():
-            return True
-    return False
-
-
-def quick_random_dataframe(
-    num_rows: int = 5,
-    num_cols: int = 2,
-    dtype: str = "float",
-    factor: float = 1.0,
-) -> pd.DataFrame:
-    """
-    Convenience function wrapping `pd.DataFrame(np.random.rand( num_rows, num_columns ))`
-    to create a dataframe of random 0.0-1.0 values.
-    """
-    data = np.random.rand(num_rows, num_cols) * factor
-    df = pd.DataFrame(data)
-    return df.astype(dtype, errors="ignore")
-
-
-def random_dataframe(num_rows: int = 5, **kwargs):  # noqa: C901
-
-    kwargs = kwargs or DX_DATATYPES
-    df = pd.DataFrame(index=list(range(num_rows)))
-
-    if kwargs.get("dtype_column"):
-        df["dtype_column"] = generate_dtype_series(num_rows)
-
-    # numeric columns
-    if kwargs.get("integer_column"):
-        df["integer_column"] = generate_integer_series(num_rows)
-
-    if kwargs.get("float_column"):
-        df["float_column"] = generate_float_series(num_rows)
-
-    if kwargs.get("complex_number_column"):
-        df["complex_number_column"] = generate_complex_number_series(num_rows)
-
-    # date/time columns
-    if kwargs.get("datetime_column"):
-        df["datetime_column"] = date_time.generate_datetime_series(num_rows)
-
-    if kwargs.get("time_delta_column"):
-        df["time_delta_column"] = date_time.generate_time_delta_series(num_rows)
-
-    if kwargs.get("time_period_column"):
-        df["time_period_column"] = date_time.generate_time_period_series(num_rows)
-
-    if kwargs.get("time_interval_column"):
-        df["time_interval_column"] = date_time.generate_time_interval_series(num_rows)
-
-    # string columns
-    if kwargs.get("text_column"):
-        df["text_column"] = generate_text_series(num_rows)
-
-    if kwargs.get("keyword_column"):
-        df["keyword_column"] = generate_keyword_series(num_rows)
-
-    # container columns
-    if kwargs.get("dict_column"):
-        df["dict_column"] = generate_dict_series(num_rows)
-
-    if kwargs.get("list_column"):
-        df["list_column"] = generate_list_series(num_rows)
-
-    if kwargs.get("nested_tabular_column"):
-        df["nested_tabular_column"] = generate_nested_tabular_series(
-            num_rows,
-            float_column=True,
-            keyword_column=True,
-        )
-
-    # geopandas/shapely columns
-    if kwargs.get("latlon_point_column"):
-        df["latlon_point_column"] = geometry.generate_latlon_series(num_rows)
-
-    if kwargs.get("filled_geojson_column"):
-        df["filled_geojson_column"] = geometry.generate_filled_geojson_series(num_rows)
-
-    if kwargs.get("exterior_geojson_column"):
-        df["exterior_geojson_column"] = geometry.generate_exterior_bounds_geojson_series(num_rows)
-
-    # extras
-    if kwargs.get("bytes_column"):
-        df["bytes_column"] = generate_bytes_series(num_rows)
-
-    if kwargs.get("ipv4_address_column"):
-        df["ipv4_address_column"] = generate_ipv4_series(num_rows)
-
-    if kwargs.get("ipv6_address_column"):
-        df["ipv6_address_column"] = generate_ipv6_series(num_rows)
-
-    return df
-
-
-def to_dataframe(obj) -> pd.DataFrame:
-    """
-    Converts an object to a pandas dataframe.
-    """
-    logger.debug(f"converting {type(obj)} to pd.DataFrame")
-
-    # handling for groupby operations returning pd.Series
-    index_reset_name = None
-    if is_groupby_series(obj):
-        orig_index_names = obj.index.names
-        index_reset_name = groupby_series_index_name(obj.index)
-        # this will convert a MultiIndex series to a flat DataFrame
-        obj = obj.reset_index(name=index_reset_name)
-        # ensure we keep the original index structure
-        obj.set_index(orig_index_names, inplace=True)
-
-    df = pd.DataFrame(obj)
-    return df
-
-
-def is_groupby_series(s: pd.Series) -> bool:
-    """
-    Checks if the pd.Series is the result of a groupby operation
-    by checking if the index is a MultiIndex and its name is
-    also used as a level in its index.
-
-    Example:
-
-    df = pd.DataFrame({
-        'foo': list('aaabbcddee'),
-        'bar': np.random.rand(1, 10)[0],
-        'baz': np.random.randint(-10, 10, 10)
-    })
-
-    group = df.groupby('foo').bar.value_counts()
-    print(group)
-    >>> foo  bar
-    a    0.304653    1
-         0.440604    1
-         0.445702    1
-    b    0.164294    1
-         0.296721    1
-    c    0.789996    1
-    d    0.550120    1
-         0.948220    1
-    e    0.223248    1
-         0.664756    1
-    Name: bar, dtype: int64
-
-    print(group.index.names)
-    >>> ['foo', 'bar']
-
-    print(group.name)
-    >>> bar
-    """
-    if not isinstance(s, pd.Series):
-        return False
-    if not isinstance(s.index, pd.MultiIndex):
-        return False
-    return s.name in s.index.names
-
-
-def groupby_series_index_name(index: pd.MultiIndex) -> str:
-    """
-    Creates a name for groupby operations to provide using a .reset_index()
-    based on the dataframe's MultiIndex names.
-
-    Example:
-    - A MultiIndex with level names of ["foo", "bar"] will return "foo.bar.value"
-    """
-    index_trail = ".".join([str(name) for name in index.names])
-    return f"{index_trail}.value"
diff --git a/src/dx/utils/formatting.py b/src/dx/utils/formatting.py
index 29ced5dc..c17a74d3 100644
--- a/src/dx/utils/formatting.py
+++ b/src/dx/utils/formatting.py
@@ -1,12 +1,86 @@
 import pandas as pd
 import structlog
 
+from dx.datatypes import date_time, geometry, misc, numeric
 from dx.settings import settings
-from dx.utils import datatypes, date_time, geometry
 
 logger = structlog.get_logger(__name__)
 
 
+def to_dataframe(obj) -> pd.DataFrame:
+    """
+    Converts an object to a pandas dataframe.
+    """
+    logger.debug(f"converting {type(obj)} to pd.DataFrame")
+
+    # handling for groupby operations returning pd.Series
+    index_reset_name = None
+    if is_groupby_series(obj):
+        orig_index_names = obj.index.names
+        index_reset_name = groupby_series_index_name(obj.index)
+        # this will convert a MultiIndex series to a flat DataFrame
+        obj = obj.reset_index(name=index_reset_name)
+        # ensure we keep the original index structure
+        obj.set_index(orig_index_names, inplace=True)
+
+    df = pd.DataFrame(obj)
+    return df
+
+
+def is_groupby_series(s: pd.Series) -> bool:
+    """
+    Checks if the pd.Series is the result of a groupby operation
+    by checking if the index is a MultiIndex and the series name
+    is also one of the index level names.
+
+    Example:
+
+    df = pd.DataFrame({
+        'foo': list('aaabbcddee'),
+        'bar': np.random.rand(1, 10)[0],
+        'baz': np.random.randint(-10, 10, 10)
+    })
+
+    group = df.groupby('foo').bar.value_counts()
+    print(group)
+    >>> foo  bar
+    a    0.304653    1
+         0.440604    1
+         0.445702    1
+    b    0.164294    1
+         0.296721    1
+    c    0.789996    1
+    d    0.550120    1
+         0.948220    1
+    e    0.223248    1
+         0.664756    1
+    Name: bar, dtype: int64
+
+    print(group.index.names)
+    >>> ['foo', 'bar']
+
+    print(group.name)
+    >>> bar
+    """
+    if not isinstance(s, pd.Series):
+        return False
+    if not isinstance(s.index, pd.MultiIndex):
+        return False
+    return s.name in s.index.names
+
+
+def groupby_series_index_name(index: pd.MultiIndex) -> str:
+    """
+    Creates the column name to pass to .reset_index(name=...) for a groupby
+    series, built from the level names of its MultiIndex.
+
+    Example:
+    - A MultiIndex with level names of ["foo", "bar"] will return "foo.bar.value"
+    """
+    index_trail = ".".join([str(name) for name in index.names])
+    return f"{index_trail}.value"
+
+
 def is_default_index(index: pd.Index) -> bool:
     """
     Returns True if the index have no specified name,
@@ -117,17 +191,20 @@ def clean_column_values(s: pd.Series) -> pd.Series:
     """
     s = date_time.handle_time_period_series(s)
     s = date_time.handle_time_delta_series(s)
+    s = date_time.handle_date_series(s)
+
+    s = numeric.handle_decimal_series(s)
+    s = numeric.handle_complex_number_series(s)
 
-    s = datatypes.handle_dtype_series(s)
-    s = datatypes.handle_interval_series(s)
-    s = datatypes.handle_ip_address_series(s)
-    s = datatypes.handle_complex_number_series(s)
+    s = misc.handle_dtype_series(s)
+    s = misc.handle_interval_series(s)
+    s = misc.handle_ip_address_series(s)
 
     s = geometry.handle_geometry_series(s)
 
-    s = datatypes.handle_dict_series(s)
-    s = datatypes.handle_sequence_series(s)
-    s = datatypes.handle_unk_type_series(s)
+    s = misc.handle_dict_series(s)
+    s = misc.handle_sequence_series(s)
+    s = misc.handle_unk_type_series(s)
     return s
 
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 205c0678..b4c134b0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,9 +5,9 @@
 from IPython.terminal.interactiveshell import TerminalInteractiveShell
 from IPython.testing import tools
 
+from dx.datatypes.main import random_dataframe
 from dx.settings import get_settings
 from dx.types import DEXFilterSettings
-from dx.utils.datatypes import random_dataframe
 from dx.utils.formatting import normalize_index_and_columns
 from dx.utils.tracking import DXDataFrame
 
diff --git a/tests/test_dataresource.py b/tests/test_dataresource.py
index b4ce0ed8..e1bd1249 100644
--- a/tests/test_dataresource.py
+++ b/tests/test_dataresource.py
@@ -2,10 +2,10 @@
 
 import pytest
 
+from dx.datatypes.main import quick_random_dataframe
 from dx.formatters.main import format_output, generate_body
 from dx.formatters.simple import get_dataresource_settings
 from dx.settings import settings_context
-from dx.utils.datatypes import quick_random_dataframe
 
 dataresource_settings = get_dataresource_settings()
 
diff --git a/tests/test_datatype_handling.py b/tests/test_datatype_handling.py
index adbd29be..5e070ef3 100644
--- a/tests/test_datatype_handling.py
+++ b/tests/test_datatype_handling.py
@@ -1,6 +1,6 @@
 import pytest
 
-from dx.utils.datatypes import SORTED_DX_DATATYPES, random_dataframe
+from dx.datatypes.main import SORTED_DX_DATATYPES, random_dataframe
 from dx.utils.formatting import clean_column_values
 
 
diff --git a/tests/test_datatypes.py b/tests/test_datatypes.py
index e235edb3..cd15cbde 100644
--- a/tests/test_datatypes.py
+++ b/tests/test_datatypes.py
@@ -4,55 +4,28 @@
 - hash the dataframe for tracking
 - write to the database for tracking/filtering
 """
+from datetime import datetime
 
 import duckdb
+import numpy as np
 import pandas as pd
 import pytest
 from pandas.io.json import build_table_schema
 from pandas.util import hash_pandas_object
 
-from dx.formatters.main import generate_body
-from dx.settings import settings_context
-from dx.utils.datatypes import (
+from dx.datatypes import date_time, geometry, main, misc, numeric, text
+from dx.datatypes.main import (
     DX_DATATYPES,
     SORTED_DX_DATATYPES,
-    groupby_series_index_name,
     quick_random_dataframe,
     random_dataframe,
-    to_dataframe,
 )
+from dx.formatters.main import generate_body
+from dx.settings import settings_context
 from dx.utils.formatting import clean_column_values
 from dx.utils.tracking import generate_df_hash
 
 
-@pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES)
-def test_df_generator(dtype: str, num_rows: int = 5):
-    params = {dt: False for dt in SORTED_DX_DATATYPES}
-    params[dtype] = True
-    df = random_dataframe(num_rows=num_rows, **params)
-    assert len(df) == num_rows
-    assert isinstance(df[dtype], pd.Series)
-    assert df[dtype].notnull().all()
-
-
-def test_random_dataframe_has_default_data(num_rows: int = 5):
-    df = random_dataframe(num_rows=num_rows)
-    assert len(df) == num_rows
-    default_enabled_columns = [column for column, enabled in DX_DATATYPES.items() if enabled]
-    assert len(df.columns) == len(default_enabled_columns)
-    for col in default_enabled_columns:
-        assert col in df.columns
-        assert df[col].notnull().all()
-
-
-def test_quick_random_dataframe_has_default_data():
-    df = quick_random_dataframe()
-    assert df.shape[0] >= 1
-    assert df.shape[1] >= 1
-    for col in df.columns:
-        assert df[col].notnull().all()
-
-
 @pytest.mark.xfail(reason="only for dev")
 @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES)
 def test_data_types_with_build_table_schema(dtype: str):
@@ -69,8 +42,9 @@ def test_data_types_with_build_table_schema(dtype: str):
     assert isinstance(schema, dict)
 
 
+@pytest.mark.parametrize("display_mode", ["simple", "enhanced"])
 @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES)
-def test_generate_simple_body(dtype: str):
+def test_generate_body(dtype: str, display_mode: str):
     """
     Test that we've correctly handled data types before building the schema and metadata for
     the DXDisplayFormatter.
@@ -79,24 +53,7 @@ def test_generate_simple_body(dtype: str):
     params[dtype] = True
     df = random_dataframe(**params)
     try:
-        with settings_context(display_mode="simple"):
-            payload = generate_body(df)
-    except Exception as e:
-        assert False, f"{dtype} failed with {e}"
-    assert isinstance(payload, dict)
-
-
-@pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES)
-def test_generate_enhanced_body(dtype: str):
-    """
-    Test that we've correctly handled data types before building the schema and metadata for
-    the DXDisplayFormatter.
-    """
-    params = {dt: False for dt in SORTED_DX_DATATYPES}
-    params[dtype] = True
-    df = random_dataframe(**params)
-    try:
-        with settings_context(display_mode="enhanced"):
+        with settings_context(display_mode=display_mode):
             payload = generate_body(df)
     except Exception as e:
         assert False, f"{dtype} failed with {e}"
@@ -140,8 +97,7 @@ def test_generate_df_hash(dtype: str):
 @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES)
 def test_to_sql(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection):
     """
-    DEV: Test which data types pass/fail when passed directly through .to_sql()
-    with the sqlalchemy engine.
+    DEV: Test which data types pass/fail when registered directly to duckdb.
     """
     params = {dt: False for dt in SORTED_DX_DATATYPES}
     params[dtype] = True
@@ -160,7 +116,7 @@ def test_to_sql(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection):
 @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES)
 def test_store_in_db(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection):
     """
-    Test that we've correctly handled data types before storing in sqlite.
+    Test that we've correctly handled data types before storing in duckdb.
     """
     params = {dt: False for dt in SORTED_DX_DATATYPES}
     params[dtype] = True
@@ -179,55 +135,224 @@ def test_store_in_db(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection
     assert num_rows == df.shape[0]
 
 
-def test_series_is_converted(sample_random_dataframe: pd.Series):
-    """
-    Test that a basic conversion from pd.Series to pd.Dataframe
-    keeps the original index and uses the Series name as its only column.
-    """
-    s: pd.Series = sample_random_dataframe.keyword_column
-    df = to_dataframe(s)
-    assert df.index.equals(s.index)
-    assert df.columns[0] == s.name
-
-
-def test_multiindex_series_left_alone(sample_multiindex_series: pd.Series):
-    """
-    Test no renaming is done with a MultiIndex pd.Series if their
-    name doesn't appear in the MultiIndex names.
-    """
-    index = sample_multiindex_series.index
-    df = to_dataframe(sample_multiindex_series)
-    assert df.index.names == index.names
-    assert df.columns[0] == sample_multiindex_series.name
-
-
-def test_groupby_series_resets(sample_groupby_series: pd.Series):
-    """
-    Test we're resetting the index of a pd.Series created from a groupby
-    operation by using the combination of index names.
-    """
-    index = sample_groupby_series.index
-    df = to_dataframe(sample_groupby_series)
-    assert df.index.names == index.names
-    assert df.columns[0] == groupby_series_index_name(index)
-    assert df.columns[0] != sample_groupby_series.name
-
-
-def test_dataframe_index_left_alone(sample_random_dataframe: pd.DataFrame):
-    """
-    Ensure we don't alter the structure of a dataframe during
-    initial dataframe conversion.
-    """
-    df = to_dataframe(sample_random_dataframe)
-    assert df.index.equals(sample_random_dataframe.index)
-    assert df.columns.equals(sample_random_dataframe.columns)
-
-
-def test_groupby_dataframe_index_left_alone(sample_groupby_dataframe: pd.DataFrame):
-    """
-    Ensure we don't alter the structure of a dataframe
-    with MultiIndexes during initial dataframe conversion.
-    """
-    df = to_dataframe(sample_groupby_dataframe)
-    assert df.index.equals(sample_groupby_dataframe.index)
-    assert df.columns.equals(sample_groupby_dataframe.columns)
+class TestDataFrameGeneration:
+    """Basic testing to make sure our dataframe generation provides data with default arguments."""
+
+    @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES)
+    def test_df_generator(self, dtype: str, num_rows: int = 5):
+        params = {dt: False for dt in SORTED_DX_DATATYPES}
+        params[dtype] = True
+        df = random_dataframe(num_rows=num_rows, **params)
+        assert len(df) == num_rows
+        assert isinstance(df[dtype], pd.Series)
+        assert df[dtype].notnull().all()
+
+    def test_random_dataframe_has_default_data(self, num_rows: int = 5):
+        df = random_dataframe(num_rows=num_rows)
+        assert len(df) == num_rows
+        default_enabled_columns = [column for column, enabled in DX_DATATYPES.items() if enabled]
+        assert len(df.columns) == len(default_enabled_columns)
+        for col in default_enabled_columns:
+            # if this fails, that means something was added to DX_DATATYPES that doesn't match
+            # the default arguments of random_dataframe()
+            assert col in df.columns
+            assert df[col].notnull().all()
+
+    def test_quick_random_dataframe_has_default_data(self):
+        df = quick_random_dataframe()
+        assert df.shape[0] >= 1
+        assert df.shape[1] >= 1
+        for col in df.columns:
+            assert df[col].notnull().all()
+
+
+class TestDatatypeHandling:
+    def test_integer_series_left_alone(self):
+        series = numeric.generate_integer_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "int64"
+        assert isinstance(
+            series.values[0], (int, np.int64)
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_float_series_left_alone(self):
+        series = numeric.generate_float_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "float64"
+        assert isinstance(
+            series.values[0], (float, np.float64)
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_boolean_series_left_alone(self):
+        series = misc.generate_boolean_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "bool"
+        assert isinstance(
+            series.values[0], (bool, np.bool_)
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_dtype_series_converted(self):
+        series = misc.generate_dtype_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_decimal_series_converted(self):
+        series = numeric.generate_decimal_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "float64"
+        assert isinstance(
+            series.values[0], (float, np.float64)
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_datetime_series_left_alone(self):
+        series = date_time.generate_datetime_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "datetime64[ns]"
+        assert isinstance(
+            series.values[0], (datetime, np.datetime64)
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_date_series_converted(self):
+        # datetime.date values are converted to pd.Timestamp
+        series = date_time.generate_date_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "datetime64[ns]"
+        assert isinstance(
+            series.values[0], (datetime, np.datetime64)
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_time_series_converted(self):
+        # datetime.time values are converted to strings
+        series = date_time.generate_time_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_timedelta_series_converted(self):
+        # time delta values are converted to floats (total seconds)
+        series = date_time.generate_time_delta_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "float64"
+        assert isinstance(
+            series.values[0], (float, np.float64)
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_time_period_series_converted(self):
+        series = date_time.generate_time_period_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_time_interval_series_converted(self):
+        series = date_time.generate_time_interval_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_text_series_left_alone(self):
+        series = text.generate_text_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_keyword_series_left_alone(self):
+        series = text.generate_keyword_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_dict_series_converted(self):
+        # dictionary values are JSON-stringified
+        series = misc.generate_dict_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_list_series_converted(self):
+        # sequence values are cast as strings
+        series = misc.generate_list_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_nested_tabular_series_converted(self):
+        # lists of dictionaries are JSON-stringified
+        series = main.generate_nested_tabular_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_latlon_point_series_converted(self):
+        # latlon point values are converted to GeoJSON strings
+        series = geometry.generate_latlon_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_filled_geojson_series_converted(self):
+        # shapely.geometry values are converted to GeoJSON strings
+        # by handle_geometry_series()
+        series = geometry.generate_filled_geojson_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_exterior_bounds_geojson_series_converted(self):
+        # shapely.geometry exterior values are converted to GeoJSON strings
+        # by handle_geometry_series()
+        series = geometry.generate_exterior_bounds_geojson_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_bytes_series_converted(self):
+        # bytes values are converted to strings
+        series = misc.generate_bytes_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_ipv4_address_series_converted(self):
+        # IPv4Address values are converted to strings
+        series = misc.generate_ipv4_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
+
+    def test_ipv6_address_series_converted(self):
+        # IPv6Address values are converted to strings
+        series = misc.generate_ipv6_series(5)
+        series = clean_column_values(series)
+        assert series.dtype == "object"
+        assert isinstance(
+            series.values[0], str
+        ), f"cleaned series value is {type(series.values[0])}"
diff --git a/tests/test_dx.py b/tests/test_dx.py
index 59571bed..d45044a5 100644
--- a/tests/test_dx.py
+++ b/tests/test_dx.py
@@ -2,10 +2,10 @@
 
 import pytest
 
+from dx.datatypes.main import quick_random_dataframe
 from dx.formatters.enhanced import get_dx_settings
 from dx.formatters.main import format_output, generate_body
 from dx.settings import settings_context
-from dx.utils.datatypes import quick_random_dataframe
 
 dx_settings = get_dx_settings()
 
diff --git a/tests/test_formatting.py b/tests/test_formatting.py
index 78ae441f..1979ca31 100644
--- a/tests/test_formatting.py
+++ b/tests/test_formatting.py
@@ -7,7 +7,7 @@
 from dx.formatters.main import DXDisplayFormatter, generate_body, handle_format
 from dx.formatters.simple import get_dataresource_settings
 from dx.settings import get_settings, settings_context
-from dx.utils.formatting import normalize_index_and_columns
+from dx.utils.formatting import groupby_series_index_name, normalize_index_and_columns, to_dataframe
 from dx.utils.tracking import DXDF_CACHE
 
 dataresource_settings = get_dataresource_settings()
@@ -365,3 +365,54 @@ def test_sample_resampled_multi_groupby_dataframe(self, sample_random_dataframe:
         assert "keyword_column.value" in clean_df.columns
         assert "integer_column" not in clean_df.columns
         assert "integer_column.value" in clean_df.columns
+
+
+class TestDataFrameConversion:
+    def test_series_is_converted(self, sample_random_dataframe: pd.DataFrame):
+        """
+        Test that a basic conversion from pd.Series to pd.DataFrame
+        keeps the original index and uses the Series name as its only column.
+        """
+        s: pd.Series = sample_random_dataframe.keyword_column
+        df = to_dataframe(s)
+        assert df.index.equals(s.index)
+        assert df.columns[0] == s.name
+
+    def test_multiindex_series_left_alone(self, sample_multiindex_series: pd.Series):
+        """
+        Test that no renaming is done for a MultiIndex pd.Series whose
+        name doesn't appear in the MultiIndex names.
+        """
+        index = sample_multiindex_series.index
+        df = to_dataframe(sample_multiindex_series)
+        assert df.index.names == index.names
+        assert df.columns[0] == sample_multiindex_series.name
+
+    def test_groupby_series_resets(self, sample_groupby_series: pd.Series):
+        """
+        Test we're resetting the index of a pd.Series created from a groupby
+        operation by using the combination of index names.
+        """
+        index = sample_groupby_series.index
+        df = to_dataframe(sample_groupby_series)
+        assert df.index.names == index.names
+        assert df.columns[0] == groupby_series_index_name(index)
+        assert df.columns[0] != sample_groupby_series.name
+
+    def test_dataframe_index_left_alone(self, sample_random_dataframe: pd.DataFrame):
+        """
+        Ensure we don't alter the structure of a dataframe during
+        initial dataframe conversion.
+        """
+        df = to_dataframe(sample_random_dataframe)
+        assert df.index.equals(sample_random_dataframe.index)
+        assert df.columns.equals(sample_random_dataframe.columns)
+
+    def test_groupby_dataframe_index_left_alone(self, sample_groupby_dataframe: pd.DataFrame):
+        """
+        Ensure we don't alter the structure of a dataframe
+        with MultiIndexes during initial dataframe conversion.
+        """
+        df = to_dataframe(sample_groupby_dataframe)
+        assert df.index.equals(sample_groupby_dataframe.index)
+        assert df.columns.equals(sample_groupby_dataframe.columns)