From 816a165a7f27d599c3171de97181cb95e74d6593 Mon Sep 17 00:00:00 2001 From: Dave Shoup Date: Thu, 13 Oct 2022 17:01:53 -0400 Subject: [PATCH] Closes #47 - updated datatype handling - Decimal, datetime.date, datetime.time (#70) * add Decimal handler and generator functions; clean up random_dataframe() arguments and add decimal_column/date_column/time_column * add datetime.date and datetime.time generators and handlers * check for and handle decimals and datetime.dates by default * return gpd.GeoSeries instead of GeometryArray * add boolean series generator option * add datatype imports with new directory structure * ignore flake8 C901 - "too complex" * add datatype compatibility helpers * add optional with_ipython_display argument to prevent calling IPython.display() on an object that goes through handle_format() --- setup.cfg | 2 + src/dx/__init__.py | 1 + src/dx/datatypes/__init__.py | 7 + src/dx/datatypes/compatibility.py | 167 ++++++++++ src/dx/{utils => datatypes}/date_time.py | 36 +++ src/dx/{utils => datatypes}/geometry.py | 2 +- src/dx/datatypes/main.py | 177 +++++++++++ src/dx/datatypes/misc.py | 140 ++++++++ src/dx/datatypes/numeric.py | 43 +++ src/dx/datatypes/text.py | 31 ++ src/dx/formatters/main.py | 39 ++- src/dx/utils/__init__.py | 3 - src/dx/utils/datatypes.py | 387 ----------------------- src/dx/utils/formatting.py | 93 +++++- tests/conftest.py | 2 +- tests/test_dataresource.py | 2 +- tests/test_datatype_handling.py | 2 +- tests/test_datatypes.py | 339 +++++++++++++------- tests/test_dx.py | 2 +- tests/test_formatting.py | 53 +++- 20 files changed, 1005 insertions(+), 523 deletions(-) create mode 100644 src/dx/datatypes/__init__.py create mode 100644 src/dx/datatypes/compatibility.py rename src/dx/{utils => datatypes}/date_time.py (69%) rename src/dx/{utils => datatypes}/geometry.py (98%) create mode 100644 src/dx/datatypes/main.py create mode 100644 src/dx/datatypes/misc.py create mode 100644 src/dx/datatypes/numeric.py create mode 100644 src/dx/datatypes/text.py delete mode 100644 src/dx/utils/datatypes.py diff --git a/setup.cfg b/setup.cfg index 501a9624..f1f9c9b8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,8 @@ select = # docstrings must be triple-quoted, via flake8-docstrings D300 ignore = + # "Too complex" + C901, # Extra space in brackets E20, E203, diff --git a/src/dx/__init__.py b/src/dx/__init__.py index 342b7ecd..cf5a5a73 100644 --- a/src/dx/__init__.py +++ b/src/dx/__init__.py @@ -1,4 +1,5 @@ from .comms import * +from .datatypes import * from .dx import * from .formatters import * from .loggers import * diff --git a/src/dx/datatypes/__init__.py b/src/dx/datatypes/__init__.py new file mode 100644 index 00000000..4b2cd633 --- /dev/null +++ b/src/dx/datatypes/__init__.py @@ -0,0 +1,7 @@ +from .compatibility import * +from .date_time import * +from .geometry import * +from .main import * +from .misc import * +from .numeric import * +from .text import * diff --git a/src/dx/datatypes/compatibility.py b/src/dx/datatypes/compatibility.py new file mode 100644 index 00000000..6ef53a84 --- /dev/null +++ b/src/dx/datatypes/compatibility.py @@ -0,0 +1,167 @@ +import traceback +from typing import Any + +import pandas as pd +from pandas.io.json import build_table_schema + +from dx.settings import get_settings + +settings = get_settings() + + +def test_compatibility(value: Any, as_dataframe: bool = True) -> dict: + """ + Convenience function to test the compatibility of a given object + with the different steps involved with the dx display modes. + - pandas.io.json.build_table_schema (https://github.com/pandas-dev/pandas/blob/main/pandas/io/json/_table_schema.py) + - jupyter_client.jsonutil.json_clean (https://github.com/jupyter/jupyter_client/blob/main/jupyter_client/jsonutil.py) + - duckdb conn.register + - final dx output type + """ + result = {} + result.update(test_build_table_schema(value)) + result.update(test_json_clean(value)) + result.update(test_db_write(value)) + result.update(test_dx_handling(value)) + if as_dataframe: + return pd.DataFrame(result).transpose() + return result + + +def test_build_table_schema(value: Any, as_dataframe: bool = False) -> dict: + """ + Convenience function to test the compatibility of a given object + with the pandas.io.json.build_table_schema function, which + is called to set up the initial column schema during dx formatting. + """ + df = pd.DataFrame({"test": [value]}) + result = {} + + try: + schema = build_table_schema(df, index=False) + fields = schema["fields"] + field_type = [ + field_schema["type"] for field_schema in fields if field_schema["name"] == "test" + ][0] + result["pandas.io.json.build_table_schema"] = { + "success": True, + "type": field_type, + } + except Exception as e: + result["pandas.io.json.build_table_schema"] = { + "error": str(e), + "success": False, + "traceback": traceback.format_exc(), + } + + if as_dataframe: + return pd.DataFrame(result).transpose() + return result + + +def test_json_clean(value: Any, as_dataframe: bool = False) -> dict: + """ + Convenience function to test the compatibility of a given object + with the jupyter_client.jsonutil.json_clean function, which + is called during IPython.display after dx formatting. + """ + df = pd.DataFrame({"test": [value]}) + result = {} + + try: + from jupyter_client.jsonutil import json_clean + + clean_json = json_clean(df.to_dict("records")) + clean_json_value = clean_json[0]["test"] + result["jupyter_client.jsonutil.json_clean"] = { + "success": True, + "type": type(clean_json_value), + "value": clean_json_value, + } + except Exception as e: + result["jupyter_client.jsonutil.json_clean"] = { + "error": str(e), + "success": False, + "traceback": traceback.format_exc(), + } + + if as_dataframe: + return pd.DataFrame(result).transpose() + return result + + +def test_db_write(value: Any, as_dataframe: bool = False) -> dict: + """ + Convenience function to test the compatibility of a given object + inside a pandas DataFrame during registration with a duckdb connection, + which is used during Datalink-enabled dataframe tracking for + push-down filtering. + """ + from dx.utils.tracking import get_db_connection # circular import + + df = pd.DataFrame({"test": [value]}) + result = {} + + db_connection = get_db_connection() + try: + db_connection.register("test", df) + db_df = db_connection.execute("SELECT * FROM test").df() + db_df_value = db_df.iloc[0]["test"] + result["duckdb.conn.register"] = { + "type": type(db_df_value), + "success": True, + "value": db_df_value, + } + except Exception as e: + result["duckdb.conn.register"] = { + "error": str(e), + "success": False, + "traceback": traceback.format_exc(), + } + + if as_dataframe: + return pd.DataFrame(result).transpose() + return result + + +def test_dx_handling(value: Any, as_dataframe: bool = False) -> dict: + """ + Convenience function to test the compatibility of a given object + inside a pandas DataFrame through the entire dx formatting + and data type handling process + """ + from dx.formatters.main import handle_format # circular import + + df = pd.DataFrame({"test": [value]}) + result = {} + + try: + payload, _ = handle_format(df, with_ipython_display=False) + + if settings.DISPLAY_MODE == "simple": + dx_value = payload[settings.MEDIA_TYPE]["data"][0]["test"] + if settings.DISPLAY_MODE == "enhanced": + dx_value = payload[settings.MEDIA_TYPE]["data"][0][0] + + dx_schema_fields = payload[settings.MEDIA_TYPE]["schema"]["fields"] + # should only be two fields here by default: `index` and `test` + # but we wanted to run the entire formatting process, which doesn't need + # an option to disable `index` from being included + dx_schema_type = [field["type"] for field in dx_schema_fields if field["name"] == "test"][0] + + result["dx.handle_format"] = { + "type": type(dx_value), + "success": True, + "value": dx_value, + "schema_type": dx_schema_type, + } + except Exception as e: + result["dx.handle_format"] = { + "error": str(e), + "success": False, + "traceback": traceback.format_exc(), + } + + if as_dataframe: + return pd.DataFrame(result).transpose() + return result diff --git a/src/dx/utils/date_time.py b/src/dx/datatypes/date_time.py similarity index 69% rename from src/dx/utils/date_time.py rename to src/dx/datatypes/date_time.py index 4038ba27..73e119c1 100644 --- a/src/dx/utils/date_time.py +++ b/src/dx/datatypes/date_time.py @@ -21,6 +21,24 @@ def generate_datetime_series(num_rows: int) -> pd.Series: ) +def generate_date_series(num_rows: int) -> pd.Series: + return pd.Series( + [ + (pd.Timestamp("now") + pd.Timedelta(f"{np.random.randint(-1000, 1000)} hours")).date() + for _ in range(num_rows) + ] + ) + + +def generate_time_series(num_rows: int) -> pd.Series: + return pd.Series( + [ + (pd.Timestamp("now") + pd.Timedelta(f"{np.random.randint(-1000, 1000)} hours")).time() + for _ in range(num_rows) + ] + ) + + def generate_time_period_series(num_rows: int) -> pd.Series: return pd.Series( [ @@ -70,6 +88,24 @@ def handle_time_delta_series(s: pd.Series) -> pd.Series: return s +def handle_date_series(s: pd.Series) -> pd.Series: + types = (datetime.date,) + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug( + f"series `{s.name}` has datetime.date values; converting with pd.to_datetime()" + ) + s = pd.to_datetime(s) + return s + + +def handle_time_series(s: pd.Series) -> pd.Series: + types = (datetime.time,) + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug(f"series `{s.name}` has datetime.time values; converting to string") + s = s.astype(str) + return s + + def is_datetime_series(s: pd.Series) -> bool: if str(s.dtype) in ("int", "float", "bool", "category", "period", "interval"): return False diff --git a/src/dx/utils/geometry.py b/src/dx/datatypes/geometry.py similarity index 98% rename from src/dx/utils/geometry.py rename to src/dx/datatypes/geometry.py index 83d65631..324c27e3 100644 --- a/src/dx/utils/geometry.py +++ b/src/dx/datatypes/geometry.py @@ -25,7 +25,7 @@ def generate_latlon_series(num_rows: int): lats = [random.randint(-90, 89) + np.random.rand() for _ in range(num_rows)] lons = [random.randint(-180, 179) + np.random.rand() for _ in range(num_rows)] - return gpd.points_from_xy(lons, lats) + return gpd.GeoSeries(gpd.points_from_xy(lons, lats)) def generate_filled_geojson_series( diff --git a/src/dx/datatypes/main.py b/src/dx/datatypes/main.py new file mode 100644 index 00000000..00379f9c --- /dev/null +++ b/src/dx/datatypes/main.py @@ -0,0 +1,177 @@ +import numpy as np +import pandas as pd +import structlog + +from dx.datatypes import date_time, geometry, misc, numeric, text + +logger = structlog.get_logger(__name__) + +# this is primarily used for testing to match the optional +# data types used for random dataframe generation, +# and should match the keyword arguments available in `random_dataframe()`` +DX_DATATYPES = { + "dtype_column": True, + "integer_column": True, + "float_column": True, + "bool_column": False, + "decimal_column": False, + "datetime_column": True, + "date_column": False, + "time_column": False, + "time_delta_column": False, + "time_period_column": False, + "time_interval_column": False, + "text_column": False, + "keyword_column": True, + "dict_column": False, + "list_column": False, + "nested_tabular_column": False, + "latlon_point_column": False, + "filled_geojson_column": False, + "exterior_geojson_column": False, + "bytes_column": True, + "ipv4_address_column": False, + "ipv6_address_column": False, + "complex_number_column": False, +} +# specifically used for pytest.mark.parametrize ordering +SORTED_DX_DATATYPES = sorted(list(DX_DATATYPES.keys())) + + +def quick_random_dataframe( + num_rows: int = 5, + num_cols: int = 2, + dtype: str = "float", + factor: float = 1.0, +) -> pd.DataFrame: + """ + Convenience function wrapping `pd.DataFrame(np.random.rand( num_rows, num_columns ))` + to create a dataframe of random 0.0-1.0 values. + """ + data = np.random.rand(num_rows, num_cols) * factor + df = pd.DataFrame(data) + return df.astype(dtype, errors="ignore") + + +def random_dataframe( + num_rows: int = 5, + dtype_column: bool = True, + integer_column: bool = True, + float_column: bool = True, + bool_column: bool = False, + decimal_column: bool = False, + datetime_column: bool = True, + date_column: bool = False, + time_column: bool = False, + time_delta_column: bool = False, + time_period_column: bool = False, + time_interval_column: bool = False, + text_column: bool = False, + keyword_column: bool = True, + dict_column: bool = False, + list_column: bool = False, + nested_tabular_column: bool = False, + latlon_point_column: bool = False, + filled_geojson_column: bool = False, + exterior_geojson_column: bool = False, + bytes_column: bool = True, + ipv4_address_column: bool = False, + ipv6_address_column: bool = False, + complex_number_column: bool = False, +): # noqa: C901 + """ + Convenience function to generate a dataframe of `num_rows` length + with mixed data types. + """ + df = pd.DataFrame(index=list(range(num_rows))) + + if dtype_column: + df["dtype_column"] = misc.generate_dtype_series(num_rows) + + if bool_column: + df["bool_column"] = misc.generate_boolean_series(num_rows) + + # numeric columns + if integer_column: + df["integer_column"] = numeric.generate_integer_series(num_rows) + + if float_column: + df["float_column"] = numeric.generate_float_series(num_rows) + + if decimal_column: + df["decimal_column"] = numeric.generate_decimal_series(num_rows) + + if complex_number_column: + df["complex_number_column"] = numeric.generate_complex_number_series(num_rows) + + # date/time columns + if datetime_column: + df["datetime_column"] = date_time.generate_datetime_series(num_rows) + + if date_column: + df["date_column"] = date_time.generate_date_series(num_rows) + + if time_column: + df["time_column"] = date_time.generate_time_series(num_rows) + + if time_delta_column: + df["time_delta_column"] = date_time.generate_time_delta_series(num_rows) + + if time_period_column: + df["time_period_column"] = date_time.generate_time_period_series(num_rows) + + if time_interval_column: + df["time_interval_column"] = date_time.generate_time_interval_series(num_rows) + + # string columns + if text_column: + df["text_column"] = text.generate_text_series(num_rows) + + if keyword_column: + df["keyword_column"] = text.generate_keyword_series(num_rows) + + # container columns + if dict_column: + df["dict_column"] = misc.generate_dict_series(num_rows) + + if list_column: + df["list_column"] = misc.generate_list_series(num_rows) + + if nested_tabular_column: + df["nested_tabular_column"] = generate_nested_tabular_series( + num_rows, + float_column=True, + keyword_column=True, + ) + + # geopandas/shapely columns + if latlon_point_column: + df["latlon_point_column"] = geometry.generate_latlon_series(num_rows) + + if filled_geojson_column: + df["filled_geojson_column"] = geometry.generate_filled_geojson_series(num_rows) + + if exterior_geojson_column: + df["exterior_geojson_column"] = geometry.generate_exterior_bounds_geojson_series(num_rows) + + # extras + if bytes_column: + df["bytes_column"] = misc.generate_bytes_series(num_rows) + + if ipv4_address_column: + df["ipv4_address_column"] = misc.generate_ipv4_series(num_rows) + + if ipv6_address_column: + df["ipv6_address_column"] = misc.generate_ipv6_series(num_rows) + + return df + + +# not adding this to datatypes/misc.py due to circular import +def generate_nested_tabular_series(num_rows: int, num_nested_rows: int = 5, **kwargs) -> pd.Series: + return pd.Series( + [ + random_dataframe(num_rows=num_nested_rows, **kwargs).to_dict("records") + for _ in range(num_rows) + ] + ) diff --git a/src/dx/datatypes/misc.py b/src/dx/datatypes/misc.py new file mode 100644 index 00000000..fdbd22f8 --- /dev/null +++ b/src/dx/datatypes/misc.py @@ -0,0 +1,140 @@ +import ipaddress +import json +import random + +import numpy as np +import pandas as pd +import structlog + +logger = structlog.get_logger(__name__) + + +### Generator helper functions ### +def generate_boolean_series(num_rows: int) -> pd.Series: + return pd.Series([random.choice([True, False]) for _ in range(num_rows)]) + + +def generate_dtype_series(num_rows: int) -> pd.Series: + return pd.Series( + [random.choice([float, int, str, bool, set, tuple, dict, list]) for _ in range(num_rows)] + ) + + +def generate_dict_series(num_rows: int) -> pd.Series: + return pd.Series( + [ + { + "nested_property": random.choice(["apple", "banana", "orange", "pear"]), + "nested_other_property": random.randint(0, 10), + "nested_bool": random.choice([True, False]), + } + for _ in range(num_rows) + ] + ) + + +def generate_list_series(num_rows: int) -> pd.Series: + return pd.Series([[random.randint(0, 5) for _ in range(5)] for _ in range(num_rows)]) + + +def generate_bytes_series(num_rows: int, n_bytes: int = 10) -> pd.Series: + return pd.Series([np.random.bytes(n_bytes) for _ in range(num_rows)]) + + +def generate_ipv4_series(num_rows: int) -> pd.Series: + def random_ipv4(): + address_str = ".".join(str(random.randint(0, 255)) for _ in range(4)) + return ipaddress.ip_address(address_str) + + return pd.Series([random_ipv4() for _ in range(num_rows)]) + + +def generate_ipv6_series(num_rows: int) -> pd.Series: + def random_ipv6(): + address_str = ":".join( + str(hex(random.randint(0, 65_535))).replace("0x", "") for _ in range(8) + ) + return ipaddress.ip_address(address_str) + + return pd.Series([random_ipv6() for _ in range(num_rows)]) + + +### Handler helper functions ### +def handle_dict_series(s: pd.Series) -> pd.Series: + types = dict + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug(f"series `{s.name}` has dicts; converting to json string") + s = s.apply(lambda x: json.dumps(x) if isinstance(x, types) else x) + return s + + +def handle_dtype_series(s: pd.Series): + """ + Casts dtypes as strings. + """ + types = (type, np.dtype) + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug(f"series `{s.name}` has types; converting to strings") + s = s.astype(str) + return s + + +def handle_interval_series(s: pd.Series) -> pd.Series: + types = pd.Interval + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug(f"series `{s.name}` has intervals; converting to left/right") + s = s.apply(lambda x: [x.left, x.right] if isinstance(x, types) else x) + return s + + +def handle_ip_address_series(s: pd.Series) -> pd.Series: + types = (ipaddress.IPv4Address, ipaddress.IPv6Address) + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug(f"series `{s.name}` has ip addresses; converting to strings") + s = s.astype(str) + return s + + +def handle_sequence_series(s: pd.Series) -> pd.Series: + types = (list, tuple, set, np.ndarray) + if is_sequence_series(s): + logger.debug(f"series `{s.name}` has sequences; converting to comma-separated string") + s = s.apply(lambda x: ", ".join([str(val) for val in x] if isinstance(x, types) else x)) + return s + + +def handle_unk_type_series(s: pd.Series) -> pd.Series: + if not is_json_serializable(s): + logger.debug(f"series `{s.name}` has non-JSON-serializable types; converting to string") + s = s.astype(str) + return s + + +### Type checking helper functions ### +def is_sequence_series(s: pd.Series) -> bool: + """ + Returns True if the series has any list/tuple/set/array values. + """ + if str(s.dtype) != "object": + return False + + if any(isinstance(v, (list, tuple, set, np.ndarray)) for v in s.dropna().head().values): + return True + return False + + +def is_json_serializable(s: pd.Series) -> bool: + """ + Returns True if the object can be serialized to JSON. + """ + try: + _ = json.dumps(s.dropna().head().values.tolist()) + return True + except (TypeError, OverflowError, UnicodeDecodeError): + # these are the main serialization errors we expect + return False + except ValueError as ve: + # ...but we may get here if we have a series with duplicate index values + # "ValueError: Series index must be unique for orient='index'" + logger.debug(ve) + return False diff --git a/src/dx/datatypes/numeric.py b/src/dx/datatypes/numeric.py new file mode 100644 index 00000000..788d73c1 --- /dev/null +++ b/src/dx/datatypes/numeric.py @@ -0,0 +1,43 @@ +from decimal import Decimal + +import numpy as np +import pandas as pd +import structlog + +logger = structlog.get_logger(__name__) + + +### Generator helper functions ### +def generate_integer_series(num_rows: int) -> pd.Series: + return pd.Series([np.random.randint(-100, 100) for _ in range(num_rows)]) + + +def generate_float_series(num_rows: int) -> pd.Series: + return pd.Series([np.random.rand() for _ in range(num_rows)]) + + +def generate_decimal_series(num_rows: int) -> pd.Series: + return pd.Series([Decimal(np.random.rand()) for _ in range(num_rows)]) + + +def generate_complex_number_series(num_rows: int) -> pd.Series: + return pd.Series( + [complex(real=np.random.rand(), imag=np.random.rand()) for _ in range(num_rows)] + ) + + +### Handler helper functions ### +def handle_complex_number_series(s: pd.Series) -> pd.Series: + types = (complex, np.complex) + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug(f"series `{s.name}` has complex numbers; converting to real/imag string") + s = s.apply(lambda x: f"{x.real}+{x.imag}j" if isinstance(x, types) else x) + return s + + +def handle_decimal_series(s: pd.Series) -> pd.Series: + types = (Decimal,) + if any(isinstance(v, types) for v in s.dropna().head().values): + logger.debug(f"series `{s.name}` has Decimals; converting to float") + s = s.astype(float) + return s diff --git a/src/dx/datatypes/text.py b/src/dx/datatypes/text.py new file mode 100644 index 00000000..c40271fe --- /dev/null +++ b/src/dx/datatypes/text.py @@ -0,0 +1,31 @@ +import random +import string + +import numpy as np +import pandas as pd +import structlog + +try: + from faker import Faker + + fake = Faker() + FAKER_INSTALLED = True +except ImportError: + FAKER_INSTALLED = False + + +logger = structlog.get_logger(__name__) + + +def generate_text_series(num_rows: int) -> pd.Series: + if not FAKER_INSTALLED: + logger.warning("faker is not installed, skipping text_column") + return np.nan + + return pd.Series([fake.text() for _ in range(num_rows)]) + + +def generate_keyword_series(num_rows: int, num_letters: int = 2) -> pd.Series: + return pd.Series( + ["".join(random.sample(string.ascii_uppercase, num_letters)) for _ in range(num_rows)] + ) diff --git a/src/dx/formatters/main.py b/src/dx/formatters/main.py index 20078265..4aa26d36 100644 --- a/src/dx/formatters/main.py +++ b/src/dx/formatters/main.py @@ -12,8 +12,12 @@ from dx.sampling import get_df_dimensions, sample_if_too_big from dx.settings import settings from dx.types import DXDisplayMode -from dx.utils.datatypes import to_dataframe -from dx.utils.formatting import generate_metadata, is_default_index, normalize_index_and_columns +from dx.utils.formatting import ( + generate_metadata, + is_default_index, + normalize_index_and_columns, + to_dataframe, +) from dx.utils.tracking import DXDF_CACHE, SUBSET_TO_DISPLAY_ID, DXDataFrame, get_db_connection logger = structlog.get_logger(__name__) @@ -36,6 +40,7 @@ def datalink_processing( df: pd.DataFrame, default_index_used: bool, ipython_shell: Optional[InteractiveShell] = None, + with_ipython_display: bool = True, ): dxdf = DXDataFrame(df) logger.debug(f"{dxdf=}") @@ -52,6 +57,7 @@ def datalink_processing( update=parent_display_id, display_id=dxdf.display_id, has_default_index=default_index_used, + with_ipython_display=with_ipython_display, ) # this needs to happen after sending to the frontend @@ -65,6 +71,7 @@ def datalink_processing( def handle_format( obj, + with_ipython_display: bool = True, ipython_shell: Optional[InteractiveShell] = None, ): ipython = ipython_shell or get_ipython() @@ -81,6 +88,7 @@ def handle_format( payload, metadata = format_output( obj, has_default_index=default_index_used, + with_ipython_display=with_ipython_display, ) return payload, metadata @@ -89,11 +97,16 @@ def handle_format( obj, default_index_used, ipython_shell=ipython, + with_ipython_display=with_ipython_display, ) except Exception as e: logger.debug(f"Error in datalink_processing: {e}") # fall back to default processing - payload, metadata = format_output(obj, has_default_index=default_index_used) + payload, metadata = format_output( + obj, + has_default_index=default_index_used, + with_ipython_display=with_ipython_display, + ) return payload, metadata @@ -148,6 +161,7 @@ def format_output( update: bool = False, display_id: Optional[str] = None, has_default_index: bool = True, + with_ipython_display: bool = True, ) -> tuple: display_id = display_id or str(uuid.uuid4()) @@ -169,14 +183,15 @@ def format_output( metadata = {settings.MEDIA_TYPE: metadata} # this needs to happen so we can update by display_id as needed - with pd.option_context("html.table_schema", settings.HTML_TABLE_SCHEMA): - logger.debug(f"displaying {settings.MEDIA_TYPE} payload in {display_id=}") - ipydisplay( - payload, - raw=True, - metadata=metadata, - display_id=display_id, - update=update, - ) + if with_ipython_display: + with pd.option_context("html.table_schema", settings.HTML_TABLE_SCHEMA): + logger.debug(f"displaying {settings.MEDIA_TYPE} payload in {display_id=}") + ipydisplay( + payload, + raw=True, + metadata=metadata, + display_id=display_id, + update=update, + ) return (payload, metadata) diff --git a/src/dx/utils/__init__.py b/src/dx/utils/__init__.py index 8123c46a..aa200cde 100644 --- a/src/dx/utils/__init__.py +++ b/src/dx/utils/__init__.py @@ -1,5 +1,2 @@ -from .datatypes import * -from .date_time import * from .formatting import * -from .geometry import * from .tracking import * diff --git a/src/dx/utils/datatypes.py b/src/dx/utils/datatypes.py deleted file mode 100644 index 7cfaafbc..00000000 --- a/src/dx/utils/datatypes.py +++ /dev/null @@ -1,387 +0,0 @@ -import ipaddress -import json -import random -import string - -import numpy as np -import pandas as pd -import structlog - -from dx.utils import date_time, geometry - -try: - from faker import Faker - - fake = Faker() - FAKER_INSTALLED = True -except ImportError: - FAKER_INSTALLED = False - - -logger = structlog.get_logger(__name__) - - -DX_DATATYPES = { - "dtype_column": True, - "integer_column": True, - "float_column": True, - "datetime_column": True, - "time_delta_column": False, - "time_period_column": False, - "time_interval_column": False, - "text_column": False, - "keyword_column": True, - "dict_column": False, - "list_column": False, - "nested_tabular_column": False, - "latlon_point_column": False, - "filled_geojson_column": False, - "exterior_geojson_column": False, - "bytes_column": True, - "ipv4_address_column": False, - "ipv6_address_column": False, - "complex_number_column": False, -} -SORTED_DX_DATATYPES = sorted(list(DX_DATATYPES.keys())) - - -def generate_integer_series(num_rows: int) -> pd.Series: - return pd.Series([np.random.randint(-100, 100) for _ in range(num_rows)]) - - -def generate_float_series(num_rows: int) -> pd.Series: - return pd.Series([np.random.rand() for _ in range(num_rows)]) - - -def generate_complex_number_series(num_rows: int) -> pd.Series: - return pd.Series( - [complex(real=np.random.rand(), imag=np.random.rand()) for _ in range(num_rows)] - ) - - -def generate_dtype_series(num_rows: int) -> pd.Series: - return pd.Series( - [random.choice([float, int, str, bool, set, tuple, dict, list]) for _ in range(num_rows)] - ) - - -def generate_text_series(num_rows: int) -> pd.Series: - if not FAKER_INSTALLED: - logger.warning("faker is not installed, skipping text_column") - return np.nan - - return pd.Series([fake.text() for _ in range(num_rows)]) - - -def generate_keyword_series(num_rows: int, num_letters: int = 2) -> pd.Series: - return pd.Series( - ["".join(random.sample(string.ascii_uppercase, num_letters)) for _ in range(num_rows)] - ) - - -def generate_dict_series(num_rows: int) -> pd.Series: - return pd.Series( - [ - { - "nested_property": random.choice(["apple", "banana", "orange", "pear"]), - "nested_other_property": random.randint(0, 10), - "nested_bool": random.choice([True, False]), - } - for _ in range(num_rows) - ] - ) - - -def generate_list_series(num_rows: int) -> pd.Series: - return pd.Series([[random.randint(0, 5) for _ in range(5)] for _ in range(num_rows)]) - - -def generate_bytes_series(num_rows: int, n_bytes: int = 10) -> pd.Series: - return pd.Series([np.random.bytes(n_bytes) for _ in range(num_rows)]) - - -def generate_nested_tabular_series(num_rows: int, num_nested_rows: int = 5, **kwargs) -> pd.Series: - return pd.Series( - [ - random_dataframe(num_rows=num_nested_rows, **kwargs).to_dict("records") - for _ in range(num_rows) - ] - ) - - -def generate_ipv4_series(num_rows: int) -> pd.Series: - def random_ipv4(): - address_str = ".".join(str(random.randint(0, 255)) for _ in range(4)) - return ipaddress.ip_address(address_str) - - return pd.Series([random_ipv4() for _ in range(num_rows)]) - - -def generate_ipv6_series(num_rows: int) -> pd.Series: - def random_ipv6(): - address_str = ":".join( - str(hex(random.randint(0, 65_535))).replace("0x", "") for _ in range(8) - ) - return ipaddress.ip_address(address_str) - - return pd.Series([random_ipv6() for _ in range(num_rows)]) - - -def handle_complex_number_series(s: pd.Series) -> pd.Series: - types = (complex, np.complex) - if any(isinstance(v, types) for v in s.dropna().head().values): - logger.debug(f"series `{s.name}` has complex numbers; converting to real/imag string") - s = s.apply(lambda x: f"{x.real}+{x.imag}j" if isinstance(x, types) else x) - return s - - -def handle_dict_series(s: pd.Series) -> pd.Series: - types = dict - if any(isinstance(v, types) for v in s.dropna().head().values): - logger.debug(f"series `{s.name}` has dicts; converting to json string") - s = s.apply(lambda x: json.dumps(x) if isinstance(x, types) else x) - return s - - -def handle_dtype_series(s: pd.Series): - """ - Casts dtypes as strings. - """ - types = (type, np.dtype) - if any(isinstance(v, types) for v in s.dropna().head().values): - logger.debug(f"series `{s.name}` has types; converting to strings") - s = s.astype(str) - return s - - -def handle_interval_series(s: pd.Series) -> pd.Series: - types = pd.Interval - if any(isinstance(v, types) for v in s.dropna().head().values): - logger.debug(f"series `{s.name}` has intervals; converting to left/right") - s = s.apply(lambda x: [x.left, x.right] if isinstance(x, types) else x) - return s - - -def handle_ip_address_series(s: pd.Series) -> pd.Series: - types = (ipaddress.IPv4Address, ipaddress.IPv6Address) - if any(isinstance(v, types) for v in s.dropna().head().values): - logger.debug(f"series `{s.name}` has ip addresses; converting to strings") - s = s.astype(str) - return s - - -def handle_sequence_series(s: pd.Series) -> pd.Series: - types = (list, tuple, set, np.ndarray) - if is_sequence_series(s): - logger.debug(f"series `{s.name}` has sequences; converting to comma-separated string") - s = s.apply(lambda x: ", ".join([str(val) for val in x] if isinstance(x, types) else x)) - return s - - -def is_sequence_series(s: pd.Series) -> bool: - """ - Returns True if the series has any list/tuple/set/array values. - """ - if str(s.dtype) != "object": - return False - - if any(isinstance(v, (list, tuple, set, np.ndarray)) for v in s.dropna().head().values): - return True - return False - - -def handle_unk_type_series(s: pd.Series) -> pd.Series: - if not is_json_serializable(s): - logger.debug(f"series `{s.name}` has non-JSON-serializable types; converting to string") - s = s.astype(str) - return s - - -def is_json_serializable(s: pd.Series) -> bool: - """ - Returns True if the object can be serialized to JSON. - """ - try: - _ = json.dumps(s.dropna().head().values.tolist()) - return True - except (TypeError, OverflowError, UnicodeDecodeError): - # these are the main serialization errors we expect - return False - except ValueError as ve: - # ...but we may get here if we have a series with duplicate index values - # "ValueError: Series index must be unique for orient='index'" - logger.debug(ve) - return False - - -def has_numeric_strings(s: pd.Series) -> bool: - if not str(s.dtype) == "object": - return False - for v in s.dropna().head().values: - if str(v).isnumeric() or str(v).isdigit() or str(v).isdecimal(): - return True - return False - - -def quick_random_dataframe( - num_rows: int = 5, - num_cols: int = 2, - dtype: str = "float", - factor: float = 1.0, -) -> pd.DataFrame: - """ - Convenience function wrapping `pd.DataFrame(np.random.rand( num_rows, num_columns ))` - to create a dataframe of random 0.0-1.0 values. - """ - data = np.random.rand(num_rows, num_cols) * factor - df = pd.DataFrame(data) - return df.astype(dtype, errors="ignore") - - -def random_dataframe(num_rows: int = 5, **kwargs): # noqa: C901 - - kwargs = kwargs or DX_DATATYPES - df = pd.DataFrame(index=list(range(num_rows))) - - if kwargs.get("dtype_column"): - df["dtype_column"] = generate_dtype_series(num_rows) - - # numeric columns - if kwargs.get("integer_column"): - df["integer_column"] = generate_integer_series(num_rows) - - if kwargs.get("float_column"): - df["float_column"] = generate_float_series(num_rows) - - if kwargs.get("complex_number_column"): - df["complex_number_column"] = generate_complex_number_series(num_rows) - - # date/time columns - if kwargs.get("datetime_column"): - df["datetime_column"] = date_time.generate_datetime_series(num_rows) - - if kwargs.get("time_delta_column"): - df["time_delta_column"] = date_time.generate_time_delta_series(num_rows) - - if kwargs.get("time_period_column"): - df["time_period_column"] = date_time.generate_time_period_series(num_rows) - - if kwargs.get("time_interval_column"): - df["time_interval_column"] = date_time.generate_time_interval_series(num_rows) - - # string columns - if kwargs.get("text_column"): - df["text_column"] = generate_text_series(num_rows) - - if kwargs.get("keyword_column"): - df["keyword_column"] = generate_keyword_series(num_rows) - - # container columns - if kwargs.get("dict_column"): - df["dict_column"] = generate_dict_series(num_rows) - - if kwargs.get("list_column"): - df["list_column"] = generate_list_series(num_rows) - - if kwargs.get("nested_tabular_column"): - df["nested_tabular_column"] = generate_nested_tabular_series( - num_rows, - float_column=True, - keyword_column=True, - ) - - # geopandas/shapely columns - if kwargs.get("latlon_point_column"): - df["latlon_point_column"] = geometry.generate_latlon_series(num_rows) - - if kwargs.get("filled_geojson_column"): - df["filled_geojson_column"] = geometry.generate_filled_geojson_series(num_rows) - - if kwargs.get("exterior_geojson_column"): - df["exterior_geojson_column"] = geometry.generate_exterior_bounds_geojson_series(num_rows) - - # extras - if kwargs.get("bytes_column"): - df["bytes_column"] = generate_bytes_series(num_rows) - - if kwargs.get("ipv4_address_column"): - df["ipv4_address_column"] = generate_ipv4_series(num_rows) - - if kwargs.get("ipv6_address_column"): - df["ipv6_address_column"] = generate_ipv6_series(num_rows) - - return df - - -def to_dataframe(obj) -> pd.DataFrame: - """ - Converts an object to a pandas dataframe. - """ - logger.debug(f"converting {type(obj)} to pd.DataFrame") - - # handling for groupby operations returning pd.Series - index_reset_name = None - if is_groupby_series(obj): - orig_index_names = obj.index.names - index_reset_name = groupby_series_index_name(obj.index) - # this will convert a MultiIndex series to a flat DataFrame - obj = obj.reset_index(name=index_reset_name) - # ensure we keep the original index structure - obj.set_index(orig_index_names, inplace=True) - - df = pd.DataFrame(obj) - return df - - -def is_groupby_series(s: pd.Series) -> bool: - """ - Checks if the pd.Series is the result of a groupby operation - by checking if the index is a MultiIndex and its name is - also used as a level in its index. - - Example: - - df = pd.DataFrame({ - 'foo': list('aaabbcddee'), - 'bar': np.random.rand(1, 10)[0], - 'baz': np.random.randint(-10, 10, 10) - }) - - group = df.groupby('foo').bar.value_counts() - print(group) - >>> foo bar - a 0.304653 1 - 0.440604 1 - 0.445702 1 - b 0.164294 1 - 0.296721 1 - c 0.789996 1 - d 0.550120 1 - 0.948220 1 - e 0.223248 1 - 0.664756 1 - Name: bar, dtype: int64 - - print(group.index.names) - >>> ['foo', 'bar'] - - print(group.name) - >>> bar - """ - if not isinstance(s, pd.Series): - return False - if not isinstance(s.index, pd.MultiIndex): - return False - return s.name in s.index.names - - -def groupby_series_index_name(index: pd.MultiIndex) -> str: - """ - Creates a name for groupby operations to provide using a .reset_index() - based on the dataframe's MultiIndex names. - - Example: - - A MultiIndex with level names of ["foo", "bar"] will return "foo.bar.value" - """ - index_trail = ".".join([str(name) for name in index.names]) - return f"{index_trail}.value" diff --git a/src/dx/utils/formatting.py b/src/dx/utils/formatting.py index 29ced5dc..c17a74d3 100644 --- a/src/dx/utils/formatting.py +++ b/src/dx/utils/formatting.py @@ -1,12 +1,86 @@ import pandas as pd import structlog +from dx.datatypes import date_time, geometry, misc, numeric from dx.settings import settings -from dx.utils import datatypes, date_time, geometry logger = structlog.get_logger(__name__) +def to_dataframe(obj) -> pd.DataFrame: + """ + Converts an object to a pandas dataframe. + """ + logger.debug(f"converting {type(obj)} to pd.DataFrame") + + # handling for groupby operations returning pd.Series + index_reset_name = None + if is_groupby_series(obj): + orig_index_names = obj.index.names + index_reset_name = groupby_series_index_name(obj.index) + # this will convert a MultiIndex series to a flat DataFrame + obj = obj.reset_index(name=index_reset_name) + # ensure we keep the original index structure + obj.set_index(orig_index_names, inplace=True) + + df = pd.DataFrame(obj) + return df + + +def is_groupby_series(s: pd.Series) -> bool: + """ + Checks if the pd.Series is the result of a groupby operation + by checking if the index is a MultiIndex and its name is + also used as a level in its index. + + Example: + + df = pd.DataFrame({ + 'foo': list('aaabbcddee'), + 'bar': np.random.rand(1, 10)[0], + 'baz': np.random.randint(-10, 10, 10) + }) + + group = df.groupby('foo').bar.value_counts() + print(group) + >>> foo bar + a 0.304653 1 + 0.440604 1 + 0.445702 1 + b 0.164294 1 + 0.296721 1 + c 0.789996 1 + d 0.550120 1 + 0.948220 1 + e 0.223248 1 + 0.664756 1 + Name: bar, dtype: int64 + + print(group.index.names) + >>> ['foo', 'bar'] + + print(group.name) + >>> bar + """ + if not isinstance(s, pd.Series): + return False + if not isinstance(s.index, pd.MultiIndex): + return False + return s.name in s.index.names + + +def groupby_series_index_name(index: pd.MultiIndex) -> str: + """ + Creates a name for groupby operations to provide using a .reset_index() + based on the dataframe's MultiIndex names. + + Example: + - A MultiIndex with level names of ["foo", "bar"] will return "foo.bar.value" + """ + index_trail = ".".join([str(name) for name in index.names]) + return f"{index_trail}.value" + + def is_default_index(index: pd.Index) -> bool: """ Returns True if the index have no specified name, @@ -117,17 +191,20 @@ def clean_column_values(s: pd.Series) -> pd.Series: """ s = date_time.handle_time_period_series(s) s = date_time.handle_time_delta_series(s) + s = date_time.handle_date_series(s) + + s = numeric.handle_decimal_series(s) + s = numeric.handle_complex_number_series(s) - s = datatypes.handle_dtype_series(s) - s = datatypes.handle_interval_series(s) - s = datatypes.handle_ip_address_series(s) - s = datatypes.handle_complex_number_series(s) + s = misc.handle_dtype_series(s) + s = misc.handle_interval_series(s) + s = misc.handle_ip_address_series(s) s = geometry.handle_geometry_series(s) - s = datatypes.handle_dict_series(s) - s = datatypes.handle_sequence_series(s) - s = datatypes.handle_unk_type_series(s) + s = misc.handle_dict_series(s) + s = misc.handle_sequence_series(s) + s = misc.handle_unk_type_series(s) return s diff --git a/tests/conftest.py b/tests/conftest.py index 205c0678..b4c134b0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,9 +5,9 @@ from IPython.terminal.interactiveshell import TerminalInteractiveShell from IPython.testing import tools +from dx.datatypes.main import random_dataframe from dx.settings import get_settings from dx.types import DEXFilterSettings -from dx.utils.datatypes import random_dataframe from dx.utils.formatting import normalize_index_and_columns from dx.utils.tracking import DXDataFrame diff --git a/tests/test_dataresource.py b/tests/test_dataresource.py index b4ce0ed8..e1bd1249 100644 --- a/tests/test_dataresource.py +++ b/tests/test_dataresource.py @@ -2,10 +2,10 @@ import pytest +from dx.datatypes.main import quick_random_dataframe from dx.formatters.main import format_output, generate_body from dx.formatters.simple import get_dataresource_settings from dx.settings import settings_context -from dx.utils.datatypes import quick_random_dataframe dataresource_settings = get_dataresource_settings() diff --git a/tests/test_datatype_handling.py b/tests/test_datatype_handling.py index adbd29be..5e070ef3 100644 --- a/tests/test_datatype_handling.py +++ b/tests/test_datatype_handling.py @@ -1,6 +1,6 @@ import pytest -from dx.utils.datatypes import SORTED_DX_DATATYPES, random_dataframe +from dx.datatypes.main import SORTED_DX_DATATYPES, random_dataframe from dx.utils.formatting import clean_column_values diff --git a/tests/test_datatypes.py b/tests/test_datatypes.py index e235edb3..cd15cbde 100644 --- a/tests/test_datatypes.py +++ b/tests/test_datatypes.py @@ -4,55 +4,28 @@ - hash the dataframe for tracking - write to the database for tracking/filtering """ +from datetime import datetime import duckdb +import numpy as np import pandas as pd import pytest from pandas.io.json import build_table_schema from pandas.util import hash_pandas_object -from dx.formatters.main import generate_body -from dx.settings import settings_context -from dx.utils.datatypes import ( +from dx.datatypes import date_time, geometry, main, misc, numeric, text +from dx.datatypes.main import ( DX_DATATYPES, SORTED_DX_DATATYPES, - groupby_series_index_name, quick_random_dataframe, random_dataframe, - to_dataframe, ) +from dx.formatters.main import generate_body +from dx.settings import settings_context from dx.utils.formatting import clean_column_values from dx.utils.tracking import generate_df_hash -@pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES) -def test_df_generator(dtype: str, num_rows: int = 5): - params = {dt: False for dt in SORTED_DX_DATATYPES} - params[dtype] = True - df = random_dataframe(num_rows=num_rows, **params) - assert len(df) == num_rows - assert isinstance(df[dtype], pd.Series) - assert df[dtype].notnull().all() - - -def test_random_dataframe_has_default_data(num_rows: int = 5): - df = random_dataframe(num_rows=num_rows) - assert len(df) == num_rows - default_enabled_columns = [column for column, enabled in DX_DATATYPES.items() if enabled] - assert len(df.columns) == len(default_enabled_columns) - for col in default_enabled_columns: - assert col in df.columns - assert df[col].notnull().all() - - -def test_quick_random_dataframe_has_default_data(): - df = quick_random_dataframe() - assert df.shape[0] >= 1 - assert df.shape[1] >= 1 - for col in df.columns: - assert df[col].notnull().all() - - @pytest.mark.xfail(reason="only for dev") @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES) def test_data_types_with_build_table_schema(dtype: str): @@ -69,8 +42,9 @@ def test_data_types_with_build_table_schema(dtype: str): assert isinstance(schema, dict) +@pytest.mark.parametrize("display_mode", ["simple", "enhanced"]) @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES) -def test_generate_simple_body(dtype: str): +def test_generate_body(dtype: str, display_mode: str): """ Test that we've correctly handled data types before building the schema and metadata for the DXDisplayFormatter. @@ -79,24 +53,7 @@ def test_generate_simple_body(dtype: str): params[dtype] = True df = random_dataframe(**params) try: - with settings_context(display_mode="simple"): - payload = generate_body(df) - except Exception as e: - assert False, f"{dtype} failed with {e}" - assert isinstance(payload, dict) - - -@pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES) -def test_generate_enhanced_body(dtype: str): - """ - Test that we've correctly handled data types before building the schema and metadata for - the DXDisplayFormatter. - """ - params = {dt: False for dt in SORTED_DX_DATATYPES} - params[dtype] = True - df = random_dataframe(**params) - try: - with settings_context(display_mode="enhanced"): + with settings_context(display_mode=display_mode): payload = generate_body(df) except Exception as e: assert False, f"{dtype} failed with {e}" @@ -140,8 +97,7 @@ def test_generate_df_hash(dtype: str): @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES) def test_to_sql(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection): """ - DEV: Test which data types pass/fail when passed directly through .to_sql() - with the sqlalchemy engine. + DEV: Test which data types pass/fail when registered directly to duckdb. """ params = {dt: False for dt in SORTED_DX_DATATYPES} params[dtype] = True @@ -160,7 +116,7 @@ def test_to_sql(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection): @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES) def test_store_in_db(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection): """ - Test that we've correctly handled data types before storing in sqlite. + Test that we've correctly handled data types before storing in duckdb. """ params = {dt: False for dt in SORTED_DX_DATATYPES} params[dtype] = True @@ -179,55 +135,224 @@ def test_store_in_db(dtype: str, sample_db_connection: duckdb.DuckDBPyConnection assert num_rows == df.shape[0] -def test_series_is_converted(sample_random_dataframe: pd.Series): - """ - Test that a basic conversion from pd.Series to pd.Dataframe - keeps the original index and uses the Series name as its only column. - """ - s: pd.Series = sample_random_dataframe.keyword_column - df = to_dataframe(s) - assert df.index.equals(s.index) - assert df.columns[0] == s.name - - -def test_multiindex_series_left_alone(sample_multiindex_series: pd.Series): - """ - Test no renaming is done with a MultiIndex pd.Series if their - name doesn't appear in the MultiIndex names. - """ - index = sample_multiindex_series.index - df = to_dataframe(sample_multiindex_series) - assert df.index.names == index.names - assert df.columns[0] == sample_multiindex_series.name - - -def test_groupby_series_resets(sample_groupby_series: pd.Series): - """ - Test we're resetting the index of a pd.Series created from a groupby - operation by using the combination of index names. - """ - index = sample_groupby_series.index - df = to_dataframe(sample_groupby_series) - assert df.index.names == index.names - assert df.columns[0] == groupby_series_index_name(index) - assert df.columns[0] != sample_groupby_series.name - - -def test_dataframe_index_left_alone(sample_random_dataframe: pd.DataFrame): - """ - Ensure we don't alter the structure of a dataframe during - initial dataframe conversion. - """ - df = to_dataframe(sample_random_dataframe) - assert df.index.equals(sample_random_dataframe.index) - assert df.columns.equals(sample_random_dataframe.columns) - - -def test_groupby_dataframe_index_left_alone(sample_groupby_dataframe: pd.DataFrame): - """ - Ensure we don't alter the structure of a dataframe - with MultiIndexes during initial dataframe conversion. - """ - df = to_dataframe(sample_groupby_dataframe) - assert df.index.equals(sample_groupby_dataframe.index) - assert df.columns.equals(sample_groupby_dataframe.columns) +class TestDataFrameGeneration: + """Basic testing to make sure our dataframe generation provides data with default arguments.""" + + @pytest.mark.parametrize("dtype", SORTED_DX_DATATYPES) + def test_df_generator(self, dtype: str, num_rows: int = 5): + params = {dt: False for dt in SORTED_DX_DATATYPES} + params[dtype] = True + df = random_dataframe(num_rows=num_rows, **params) + assert len(df) == num_rows + assert isinstance(df[dtype], pd.Series) + assert df[dtype].notnull().all() + + def test_random_dataframe_has_default_data(self, num_rows: int = 5): + df = random_dataframe(num_rows=num_rows) + assert len(df) == num_rows + default_enabled_columns = [column for column, enabled in DX_DATATYPES.items() if enabled] + assert len(df.columns) == len(default_enabled_columns) + for col in default_enabled_columns: + # if this fails, that means something was added to DX_DATATYPES that doesn't match + # the default arguments of random_dataframe() + assert col in df.columns + assert df[col].notnull().all() + + def test_quick_random_dataframe_has_default_data(self): + df = quick_random_dataframe() + assert df.shape[0] >= 1 + assert df.shape[1] >= 1 + for col in df.columns: + assert df[col].notnull().all() + + +class TestDatatypeHandling: + def test_integer_series_left_alone(self): + series = numeric.generate_integer_series(5) + series = clean_column_values(series) + assert series.dtype == "int64" + assert isinstance( + series.values[0], (int, np.int64) + ), f"cleaned series value is {type(series.values[0])}" + + def test_float_series_left_alone(self): + series = numeric.generate_float_series(5) + series = clean_column_values(series) + assert series.dtype == "float64" + assert isinstance( + series.values[0], (float, np.float64) + ), f"cleaned series value is {type(series.values[0])}" + + def test_boolean_series_left_alone(self): + series = misc.generate_boolean_series(5) + series = clean_column_values(series) + assert series.dtype == "bool" + assert isinstance( + series.values[0], (bool, np.bool_) + ), f"cleaned series value is {type(series.values[0])}" + + def test_dtype_series_converted(self): + series = misc.generate_dtype_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_decimal_series_converted(self): + series = numeric.generate_decimal_series(5) + series = clean_column_values(series) + assert series.dtype == "float64" + assert isinstance( + series.values[0], (float, np.float64) + ), f"cleaned series value is {type(series.values[0])}" + + def test_datetime_series_left_alone(self): + series = date_time.generate_datetime_series(5) + series = clean_column_values(series) + assert series.dtype == "datetime64[ns]" + assert isinstance( + series.values[0], (datetime, np.datetime64) + ), f"cleaned series value is {type(series.values[0])}" + + def test_date_series_converted(self): + # datetime.date values are converted to pd.Timestamp + series = date_time.generate_date_series(5) + series = clean_column_values(series) + assert series.dtype == "datetime64[ns]" + assert isinstance( + series.values[0], (datetime, np.datetime64) + ), f"cleaned series value is {type(series.values[0])}" + + def test_time_series_converted(self): + # datetime.time values are converted to strings + series = date_time.generate_time_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_timedelta_series_converted(self): + # time delta values are converted to floats (total seconds) + series = date_time.generate_time_delta_series(5) + series = clean_column_values(series) + assert series.dtype == "float64" + assert isinstance( + series.values[0], (float, np.float64) + ), f"cleaned series value is {type(series.values[0])}" + + def test_time_period_series_converted(self): + series = date_time.generate_time_period_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_time_interval_series_converted(self): + series = date_time.generate_time_interval_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_text_series_left_alone(self): + series = text.generate_text_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_keyword_series_left_alone(self): + series = text.generate_keyword_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_dict_series_converted(self): + # dictionary values are JSON-stringifed + series = misc.generate_dict_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_list_series_converted(self): + # sequence values are cast as strings + series = misc.generate_list_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_nested_tabular_series_converted(self): + # lists of dictionaries are JSON-stringified + series = main.generate_nested_tabular_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_latlon_point_series_converted(self): + # latlon point values are converted to GeoJSON strings + series = geometry.generate_latlon_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_filled_geojson_series_converted(self): + # shapely.geometry values are converted to GeoJSON strings + # by handle_geometry_series() + series = geometry.generate_filled_geojson_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_exterior_bounds_geojson_series_converted(self): + # shapely.geometry exterior values are converted to GeoJSON strings + # by handle_geometry_series() + series = geometry.generate_exterior_bounds_geojson_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_bytes_series_converted(self): + # bytes values are converted to strings + series = misc.generate_bytes_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_ipv4_address_series_converted(self): + # IPv4Address values are converted to strings + series = misc.generate_ipv4_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" + + def test_ipv6_address_series_converted(self): + # IPv6Address values are converted to strings + series = misc.generate_ipv6_series(5) + series = clean_column_values(series) + assert series.dtype == "object" + assert isinstance( + series.values[0], str + ), f"cleaned series value is {type(series.values[0])}" diff --git a/tests/test_dx.py b/tests/test_dx.py index 59571bed..d45044a5 100644 --- a/tests/test_dx.py +++ b/tests/test_dx.py @@ -2,10 +2,10 @@ import pytest +from dx.datatypes.main import quick_random_dataframe from dx.formatters.enhanced import get_dx_settings from dx.formatters.main import format_output, generate_body from dx.settings import settings_context -from dx.utils.datatypes import quick_random_dataframe dx_settings = get_dx_settings() diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 78ae441f..1979ca31 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -7,7 +7,7 @@ from dx.formatters.main import DXDisplayFormatter, generate_body, handle_format from dx.formatters.simple import get_dataresource_settings from dx.settings import get_settings, settings_context -from dx.utils.formatting import normalize_index_and_columns +from dx.utils.formatting import groupby_series_index_name, normalize_index_and_columns, to_dataframe from dx.utils.tracking import DXDF_CACHE dataresource_settings = get_dataresource_settings() @@ -365,3 +365,54 @@ def test_sample_resampled_multi_groupby_dataframe(self, sample_random_dataframe: assert "keyword_column.value" in clean_df.columns assert "integer_column" not in clean_df.columns assert "integer_column.value" in clean_df.columns + + +class TestDataFrameConversion: + def test_series_is_converted(self, sample_random_dataframe: pd.Series): + """ + Test that a basic conversion from pd.Series to pd.Dataframe + keeps the original index and uses the Series name as its only column. + """ + s: pd.Series = sample_random_dataframe.keyword_column + df = to_dataframe(s) + assert df.index.equals(s.index) + assert df.columns[0] == s.name + + def test_multiindex_series_left_alone(self, sample_multiindex_series: pd.Series): + """ + Test no renaming is done with a MultiIndex pd.Series if their + name doesn't appear in the MultiIndex names. + """ + index = sample_multiindex_series.index + df = to_dataframe(sample_multiindex_series) + assert df.index.names == index.names + assert df.columns[0] == sample_multiindex_series.name + + def test_groupby_series_resets(self, sample_groupby_series: pd.Series): + """ + Test we're resetting the index of a pd.Series created from a groupby + operation by using the combination of index names. + """ + index = sample_groupby_series.index + df = to_dataframe(sample_groupby_series) + assert df.index.names == index.names + assert df.columns[0] == groupby_series_index_name(index) + assert df.columns[0] != sample_groupby_series.name + + def test_dataframe_index_left_alone(self, sample_random_dataframe: pd.DataFrame): + """ + Ensure we don't alter the structure of a dataframe during + initial dataframe conversion. + """ + df = to_dataframe(sample_random_dataframe) + assert df.index.equals(sample_random_dataframe.index) + assert df.columns.equals(sample_random_dataframe.columns) + + def test_groupby_dataframe_index_left_alone(self, sample_groupby_dataframe: pd.DataFrame): + """ + Ensure we don't alter the structure of a dataframe + with MultiIndexes during initial dataframe conversion. + """ + df = to_dataframe(sample_groupby_dataframe) + assert df.index.equals(sample_groupby_dataframe.index) + assert df.columns.equals(sample_groupby_dataframe.columns)