Change default for incomplete chunks to keep (#367)
* Change default for incomplete chunks to `keep`

* Fix failing tests due to 'keep' default for Size based chunker

---------

Co-authored-by: Niels Nuyttens <niels@nannyml.com>
michael-nml and nnansters committed Feb 25, 2024
1 parent 3246e76 commit e6cc9b6
Showing 8 changed files with 56 additions and 57 deletions.
8 changes: 4 additions & 4 deletions nannyml/chunk.py
@@ -331,14 +331,14 @@ class SizeBasedChunker(Chunker):
"""

- def __init__(self, chunk_size: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
+ def __init__(self, chunk_size: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
"""Create a new SizeBasedChunker.
Parameters
----------
chunk_size: int
The preferred size of the resulting Chunks, i.e. the number of observations in each Chunk.
- incomplete: str, default='append'
+ incomplete: str, default='keep'
Choose how to handle any leftover observations that don't make up a full Chunk.
The following options are available:
@@ -429,7 +429,7 @@ class CountBasedChunker(Chunker):
"""

- def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
+ def __init__(self, chunk_number: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
"""Creates a new CountBasedChunker.
It will calculate the amount of observations per chunk based on the given chunk count.
@@ -450,7 +450,7 @@ def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_colu
- ``'append'``: append leftover observations to the last complete Chunk (overfilling it)
- Defaults to ``'append'``.
+ Defaults to ``'keep'``.
Returns
-------
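For a sense of what this default change does in practice, here is a minimal sketch of the two behaviours. The 10,001-row frame is hypothetical, chosen for illustration; `split` is used exactly as in the tests further down:

```python
import pandas as pd

from nannyml.chunk import SizeBasedChunker

df = pd.DataFrame({'feature': range(10_001)})  # hypothetical 10,001-row frame

# New default ('keep'): leftover rows form a final, partial chunk.
sizes_keep = [len(c) for c in SizeBasedChunker(chunk_size=5_000).split(df)]
print(sizes_keep)    # [5000, 5000, 1]

# Previous default ('append'): leftovers were folded into the last full chunk.
sizes_append = [len(c) for c in SizeBasedChunker(chunk_size=5_000, incomplete='append').split(df)]
print(sizes_append)  # [5000, 5001]
```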
6 changes: 0 additions & 6 deletions nannyml/config.py
@@ -40,12 +40,6 @@ class WriterConfig(BaseModel):
write_args: Optional[Dict[str, Any]]


- class ChunkerConfig(BaseModel):
- chunk_size: Optional[int]
- chunk_period: Optional[str]
- chunk_count: Optional[int]


class IntervalSchedulingConfig(BaseModel):
weeks: Optional[int]
days: Optional[int]
14 changes: 7 additions & 7 deletions nannyml/sampling_error/summary_stats.py
@@ -2,15 +2,16 @@
#
# License: Apache Software License 2.0

+ from logging import getLogger
from typing import Tuple

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde, moment
- from logging import getLogger

logger = getLogger(__name__)


def summary_stats_std_sampling_error_components(col: pd.Series) -> Tuple:
"""
Calculate sampling error components for Summary Stats Standard Deviation
@@ -54,12 +55,11 @@ def summary_stats_std_sampling_error(sampling_error_components, col) -> float:
_mu4 = sampling_error_components[1]
_size = col.shape[0]

- err_var_parenthesis_part = (_mu4 - ((_size - 3) * (_std**4) / (_size - 1)))
- if not (
- np.isfinite(err_var_parenthesis_part) and
- err_var_parenthesis_part >= 0
- ):
- logger.debug("Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor.")
+ err_var_parenthesis_part = _mu4 - ((_size - 3) * (_std**4) / (_size - 1))
+ if not (np.isfinite(err_var_parenthesis_part) and err_var_parenthesis_part >= 0):
+ logger.debug(
+ "Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor."
+ )
return np.nan
err_var = np.sqrt((1 / _size) * err_var_parenthesis_part)
return (1 / (2 * _std)) * err_var
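For context on the guarded expression above: with σ the reference standard deviation, μ₄ the fourth central moment, and n the chunk size, the reformatted code computes (this is a direct reading of the code, not a documented formula):

```latex
\mathrm{SE}(s) = \frac{1}{2\sigma}\sqrt{\frac{1}{n}\left(\mu_4 - \frac{(n-3)\,\sigma^4}{n-1}\right)}
```

It returns NaN whenever the parenthesised factor is negative or non-finite, since the square root would otherwise be undefined; that is exactly the condition the debug message reports.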
4 changes: 2 additions & 2 deletions tests/drift/test_drift.py
@@ -453,11 +453,11 @@ def test_statistical_drift_calculator_deals_with_missing_class_labels(sample_dri
[
(
{'chunk_size': 5000},
- [0.004968, 0.004833, 0.01186, 0.242068],
+ [0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
),
(
{'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
- [0.004968, 0.004833, 0.01186, 0.242068],
+ [0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
),
(
{'chunk_number': 5},
4 changes: 2 additions & 2 deletions tests/drift/test_multiv_pca.py
@@ -292,11 +292,11 @@ def test_data_reconstruction_drift_calculator_numeric_results(sample_drift_data)
[
(
{'chunk_size': 5000},
- [0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
+ [0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
),
(
{'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
- [0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
+ [0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
),
(
{'chunk_number': 5},
34 changes: 23 additions & 11 deletions tests/performance_estimation/CBPE/test_cbpe_metrics.py
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

- from nannyml.chunk import DefaultChunker
+ from nannyml.chunk import DefaultChunker, SizeBasedChunker
from nannyml.datasets import (
load_synthetic_binary_classification_dataset,
load_synthetic_multiclass_classification_dataset,
@@ -24,7 +24,7 @@
[
(
{
- 'chunk_size': 20000,
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'business_value_matrix': [[2, -5], [-10, 10]],
'normalize_business_value': None,
@@ -48,7 +48,7 @@
),
(
{
- 'chunk_size': 20000,
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'business_value_matrix': [[2, -5], [-10, 10]],
'normalize_business_value': 'per_prediction',
@@ -71,7 +71,11 @@
),
),
(
- {'chunk_size': 20000, 'normalize_confusion_matrix': 'all', 'business_value_matrix': [[-1, 4], [8, -8]]},
+ {
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+ 'normalize_confusion_matrix': 'all',
+ 'business_value_matrix': [[-1, 4], [8, -8]],
+ },
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
@@ -90,7 +94,11 @@
),
),
(
- {'chunk_size': 20000, 'normalize_confusion_matrix': 'true', 'business_value_matrix': [[-1, 4], [8, -8]]},
+ {
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+ 'normalize_confusion_matrix': 'true',
+ 'business_value_matrix': [[-1, 4], [8, -8]],
+ },
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
@@ -109,7 +117,11 @@
),
),
(
- {'chunk_size': 20000, 'normalize_confusion_matrix': 'pred', 'business_value_matrix': [[-1, 4], [8, -8]]},
+ {
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+ 'normalize_confusion_matrix': 'pred',
+ 'business_value_matrix': [[-1, 4], [8, -8]],
+ },
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
@@ -129,7 +141,7 @@
),
(
{
- 'chunk_size': 20000,
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
@@ -153,7 +165,7 @@
),
(
{
- 'chunk_size': 20000,
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'all',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
@@ -177,7 +189,7 @@
),
(
{
- 'chunk_size': 20000,
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'all',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[2, -5], [-10, 10]],
@@ -202,7 +214,7 @@
),
(
{
- 'chunk_size': 20000,
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'true',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
@@ -226,7 +238,7 @@
),
(
{
- 'chunk_size': 20000,
+ 'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'pred',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
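The parametrizations above pin the previous behaviour by passing an explicit chunker instead of a bare `chunk_size`. A hedged sketch of the same idea in user code; everything apart from the `chunker` argument (column names, metric, problem type) is an illustrative assumption, not taken from this diff:

```python
import nannyml as nml
from nannyml.chunk import SizeBasedChunker

# Keep the pre-change chunking behaviour explicit instead of relying on the new 'keep' default.
estimator = nml.CBPE(
    y_pred='y_pred',                       # assumed column names, for illustration only
    y_pred_proba='y_pred_proba',
    y_true='work_home_actual',
    problem_type='classification_binary',
    metrics=['roc_auc'],
    chunker=SizeBasedChunker(chunk_size=20_000, incomplete='append'),
)
```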
37 changes: 15 additions & 22 deletions tests/stats/test_std.py
@@ -5,14 +5,13 @@

"""Tests for Drift package."""

- import pytest
- import pandas as pd
import numpy as np
+ import pandas as pd
+ import pytest


+ from nannyml.chunk import SizeBasedChunker
from nannyml.datasets import load_synthetic_car_loan_dataset
from nannyml.stats import SummaryStatsStdCalculator
- from nannyml.chunk import SizeBasedChunker

# @pytest.fixture(scope="module")
# def status_sum_result() -> Result:
@@ -43,30 +42,24 @@ def test_stats_std_calculator_with_default_params_chunk_size_one(): # noqa: D10
reference, analysis, _ = load_synthetic_car_loan_dataset()

chunker = SizeBasedChunker(chunk_size=5_000, incomplete='keep')
- calc = SummaryStatsStdCalculator(
- column_names=['car_value'],
- chunker=chunker
- ).fit(reference)
+ calc = SummaryStatsStdCalculator(column_names=['car_value'], chunker=chunker).fit(reference)
result = calc.calculate(data=analysis.head(5_001))
expected = pd.DataFrame(
{
('chunk', 'key'): ['[0:4999]', '[5000:5000]'],
- ('chunk', 'chunk_index'): [0,1],
- ('chunk', 'start_index'): [0,5000],
- ('chunk', 'end_index'): [4999,5000],
- ('chunk', 'start_date'): [None,None],
- ('chunk', 'end_date'): [None,None],
- ('chunk', 'period'): ['analysis','analysis'],
- ('car_value', 'value'): [20614.8926,np.nan],
- ('car_value', 'sampling_error'): [271.9917,np.nan],
- ('car_value', 'upper_confidence_boundary'): [21430.8679,np.nan],
- ('car_value', 'lower_confidence_boundary'): [19798.9174,np.nan],
+ ('chunk', 'chunk_index'): [0, 1],
+ ('chunk', 'start_index'): [0, 5000],
+ ('chunk', 'end_index'): [4999, 5000],
+ ('chunk', 'start_date'): [None, None],
+ ('chunk', 'end_date'): [None, None],
+ ('chunk', 'period'): ['analysis', 'analysis'],
+ ('car_value', 'value'): [20614.8926, np.nan],
+ ('car_value', 'sampling_error'): [271.9917, np.nan],
+ ('car_value', 'upper_confidence_boundary'): [21430.8679, np.nan],
+ ('car_value', 'lower_confidence_boundary'): [19798.9174, np.nan],
('car_value', 'upper_threshold'): [20978.5658, 20978.5658],
('car_value', 'lower_threshold'): [19816.9091, 19816.9091],
('car_value', 'alert'): [False, True],
}
)
- pd.testing.assert_frame_equal(
- expected,
- result.filter(period='analysis').to_df().round(4)
- )
+ pd.testing.assert_frame_equal(expected, result.filter(period='analysis').to_df().round(4))
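One detail worth calling out in the expected frame above: with `incomplete='keep'` the trailing chunk `[5000:5000]` holds a single row, and the sample standard deviation of one observation is undefined, so the value, sampling error and confidence boundaries all come out as NaN while the chunk is still flagged. A one-line illustration:

```python
import pandas as pd

print(pd.Series([20_614.89]).std())  # nan: the sample std (ddof=1) needs at least 2 observations
```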
6 changes: 3 additions & 3 deletions tests/test_chunk.py
@@ -241,12 +241,12 @@ def test_size_based_chunker_returns_chunks_of_required_size(sample_chunk_data):
chunker = SizeBasedChunker(chunk_size=chunk_size)
sut = chunker.split(sample_chunk_data)
assert len(sut[0]) == chunk_size
- assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size) - 1
+ assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size)


def test_size_based_chunker_returns_last_chunk_that_is_partially_filled(sample_chunk_data): # noqa: D103
chunk_size = 3333
- expected_last_chunk_size = chunk_size + sample_chunk_data.shape[0] % chunk_size
+ expected_last_chunk_size = sample_chunk_data.shape[0] % chunk_size
chunker = SizeBasedChunker(chunk_size)
sut = chunker.split(sample_chunk_data)
assert len(sut[-1]) == expected_last_chunk_size
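The updated expectations follow directly from the new 'keep' default. A quick worked example with a hypothetical row count (the fixture's real size is not shown in this diff):

```python
import math

n, chunk_size = 10_000, 3333   # hypothetical n, chunk_size as in the test above
print(math.ceil(n / chunk_size))  # 4 chunks under 'keep'; 'append' gave ceil(...) - 1 = 3
print(n % chunk_size)             # 1: size of the last, partial chunk; 'append' gave 3333 + 1 = 3334
```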
@@ -304,7 +304,7 @@ def test_size_based_chunker_uses_observations_to_set_chunk_date_boundaries(sampl

def test_size_based_chunker_assigns_observation_range_to_chunk_keys(sample_chunk_data): # noqa: D103
chunk_size = 1500
- last_chunk_start = ((sample_chunk_data.shape[0] // chunk_size) - 1) * chunk_size
+ last_chunk_start = (sample_chunk_data.shape[0] // chunk_size) * chunk_size
last_chunk_end = sample_chunk_data.shape[0] - 1

chunker = SizeBasedChunker(chunk_size=chunk_size)
