Merge pull request #390 from NannyML/stats_upd
Refactor Statistics submodule
nnansters committed May 27, 2024
2 parents 544141d + cc869eb commit 022b8fa
Showing 51 changed files with 1,348 additions and 533 deletions.
3 changes: 0 additions & 3 deletions nannyml/base.py
@@ -367,7 +367,6 @@ def calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
"""Performs a calculation on the provided data."""
try:
self._logger.debug(f"calculating {str(self)}")
data = data.copy()
return self._calculate(data, *args, **kwargs)
except NannyMLException:
raise
@@ -496,7 +495,6 @@ def fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> Self:
"""Trains the calculator using reference data."""
try:
self._logger.info(f"fitting {str(self)}")
reference_data = reference_data.copy()
return self._fit(reference_data, *args, **kwargs)
except NannyMLException:
raise
@@ -507,7 +505,6 @@ def estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
"""Performs a calculation on the provided data."""
try:
self._logger.info(f"estimating {str(self)}")
data = data.copy()
return self._estimate(data, *args, **kwargs)
except NannyMLException:
raise
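
Note on the three deletions above: the template methods `calculate`, `fit`, and `estimate` in the abstract base no longer copy the incoming DataFrame before delegating to `_calculate`, `_fit`, and `_estimate`. The defensive copy reappears only in the concrete estimators that actually mutate their input (see cbpe.py and dle.py below), so read-only calculators skip the cost of duplicating the frame. A minimal sketch of the pattern, using illustrative class names that are not part of NannyML:

import pandas as pd


class BaseCalculator:
    """Template method: the base class no longer copies the input."""

    def calculate(self, data: pd.DataFrame) -> pd.DataFrame:
        # Each subclass now decides for itself whether a defensive copy is needed.
        return self._calculate(data)

    def _calculate(self, data: pd.DataFrame) -> pd.DataFrame:
        raise NotImplementedError


class MutatingCalculator(BaseCalculator):
    """A subclass that mutates its input, so it copies defensively."""

    def _calculate(self, data: pd.DataFrame) -> pd.DataFrame:
        data = data.copy(deep=True)  # protect the caller's frame
        data['row_mean'] = data.select_dtypes('number').mean(axis=1)
        return data
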
2 changes: 1 addition & 1 deletion nannyml/calibration.py
@@ -154,7 +154,7 @@ class NoopCalibrator(Calibrator):

def fit(self, y_pred_proba: np.ndarray, y_true: np.ndarray, *args, **kwargs):
"""Fit nothing and just return the calibrator."""
return self
pass

def calibrate(self, y_pred_proba: np.ndarray, *args, **kwargs):
"""Calibrate nothing and just return the original ``y_pred_proba`` inputs."""
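
A side effect worth noting for the `fit` change above: a body that ends in `pass` returns `None`, while `return self` allows chaining `fit` and `calibrate` in a single expression. A small illustration with a stand-in class (not NannyML's `NoopCalibrator`):

import numpy as np


class NoopLikeCalibrator:
    """Stand-in no-op calibrator, for illustration only."""

    def fit(self, y_pred_proba: np.ndarray, y_true: np.ndarray):
        return self  # enables NoopLikeCalibrator().fit(p, y).calibrate(p)

    def calibrate(self, y_pred_proba: np.ndarray) -> np.ndarray:
        return np.asarray(y_pred_proba)  # pass scores through unchanged


p = np.array([0.2, 0.8])
out = NoopLikeCalibrator().fit(p, np.array([0, 1])).calibrate(p)
assert (out == p).all()
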
9 changes: 2 additions & 7 deletions nannyml/chunk.py
@@ -376,7 +376,7 @@ def __init__(self, chunk_size: int, incomplete: str = 'keep', timestamp_column_n

def _split(self, data: pd.DataFrame) -> List[Chunk]:
def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> Chunk:
chunk_data = data.loc[index : index + chunk_size - 1, :]
chunk_data = data.iloc[index : index + chunk_size]
chunk = Chunk(
key=f'[{index}:{index + chunk_size - 1}]',
data=chunk_data,
@@ -388,10 +388,9 @@ def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> Chunk:
chunk.end_datetime = pd.to_datetime(chunk.data[self.timestamp_column_name].max())
return chunk

data = data.copy().reset_index(drop=True)
chunks = [
_create_chunk(index=i, data=data, chunk_size=self.chunk_size)
for i in range(0, len(data), self.chunk_size)
for i in range(0, data.shape[0], self.chunk_size)
if i + self.chunk_size - 1 < len(data)
]

@@ -485,8 +484,6 @@ def _split(self, data: pd.DataFrame) -> List[Chunk]:
if data.shape[0] == 0:
return []

data = data.copy().reset_index()

chunk_size = data.shape[0] // self.chunk_number
chunks = SizeBasedChunker(
chunk_size=chunk_size, incomplete=self.incomplete, timestamp_column_name=self.timestamp_column_name
@@ -516,8 +513,6 @@ def _split(self, data: pd.DataFrame) -> List[Chunk]:
if data.shape[0] == 0:
return []

data = data.copy().reset_index(drop=True)

chunk_size = data.shape[0] // self.DEFAULT_CHUNK_COUNT
chunks = SizeBasedChunker(chunk_size=chunk_size, timestamp_column_name=self.timestamp_column_name).split(
data=data
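
The switch from `.loc` to `.iloc` in `_create_chunk` is what lets the `reset_index` calls in the three `_split` implementations above be dropped: `.loc[i : i + chunk_size - 1]` slices by index label (inclusive) and silently assumes a default RangeIndex, while `.iloc[i : i + chunk_size]` slices by position (end-exclusive) and works for any index. A quick demonstration:

import pandas as pd

df = pd.DataFrame({'x': [10, 20, 30, 40]}, index=[7, 3, 9, 1])

# Position-based slicing ignores the index labels entirely.
print(df.iloc[0:2]['x'].tolist())  # [10, 20]

# Label-based slicing fails here because 0 is not a label in this
# non-monotonic index; this is why the old code needed reset_index(drop=True).
try:
    df.loc[0:1]
except KeyError:
    print('label slice failed on a non-default index')
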
4 changes: 4 additions & 0 deletions nannyml/performance_estimation/confidence_based/cbpe.py
@@ -325,6 +325,8 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> CBPE:
estimator: PerformanceEstimator
The fitted estimator.
"""
reference_data = reference_data.copy(deep=True)

if self.problem_type == ProblemType.CLASSIFICATION_BINARY:
return self._fit_binary(reference_data)
elif self.problem_type == ProblemType.CLASSIFICATION_MULTICLASS:
@@ -352,6 +354,8 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
if data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

data = data.copy(deep=True)

if self.problem_type == ProblemType.CLASSIFICATION_BINARY:
required_cols = [self.y_pred_proba]
if self.y_pred is not None:
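
These two added `copy(deep=True)` calls are the counterpart of the `.copy()` removals in base.py above: the defensive copy now happens inside the estimator that mutates its input, rather than unconditionally in the shared base class.
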
4 changes: 4 additions & 0 deletions nannyml/performance_estimation/direct_loss_estimation/dle.py
@@ -285,6 +285,8 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> Self:
if reference_data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

reference_data = reference_data.copy(deep=True)

_list_missing([self.y_true, self.y_pred], list(reference_data.columns))

_, categorical_feature_columns = _split_features_by_type(reference_data, self.feature_column_names)
@@ -318,6 +320,8 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
if data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

data = data.copy(deep=True)

_list_missing([self.y_pred], list(data.columns))

_, categorical_feature_columns = _split_features_by_type(data, self.feature_column_names)
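
DLE takes the same defensive copies as CBPE. As a refresher on the semantics, `copy(deep=True)` duplicates the underlying data, so assignments to the working frame cannot write through to the caller's DataFrame; a minimal demonstration (not NannyML code):

import pandas as pd

original = pd.DataFrame({'y_pred': [1.0, 2.0]})

work = original.copy(deep=True)
work['y_pred'] = work['y_pred'] * 10  # mutate only the working copy

assert original['y_pred'].tolist() == [1.0, 2.0]  # caller's frame is untouched
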
56 changes: 30 additions & 26 deletions nannyml/stats/avg/calculator.py
@@ -2,7 +2,7 @@
#
# License: Apache Software License 2.0

"""Simple Statistics Average Calculator"""
"""Simple Statistics Average Calculator."""

from typing import Any, Dict, List, Optional, Union

@@ -15,13 +15,12 @@
from nannyml.exceptions import InvalidArgumentsException
from nannyml.sampling_error import SAMPLING_ERROR_RANGE
from nannyml.stats.avg.result import Result
from nannyml.stats.base import _add_alert_flag
from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values
from nannyml.usage_logging import UsageEvent, log_usage


class SummaryStatsAvgCalculator(AbstractCalculator):
"""SummaryStatsAvgCalculator implementation"""
"""SummaryStatsAvgCalculator implementation."""

def __init__(
self,
@@ -118,20 +117,6 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
for col in self.column_names:
self._sampling_error_components[col] = reference_data[col].std()

for column in self.column_names:
reference_chunk_results = np.asarray(
[_calculate_avg_value_stats(chunk.data[column]) for chunk in self.chunker.split(reference_data)]
)
self._lower_alert_thresholds[column], self._upper_alert_thresholds[column] = calculate_threshold_values(
threshold=self.threshold,
data=reference_chunk_results,
lower_threshold_value_limit=self.lower_threshold_value_limit,
upper_threshold_value_limit=self.upper_threshold_value_limit,
logger=self._logger,
metric_name=self.simple_stats_metric,
override_using_none=True,
)

self.result = self._calculate(data=reference_data)
self.result.data[('chunk', 'period')] = 'reference'

@@ -173,6 +158,8 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
res = res.reset_index(drop=True)

if self.result is None:
self._set_thresholds(results=res)
res = self._populate_thresholds(results=res)
self.result = Result(
results_data=res,
column_names=self.column_names,
@@ -186,6 +173,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
# but this causes us to lose the "common behavior" in the top level 'filter' method when overriding.
# Applicable here but to many of the base classes as well (e.g. fitting and calculating)
self.result = self.result.filter(period='reference')
res = self._populate_thresholds(results=res)
self.result.data = pd.concat([self.result.data, res]).reset_index(drop=True)

return self.result
@@ -198,9 +186,6 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
result['sampling_error'] = self._sampling_error_components[column_name] / np.sqrt(data.shape[0])
result['upper_confidence_boundary'] = result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error']
result['lower_confidence_boundary'] = result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error']
result['upper_threshold'] = self._upper_alert_thresholds[column_name]
result['lower_threshold'] = self._lower_alert_thresholds[column_name]
result['alert'] = _add_alert_flag(result)
except Exception as exc:
if self._logger:
self._logger.error(
@@ -210,12 +195,34 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
result['sampling_error'] = np.NaN
result['upper_confidence_boundary'] = np.NaN
result['lower_confidence_boundary'] = np.NaN
result['upper_threshold'] = self._upper_alert_thresholds[column_name]
result['lower_threshold'] = self._lower_alert_thresholds[column_name]
result['alert'] = np.NaN
finally:
return result

def _set_thresholds(self, results: pd.DataFrame):
for column in self.column_names:
self._lower_alert_thresholds[column], self._upper_alert_thresholds[column] = calculate_threshold_values(
threshold=self.threshold,
data=results[(column, 'value')].to_numpy(),
lower_threshold_value_limit=self.lower_threshold_value_limit,
upper_threshold_value_limit=self.upper_threshold_value_limit,
override_using_none=True,
logger=self._logger,
metric_name=column,
)

def _populate_thresholds(self, results: pd.DataFrame):
for column in self.column_names:
results[(column, 'upper_threshold')] = self._upper_alert_thresholds[column]
results[(column, 'lower_threshold')] = self._lower_alert_thresholds[column]

lower_threshold = float('-inf') if self._lower_alert_thresholds[column] is None else self._lower_alert_thresholds[column] # noqa: E501
upper_threshold = float('inf') if self._upper_alert_thresholds[column] is None else self._upper_alert_thresholds[column] # noqa: E501
results[(column, 'alert')] = results.apply(
lambda row: not (lower_threshold < row[(column, 'value')] < upper_threshold),
axis=1,
)
return results


def _create_multilevel_index(
column_names,
@@ -230,9 +237,6 @@ def _create_multilevel_index(
'sampling_error',
'upper_confidence_boundary',
'lower_confidence_boundary',
'upper_threshold',
'lower_threshold',
'alert',
]
]
tuples = chunk_tuples + column_tuples
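
The net effect of this refactor: thresholds are no longer computed during `_fit` from freshly chunked reference data, and the threshold/alert columns are no longer part of the multilevel index; instead `_set_thresholds` derives per-column bounds from the already-assembled reference results and `_populate_thresholds` stamps thresholds and alert flags onto each result set. A condensed sketch of the alert rule, mirroring the lambda above (the threshold values are hard-coded here for illustration; in the diff they come from `calculate_threshold_values`):

import numpy as np

# Per-chunk averages for one column; the last chunk has drifted.
values = [5.0, 5.2, 4.9, 9.7]

# None means "no bound on that side" and is replaced by +/- infinity.
lower_threshold, upper_threshold = 4.0, 6.5
lower = -np.inf if lower_threshold is None else lower_threshold
upper = np.inf if upper_threshold is None else upper_threshold

# Alert whenever a value does not fall strictly between the bounds.
alerts = [not (lower < v < upper) for v in values]
print(alerts)  # [False, False, False, True]
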
9 changes: 3 additions & 6 deletions nannyml/stats/avg/result.py
@@ -19,7 +19,6 @@
from nannyml.base import PerColumnResult
from nannyml.chunk import Chunker

# from nannyml.exceptions import InvalidArgumentsException
from nannyml.plots.blueprints.comparisons import ResultCompareMixin
from nannyml.plots.blueprints.metrics import plot_metrics
from nannyml.usage_logging import UsageEvent, log_usage
@@ -36,13 +35,15 @@ def __init__(
timestamp_column_name: Optional[str],
chunker: Chunker,
):
"""Initalize results class."""
super().__init__(results_data, column_names)

self.timestamp_column_name = timestamp_column_name
self.simple_stats_metric = simple_stats_metric
self.chunker = chunker

def keys(self) -> List[Key]:
"""Get Keys."""
return [
Key(
properties=(column_name,),
@@ -57,10 +58,7 @@ def plot(
*args,
**kwargs,
) -> go.Figure:
"""
Parameters
----------
"""Plot results.
Returns
-------
@@ -84,7 +82,6 @@ def plot(
... res = res.filter(period='analysis', column_name=column_name).plot().show()
"""

return plot_metrics(
self,
title='Averaged Values ',
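
For completeness, a typical end-to-end use of the refactored calculator, expanded from the docstring example above; the synthetic dataset loader and column names are assumptions based on NannyML's documented examples:

import nannyml as nml

reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

calc = nml.SummaryStatsAvgCalculator(
    column_names=['car_value', 'debt_to_income_ratio'],
    chunk_size=5000,
)
calc.fit(reference_df)
results = calc.calculate(analysis_df)

for column_name in calc.column_names:
    results.filter(period='analysis', column_name=column_name).plot().show()
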
21 changes: 0 additions & 21 deletions nannyml/stats/base.py

This file was deleted.

(Diffs for the remaining changed files are not rendered here.)
