Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add check for complete categories when constructing contingency tables #184

Merged
merged 14 commits into from
Mar 17, 2022
4 changes: 0 additions & 4 deletions python/metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ flood_criteria = 19200.0
forecasts['simulated_flood'] = (forecasts['sim'] >= flood_criteria)
forecasts['observed_flood'] = (forecasts['obs'] >= flood_criteria)

# Convert boolean columns to Categoricals
forecasts['simulated_flood'] = forecasts['simulated_flood'].astype('category')
forecasts['observed_flood'] = forecasts['observed_flood'].astype('category')

# Compute contingency table
contingency_table = metrics.compute_contingency_table(
forecasts['observed_flood'],
Expand Down
65 changes: 19 additions & 46 deletions python/metrics/src/hydrotools/metrics/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from typing import List, Tuple
import pandas as pd
import warnings
from pandas.api.types import CategoricalDtype

class InconsistentShapesError(Exception):
def __init__(self,
Expand Down Expand Up @@ -104,64 +105,36 @@ def raise_for_inconsistent_shapes(
array_shape_2=y.shape
)

def validate_boolean_categorical_series(
series: pd.Series
) -> None:
def convert_to_boolean_categorical_series(
data: npt.ArrayLike
) -> pd.Series:
"""
Validate that series is a boolean categorical variable.
Transform data into a boolean categorical pandas.Series.

Parameters
----------
series: pandas.Series, required
Series to validate.
data: array-like, required
Data to convert. Should only contain True or False values.

Warnings
--------
UserWarning:
Warns if series is not a boolean Categorical and attempts to convert.

Raises
------
pandas.errors.UnsupportedFunctionCall:
Raises if series is not a pandas.Series or if series does not contain two categories: True and False
Warns if any values in data are not True or False. These values will become NaN.

Returns
-------
Validated boolean categorical series.
"""
# Check for Series
if not isinstance(series, pd.Series):
raise pd.errors.UnsupportedFunctionCall(f"{series} is not a pandas.Series")

# Check for categorical
if not hasattr(series, "cat"):
message = f"{series} is not a categorical pandas.Series, attempting to convert"
warnings.warn(message=message, category=UserWarning)
series = series.astype("category")

# Check for True category
no_true_category = True
for c in series.cat.categories:
if c is True:
no_true_category = False
if no_true_category:
message = f"{series} has no True category, attempting to add"
# Create boolean categorical series
s = pd.Series(
data=data,
dtype=CategoricalDtype([True, False])
)

# Check for NaN
if s.isnull().any():
message = f"{data} contains values that could not be converted to True or False."
warnings.warn(message=message, category=UserWarning)
series = series.cat.add_categories([True])

# Check for False category
no_false_category = True
for c in series.cat.categories:
if c is False:
no_false_category = False
if no_false_category:
message = f"{series} has no False category, attempting to add"
warnings.warn(message=message, category=UserWarning)
series = series.cat.add_categories([False])

# Check number of categories
if len(series.cat.categories) != 2:
raise pd.errors.UnsupportedFunctionCall(f"{series} does not have exactly two categories: True and False")

# Return validated series
return series
return s

12 changes: 6 additions & 6 deletions python/metrics/src/hydrotools/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@ def kling_gupta_efficiency(
return 1.0 - EDs

def compute_contingency_table(
observed: pd.Series,
simulated: pd.Series,
observed: npt.ArrayLike,
simulated: npt.ArrayLike,
true_positive_key: str = 'true_positive',
false_positive_key: str = 'false_positive',
false_negative_key: str = 'false_negative',
Expand All @@ -195,9 +195,9 @@ def compute_contingency_table(

Parameters
----------
observed: pandas.Series, required
observed: array-like, required
jarq6c marked this conversation as resolved.
Show resolved Hide resolved
Array-like of boolean values (True or False) indicating observed occurrences
simulated: pandas.Series, required
simulated: array-like, required
Array-like of boolean values (True or False) indicating simulated occurrences
true_positive_key: str, optional, default 'true_positive'
Label to use for true positives.
Expand All @@ -221,8 +221,8 @@ def compute_contingency_table(
validate.raise_for_inconsistent_shapes(observed, simulated)

# Validate boolean categorical
observed = validate.validate_boolean_categorical_series(observed)
simulated = validate.validate_boolean_categorical_series(simulated)
observed = validate.convert_to_boolean_categorical_series(observed)
simulated = validate.convert_to_boolean_categorical_series(simulated)

# Cross tabulate
ctab = pd.crosstab(observed, simulated, dropna=False)
Expand Down
19 changes: 17 additions & 2 deletions python/metrics/tests/test_contingency_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,27 @@
@pytest.mark.parametrize("data,check,value", scenarios)
def test_compute_contingency_table_scenarios(data, check, value):
# Construct contingency table
with pytest.warns(UserWarning):
table = metrics.compute_contingency_table(data["obs"], data["sim"])
table = metrics.compute_contingency_table(data["obs"], data["sim"])

# Validate correct values
for component, val in table.items():
if component == check:
assert val == value
else:
assert val == 0

def test_non_series():
    """Contingency tables can be computed from plain lists, not only pandas.Series."""
    # Plain Python lists: every simulated value is True, half the observed are True
    observed = [True, False, True, False]
    simulated = [True, True, True, True]

    contingency = metrics.compute_contingency_table(observed, simulated)

    # Expected count for each default contingency table component
    expected_counts = {
        "true_positive": 2,
        "false_positive": 2,
        "false_negative": 0,
        "true_negative": 0,
    }
    for component, count in expected_counts.items():
        assert contingency[component] == count

    # Derived scores computed from the same table
    detection = metrics.probability_of_detection(contingency)
    assert detection == 1.0

    false_detection = metrics.probability_of_false_detection(contingency)
    assert false_detection == 1.0
34 changes: 17 additions & 17 deletions python/metrics/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,30 @@ def test_raise_for_inconsistent_shapes():
with pytest.raises(_validation.InconsistentShapesError):
_validation.raise_for_inconsistent_shapes(x, y)

def test_validate_boolean_categorical_series():
def test_convert_to_boolean_categorical_series():
# Check for non-Series
with pytest.raises(pd.errors.UnsupportedFunctionCall):
x = _validation.validate_boolean_categorical_series([1, 2, 3])
with pytest.warns(UserWarning):
x = _validation.convert_to_boolean_categorical_series([1, 2, 3])

# Check for non-categorical
with pytest.warns(UserWarning):
s = pd.Series([True, True, False])
s = _validation.validate_boolean_categorical_series(s)
assert hasattr(s, "cat")
s = pd.Series([True, True, False])
s = _validation.convert_to_boolean_categorical_series(s)
assert hasattr(s, "cat")

# Check for True
with pytest.warns(UserWarning):
s = pd.Series([False, False, False], dtype="category")
s = _validation.validate_boolean_categorical_series(s)
assert True in s.cat.categories
s = pd.Series([False, False, False], dtype="category")
s = _validation.convert_to_boolean_categorical_series(s)
assert True in s.cat.categories

# Check for False
with pytest.warns(UserWarning):
s = pd.Series([False, False, False], dtype="category")
s = _validation.validate_boolean_categorical_series(s)
assert False in s.cat.categories
s = pd.Series([False, False, False], dtype="category")
s = _validation.convert_to_boolean_categorical_series(s)
assert False in s.cat.categories

# Check for two categories
with pytest.raises(pd.errors.UnsupportedFunctionCall):
with pytest.warns(UserWarning):
s = pd.Series([True, False, "5"], dtype="category")
s = _validation.validate_boolean_categorical_series(s)
s = _validation.convert_to_boolean_categorical_series(s)
assert len(s.cat.categories) == 2
assert True in s.cat.categories
assert False in s.cat.categories