Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add check for complete categories when constructing contingency tables #184

Merged
merged 14 commits into from
Mar 17, 2022
65 changes: 65 additions & 0 deletions python/metrics/src/hydrotools/metrics/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
---------
- raise_for_non_vector
- raise_for_inconsistent_shapes
- validate_boolean_categorical_series

Classes
-------
Expand All @@ -19,6 +20,8 @@
import numpy as np
import numpy.typing as npt
from typing import List, Tuple
import pandas as pd
import warnings

class InconsistentShapesError(Exception):
def __init__(self,
Expand Down Expand Up @@ -100,3 +103,65 @@ def raise_for_inconsistent_shapes(
array_shape_1=x.shape,
array_shape_2=y.shape
)

def validate_boolean_categorical_series(
    series: pd.Series
    ) -> pd.Series:
    """
    Validate that series is a boolean categorical variable.

    A series that is not categorical is converted to the "category" dtype.
    A missing True or False category is appended (True is checked first,
    then False). Each of these repairs emits a UserWarning. The input
    object is not modified; a repaired copy is returned when changes were
    needed.

    Parameters
    ----------
    series: pandas.Series, required
        Series to validate.

    Warnings
    --------
    UserWarning:
        Warns if series is not categorical and attempts to convert.
        Warns if the True or the False category is missing and attempts
        to add it.

    Raises
    ------
    pandas.errors.UnsupportedFunctionCall:
        Raises if series is not a pandas.Series or if, after any repairs,
        series does not have exactly two categories: True and False

    Returns
    -------
    series: pandas.Series
        Validated boolean categorical series.
    """
    # Check for Series
    if not isinstance(series, pd.Series):
        raise pd.errors.UnsupportedFunctionCall(f"{series} is not a pandas.Series")

    # Check for categorical
    if not isinstance(series.dtype, pd.CategoricalDtype):
        message = f"{series} is not a categorical pandas.Series, attempting to convert"
        warnings.warn(message=message, category=UserWarning)
        series = series.astype("category")

    # Check for the True category, then the False category.
    # Membership tests (`True in categories`) are avoided because 1 == True
    # and 0 == False; identity tests (`c is True`) are avoided because they
    # miss NumPy booleans. Only genuine boolean scalars count.
    for required in (True, False):
        has_category = any(
            pd.api.types.is_bool(c) and bool(c) is required
            for c in series.cat.categories
        )
        if not has_category:
            message = f"{series} has no {required} category, attempting to add"
            warnings.warn(message=message, category=UserWarning)
            series = series.cat.add_categories([required])

    # Check number of categories: both booleans are now present, so any
    # count other than two means an extra, non-boolean category exists
    if len(series.cat.categories) != 2:
        raise pd.errors.UnsupportedFunctionCall(f"{series} does not have exactly two categories: True and False")

    # Return validated series
    return series
2 changes: 1 addition & 1 deletion python/metrics/src/hydrotools/metrics/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.2.0"
__version__ = "1.2.1"
10 changes: 10 additions & 0 deletions python/metrics/src/hydrotools/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,16 @@ def compute_contingency_table(
pandas.Series of integer values keyed to pandas.Index([true_positive_key, false_positive_key, false_negative_key, true_negative_key])

"""
# Raise if not 1-D arrays
validate.raise_for_non_vector(observed, simulated)

# Raise if not same shape
validate.raise_for_inconsistent_shapes(observed, simulated)

# Validate boolean categorical
observed = validate.validate_boolean_categorical_series(observed)
simulated = validate.validate_boolean_categorical_series(simulated)

# Cross tabulate
ctab = pd.crosstab(observed, simulated, dropna=False)

Expand Down
41 changes: 41 additions & 0 deletions python/metrics/tests/test_contingency_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pytest
from hydrotools.metrics import metrics

import pandas as pd

# Degenerate fixtures: every frame holds three identical (obs, sim) pairs,
# so each categorical column carries exactly one category.
all_tp_case = pd.DataFrame({
    "obs": pd.Categorical([True] * 3),
    "sim": pd.Categorical([True] * 3),
})
all_fp_case = pd.DataFrame({
    "obs": pd.Categorical([False] * 3),
    "sim": pd.Categorical([True] * 3),
})
all_fn_case = pd.DataFrame({
    "obs": pd.Categorical([True] * 3),
    "sim": pd.Categorical([False] * 3),
})
all_tn_case = pd.DataFrame({
    "obs": pd.Categorical([False] * 3),
    "sim": pd.Categorical([False] * 3),
})

# (fixture, contingency-table component expected to be non-zero, its count)
scenarios = list(zip(
    (all_tp_case, all_fp_case, all_fn_case, all_tn_case),
    ("true_positive", "false_positive", "false_negative", "true_negative"),
    (3, 3, 3, 3),
))

@pytest.mark.parametrize("data,check,value", scenarios)
def test_compute_contingency_table_scenarios(data, check, value):
    """
    Check contingency tables built from single-outcome fixtures.

    Parameters
    ----------
    data: pandas.DataFrame
        Fixture with categorical "obs" and "sim" columns.
    check: str
        Name of the only table component expected to be non-zero.
    value: int
        Count expected for the ``check`` component.
    """
    # Building the table is expected to emit a UserWarning
    with pytest.warns(UserWarning):
        table = metrics.compute_contingency_table(data["obs"], data["sim"])

    # Only the component under test carries a count; the rest must be zero
    for name, count in table.items():
        expected = value if name == check else 0
        assert count == expected
8 changes: 4 additions & 4 deletions python/metrics/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@
n_pred = [np.nan, np.nan, np.nan, np.nan]

def test_compute_contingency_table():
obs = pd.Categorical([True, False, False, True, True, True,
False, False, False, False])
sim = pd.Categorical([True, True, True, False, False, False,
False, False, False, False])
obs = pd.Series([True, False, False, True, True, True,
False, False, False, False], dtype="category")
sim = pd.Series([True, True, True, False, False, False,
False, False, False, False], dtype="category")

table = metrics.compute_contingency_table(obs, sim)

Expand Down
29 changes: 29 additions & 0 deletions python/metrics/tests/test_validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
import numpy as np
from hydrotools.metrics import _validation
import pandas as pd

def test_raise_for_non_vector():
x = np.array([[1, 2, 3, 4], [1, 1, 1, 4]])
Expand All @@ -20,3 +21,31 @@ def test_raise_for_inconsistent_shapes():

with pytest.raises(_validation.InconsistentShapesError):
_validation.raise_for_inconsistent_shapes(x, y)

def test_validate_boolean_categorical_series():
    """
    Exercise each branch of validate_boolean_categorical_series:
    rejection of non-Series input, conversion of non-categorical input,
    addition of a missing True category, addition of a missing False
    category, and rejection of series with more than two categories.
    """
    # Check for non-Series
    with pytest.raises(pd.errors.UnsupportedFunctionCall):
        _validation.validate_boolean_categorical_series([1, 2, 3])

    # Check for non-categorical
    s = pd.Series([True, True, False])
    with pytest.warns(UserWarning):
        s = _validation.validate_boolean_categorical_series(s)
    assert hasattr(s, "cat")

    # Check for True: an all-False series lacks the True category
    s = pd.Series([False, False, False], dtype="category")
    assert True not in s.cat.categories
    with pytest.warns(UserWarning):
        s = _validation.validate_boolean_categorical_series(s)
    assert True in s.cat.categories

    # Check for False: an all-True series lacks the False category, so
    # this exercises the False branch rather than repeating the True one
    s = pd.Series([True, True, True], dtype="category")
    assert False not in s.cat.categories
    with pytest.warns(UserWarning):
        s = _validation.validate_boolean_categorical_series(s)
    assert False in s.cat.categories

    # Check for two categories
    s = pd.Series([True, False, "5"], dtype="category")
    with pytest.raises(pd.errors.UnsupportedFunctionCall):
        _validation.validate_boolean_categorical_series(s)