Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add check for complete categories when constructing contingency tables #184

Merged
merged 14 commits into from
Mar 17, 2022
4 changes: 0 additions & 4 deletions python/metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ flood_criteria = 19200.0
forecasts['simulated_flood'] = (forecasts['sim'] >= flood_criteria)
forecasts['observed_flood'] = (forecasts['obs'] >= flood_criteria)

# Convert boolean columns to Categoricals
forecasts['simulated_flood'] = forecasts['simulated_flood'].astype('category')
forecasts['observed_flood'] = forecasts['observed_flood'].astype('category')

# Compute contingency table
contingency_table = metrics.compute_contingency_table(
forecasts['observed_flood'],
Expand Down
38 changes: 38 additions & 0 deletions python/metrics/src/hydrotools/metrics/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
---------
- raise_for_non_vector
- raise_for_inconsistent_shapes
- convert_to_boolean_categorical_series

Classes
-------
Expand All @@ -19,6 +20,9 @@
import numpy as np
import numpy.typing as npt
from typing import List, Tuple
import pandas as pd
import warnings
from pandas.api.types import CategoricalDtype

class InconsistentShapesError(Exception):
def __init__(self,
Expand Down Expand Up @@ -100,3 +104,37 @@ def raise_for_inconsistent_shapes(
array_shape_1=x.shape,
array_shape_2=y.shape
)

def convert_to_boolean_categorical_series(
    data: npt.ArrayLike
    ) -> pd.Series:
    """
    Transform data into a boolean categorical pandas.Series.

    Parameters
    ----------
    data: array-like, required
        Data to convert. Should only contain True or False values.

    Warnings
    --------
    UserWarning:
        Warns if the converted series contains any missing values. Values in
        data that are not True or False become NaN.

    Returns
    -------
    s: pandas.Series
        Series with a categorical dtype whose categories are [True, False].
    """
    # Declaring both categories up front keeps True and False registered as
    # categories even when data only contains one of them.
    s = pd.Series(
        data=data,
        dtype=CategoricalDtype([True, False])
    )

    # Anything that did not match a category is now NaN
    if s.isnull().any():
        message = f"{data} contains values that could not be converted to True or False."
        # stacklevel=2 attributes the warning to the calling code rather than
        # to this helper, so the reported location is actionable.
        warnings.warn(message=message, category=UserWarning, stacklevel=2)

    return s

2 changes: 1 addition & 1 deletion python/metrics/src/hydrotools/metrics/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.2.0"
__version__ = "1.2.1"
18 changes: 14 additions & 4 deletions python/metrics/src/hydrotools/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@ def kling_gupta_efficiency(
return 1.0 - EDs

def compute_contingency_table(
observed: pd.Series,
simulated: pd.Series,
observed: npt.ArrayLike,
simulated: npt.ArrayLike,
true_positive_key: str = 'true_positive',
false_positive_key: str = 'false_positive',
false_negative_key: str = 'false_negative',
Expand All @@ -195,9 +195,9 @@ def compute_contingency_table(

Parameters
----------
observed: pandas.Series, required
observed: array-like, required
jarq6c marked this conversation as resolved.
Show resolved Hide resolved
Array-like of boolean values indicating observed occurrences. Converted internally to a boolean categorical pandas.Series.
simulated: pandas.Series, required
simulated: array-like, required
Array-like of boolean values indicating simulated occurrences. Converted internally to a boolean categorical pandas.Series.
true_positive_key: str, optional, default 'true_positive'
Label to use for true positives.
Expand All @@ -214,6 +214,16 @@ def compute_contingency_table(
pandas.Series of integer values keyed to pandas.Index([true_positive_key, false_positive_key, false_negative_key, true_negative_key])

"""
# Raise if not 1-D arrays
validate.raise_for_non_vector(observed, simulated)

# Raise if not same shape
validate.raise_for_inconsistent_shapes(observed, simulated)

# Validate boolean categorical
observed = validate.convert_to_boolean_categorical_series(observed)
simulated = validate.convert_to_boolean_categorical_series(simulated)

# Cross tabulate
ctab = pd.crosstab(observed, simulated, dropna=False)

Expand Down
56 changes: 56 additions & 0 deletions python/metrics/tests/test_contingency_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pytest
from hydrotools.metrics import metrics

import pandas as pd

# Fixtures: three identical (obs, sim) pairs per frame. The parametrized test
# below expects each frame to populate a single contingency table component.
all_tp_case = pd.DataFrame({
    "obs": pd.Categorical([True] * 3),
    "sim": pd.Categorical([True] * 3),
})
all_fp_case = pd.DataFrame({
    "obs": pd.Categorical([False] * 3),
    "sim": pd.Categorical([True] * 3),
})
all_fn_case = pd.DataFrame({
    "obs": pd.Categorical([True] * 3),
    "sim": pd.Categorical([False] * 3),
})
all_tn_case = pd.DataFrame({
    "obs": pd.Categorical([False] * 3),
    "sim": pd.Categorical([False] * 3),
})

# (fixture, component expected to be non-zero, expected count)
scenarios = [
    (frame, component, 3)
    for frame, component in (
        (all_tp_case, "true_positive"),
        (all_fp_case, "false_positive"),
        (all_fn_case, "false_negative"),
        (all_tn_case, "true_negative"),
    )
]

@pytest.mark.parametrize("data,check,value", scenarios)
def test_compute_contingency_table_scenarios(data, check, value):
    """Only the component named by `check` may be non-zero, and it must equal `value`."""
    table = metrics.compute_contingency_table(data["obs"], data["sim"])

    # Every component other than the one under test is expected to be empty
    for component, val in table.items():
        expected = value if component == check else 0
        assert val == expected

def test_non_series():
    """Plain Python lists are accepted in place of pandas.Series."""
    observed_flags = [True, False, True, False]
    simulated_flags = [True, True, True, True]

    table = metrics.compute_contingency_table(observed_flags, simulated_flags)

    # Component counts
    expected_counts = {
        "true_positive": 2,
        "false_positive": 2,
        "false_negative": 0,
        "true_negative": 0,
    }
    for key, count in expected_counts.items():
        assert table[key] == count

    # Derived scores
    assert metrics.probability_of_detection(table) == 1.0
    assert metrics.probability_of_false_detection(table) == 1.0
8 changes: 4 additions & 4 deletions python/metrics/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@
n_pred = [np.nan, np.nan, np.nan, np.nan]

def test_compute_contingency_table():
obs = pd.Categorical([True, False, False, True, True, True,
False, False, False, False])
sim = pd.Categorical([True, True, True, False, False, False,
False, False, False, False])
obs = pd.Series([True, False, False, True, True, True,
False, False, False, False], dtype="category")
sim = pd.Series([True, True, True, False, False, False,
False, False, False, False], dtype="category")

table = metrics.compute_contingency_table(obs, sim)

Expand Down
29 changes: 29 additions & 0 deletions python/metrics/tests/test_validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
import numpy as np
from hydrotools.metrics import _validation
import pandas as pd

def test_raise_for_non_vector():
x = np.array([[1, 2, 3, 4], [1, 1, 1, 4]])
Expand All @@ -20,3 +21,31 @@ def test_raise_for_inconsistent_shapes():

with pytest.raises(_validation.InconsistentShapesError):
_validation.raise_for_inconsistent_shapes(x, y)

def test_convert_to_boolean_categorical_series():
    """Exercise conversion of lists, plain Series and categorical Series."""
    convert = _validation.convert_to_boolean_categorical_series

    # A non-Series input holding non-boolean values should warn
    with pytest.warns(UserWarning):
        convert([1, 2, 3])

    # A non-categorical Series comes back categorical
    converted = convert(pd.Series([True, True, False]))
    assert hasattr(converted, "cat")

    # Both categories are present even when the data only holds False
    for flag in (True, False):
        all_false = pd.Series([False, False, False], dtype="category")
        converted = convert(all_false)
        assert flag in converted.cat.categories

    # An extra category triggers a warning and is dropped
    with pytest.warns(UserWarning):
        three_way = pd.Series([True, False, "5"], dtype="category")
        converted = convert(three_way)
    categories = converted.cat.categories
    assert len(categories) == 2
    assert True in categories
    assert False in categories