Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Kling-Gupta Efficiency #172

Merged
merged 11 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
102 changes: 102 additions & 0 deletions python/metrics/src/hydrotools/metrics/_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""
=============================
Method Validation for Metrics
=============================
Various methods and objects for validating metric inputs.

Functions
---------
- raise_for_non_vector
- raise_for_inconsistent_shapes

Classes
-------
- InconsistentShapesError
- NonVectorError

"""

import numpy as np
import numpy.typing as npt
from typing import List, Tuple

class InconsistentShapesError(Exception):
    """Raised when two array-like values do not share the same shape.

    Parameters
    ----------
    *args: object
        Positional arguments forwarded to ``Exception``.
    array_shape_1: tuple of int, required keyword
        Shape of the first (reference) array.
    array_shape_2: tuple of int, required keyword
        Shape of the offending array.
    """

    def __init__(self,
        *args: object,
        # Tuple[int, ...] (variable length), not Tuple[int] which would mean a 1-tuple
        array_shape_1: Tuple[int, ...],
        array_shape_2: Tuple[int, ...]
    ) -> None:
        # Store both shapes so __str__ can build an informative message.
        self._array_shape_1 = array_shape_1
        self._array_shape_2 = array_shape_2
        super().__init__(*args)

    def __str__(self) -> str:
        a1s = self._array_shape_1
        a2s = self._array_shape_2
        message = f"Arrays are not the same shape. The array shapes are {a1s} and {a2s}."
        return message

class NonVectorError(Exception):
    """Raised when an array-like value is not 1-dimensional.

    Parameters
    ----------
    *args: object
        Positional arguments forwarded to ``Exception``.
    arr: array-like, required keyword
        The offending array; echoed back in the error message.
    """

    def __init__(self, *args: object, arr: npt.ArrayLike) -> None:
        super().__init__(*args)
        # Keep a reference to the bad input so __str__ can display it.
        self._arr = arr

    def __str__(self) -> str:
        return f"This array is not 1-dimensional\n {self._arr}"

def raise_for_non_vector(
    *arrays: npt.ArrayLike
) -> None:
    """
    Validate that all arrays are 1-dimensional.

    Parameters
    ----------
    arrays: arbitrary number of array-like values, required
        Arrays to check.

    Raises
    ------
    NonVectorError:
        Raises if one of the arrays is not 1-dimensional.
    """
    # NOTE: with *args the annotation is the per-argument type (ArrayLike),
    # not List[ArrayLike].
    # Check each array; asarray avoids copying inputs that are already ndarrays
    for x in arrays:
        if np.asarray(x).ndim != 1:
            raise NonVectorError(arr=x)

def raise_for_inconsistent_shapes(
    *arrays: npt.ArrayLike
) -> None:
    """
    Validate that all arrays are the same shape.

    Parameters
    ----------
    arrays: arbitrary number of array-like values, required
        Arrays to check.

    Raises
    ------
    InconsistentShapesError:
        Raises if all of the arrays are not the same shape.
    """
    # Nothing to compare when fewer than two arrays are given
    # (also avoids an IndexError on an empty call).
    if len(arrays) < 2:
        return

    # Use the first array's shape as the reference
    reference_shape = np.asarray(arrays[0]).shape

    # Compare every remaining shape against the reference. Direct tuple
    # comparison also catches a differing number of dimensions, which a
    # zip-based element-wise check would silently truncate away
    # (e.g. (4,) vs (4, 2)).
    for y in arrays[1:]:
        y_shape = np.asarray(y).shape
        if y_shape != reference_shape:
            raise InconsistentShapesError(
                array_shape_1=reference_shape,
                array_shape_2=y_shape
            )
2 changes: 1 addition & 1 deletion python/metrics/src/hydrotools/metrics/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.1.3"
__version__ = "1.2.0"
81 changes: 76 additions & 5 deletions python/metrics/src/hydrotools/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@
- equitable_threat_score
- mean_squared_error
- nash_sutcliffe_efficiency
- kling_gupta_efficiency

"""

import numpy as np
import numpy.typing as npt
import pandas as pd
from typing import Union, Mapping, MutableMapping
from . import _validation as validate

def mean_squared_error(
y_true: npt.ArrayLike,
Expand All @@ -39,9 +41,9 @@ def mean_squared_error(

Parameters
----------
y_true: array-like of shape (n_samples,) or (n_samples, n_outputs)
y_true: array-like of shape (n_samples,), required
Ground truth (correct) target values, also called observations, measurements, or observed values.
y_pred: pandas.Series, required
y_pred: array-like of shape (n_samples,), required
Estimated target values, also called simulations or modeled values.
root: bool, default False
When True, return the root mean squared error.
Expand All @@ -66,14 +68,14 @@ def nash_sutcliffe_efficiency(
log: bool = False,
normalized: bool = False
) -> float:
"""Compute the NashSutcliffe model efficiency coefficient (NSE), also called the
"""Compute the Nash-Sutcliffe model efficiency coefficient (NSE), also called the
mean squared error skill score or the R^2 (coefficient of determination) regression score.

Parameters
----------
y_true: array-like of shape (n_samples,) or (n_samples, n_outputs)
y_true: array-like of shape (n_samples,), required
Ground truth (correct) target values, also called observations, measurements, or observed values.
y_pred: pandas.Series, required
y_pred: array-like of shape (n_samples,), required
Estimated target values, also called simulations or modeled values.
log: bool, default False
Apply numpy.log (natural logarithm) to y_true and y_pred
Expand All @@ -98,6 +100,12 @@ def nash_sutcliffe_efficiency(
Conference Abstracts (p. 237).

"""
# Raise if not 1-D arrays
validate.raise_for_non_vector(y_true, y_pred)

# Raise if not same shape
validate.raise_for_inconsistent_shapes(y_true, y_pred)

# Optionally transform components
if log:
y_true = np.log(y_true)
Expand All @@ -112,6 +120,69 @@ def nash_sutcliffe_efficiency(
return 1.0 / (1.0 + numerator/denominator)
return 1.0 - numerator/denominator

def kling_gupta_efficiency(
y_true: npt.ArrayLike,
y_pred: npt.ArrayLike,
r_scale: float = 1.0,
a_scale: float = 1.0,
b_scale: float = 1.0
) -> float:
"""Compute the Kling-Gupta model efficiency coefficient (KGE).
Copy link
Member

@aaraney aaraney Feb 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should any software assumptions be stated here? Something like: "This function assumes two sorted arrays of equal length." I would also want to add something that conveys that y_true[0] "lines up with" y_pred[0]. I can't find an appropriate phrase, though.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth mentioning what the shapes of the arrays are assumed to be, i.e. this method should only be applied to 1-dimensional arrays.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This made me realize that the docstring descriptions for these don't make any sense. I copied them from sklearn and forgot to update them for this case.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated docstrings and added new input array validation methods for NSE and KGE. These could certainly be way more rigorous and exhaustive, but it's a start.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: the parameters descriptions explicitly state the expected shape ((n_samples,)).


Parameters
----------
y_true: array-like of shape (n_samples,), required
Ground truth (correct) target values, also called observations, measurements, or observed values.
y_pred: array-like of shape (n_samples,), required
Estimated target values, also called simulations or modeled values.
r_scale: float, optional, default 1.0
Linear correlation (r) scaling factor. Used to re-scale the Euclidean space by
emphasizing different KGE components.
a_scale: float, optional, default 1.0
Relative variability (alpha) scaling factor. Used to re-scale the Euclidean space by
emphasizing different KGE components.
b_scale: float, optional, default 1.0
Relative mean (beta) scaling factor. Used to re-scale the Euclidean space by
emphasizing different KGE components.

Returns
-------
score: float
Kling-Gupta efficiency.

References
----------
Gupta, H. V., Kling, H., Yilmaz, K. K., & Martinez, G. F. (2009). Decomposition of
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

@aaraney aaraney Feb 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on a side note, it would be "cool" if you could get the citation for the paper in bibtex or some other format. I am thinking a decorator that wraps each metric function and contains the bibtex formatted citation. You could access it like:

from hydrotools.metrics import metrics

print(metrics.kling_gupta_efficiency.citation(format="bibtex"))

@article{Gupta2009,
  doi = {10.1016/j.jhydrol.2009.08.003},
  url = {https://doi.org/10.1016/j.jhydrol.2009.08.003},
  year = {2009},
  month = oct,
  publisher = {Elsevier {BV}},
  volume = {377},
  number = {1-2},
  pages = {80--91},
  author = {Hoshin V. Gupta and Harald Kling and Koray K. Yilmaz and Guillermo F. Martinez},
  title = {Decomposition of the mean squared error and {NSE} performance criteria: Implications for improving hydrological modelling},
  journal = {Journal of Hydrology}
}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be cool. Let's discuss that at next hydrotools meeting!

the mean squared error and NSE performance criteria: Implications for improving
hydrological modelling. Journal of hydrology, 377(1-2), 80-91.
https://doi.org/10.1016/j.jhydrol.2009.08.003

"""
# Raise if not 1-D arrays
validate.raise_for_non_vector(y_true, y_pred)

# Raise if not same shape
validate.raise_for_inconsistent_shapes(y_true, y_pred)

# Pearson correlation coefficient
r = np.corrcoef(y_pred, y_true)[0,1]

# Relative variability
a = np.std(y_pred) / np.std(y_true)

# Relative mean
b = np.mean(y_pred) / np.mean(y_true)

# Scaled Euclidean distance
EDs = np.sqrt(
(r_scale * (r - 1.0)) ** 2.0 +
(a_scale * (a - 1.0)) ** 2.0 +
(b_scale * (b - 1.0)) ** 2.0
)

# Return KGE
return 1.0 - EDs

def compute_contingency_table(
observed: pd.Series,
simulated: pd.Series,
Expand Down
28 changes: 27 additions & 1 deletion python/metrics/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,4 +316,30 @@ def test_convert_mapping_values():
converted = metrics.convert_mapping_values(char_series)

assert converted.dtype == np.float64


def test_kling_gupta_efficiency():
    """KGE should match known closed-form values for scaled copies of y_true."""
    # Default (inverse case)
    assert np.isclose(metrics.kling_gupta_efficiency(y_true, y_pred), -1.0)

    observed = np.array(y_true)

    # Pure rescalings of the truth: r == 1, alpha and beta equal the factor.
    cases = [
        (0.5, 1.0 - np.sqrt(0.5)),   # half-scale
        (2.0, 1.0 - np.sqrt(2.0)),   # double-scale
        (1.0, 1.0),                  # identity
    ]
    for factor, expected in cases:
        score = metrics.kling_gupta_efficiency(y_true, observed * factor)
        assert np.isclose(score, expected)

    # Scaling parameters re-weight the KGE components.
    score = metrics.kling_gupta_efficiency(y_true, observed * 0.25,
        r_scale=0.5, a_scale=0.25, b_scale=0.25)
    assert np.isclose(score, 1.0 - np.sqrt(9.0 / 128.0))
22 changes: 22 additions & 0 deletions python/metrics/tests/test_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pytest
import numpy as np
from hydrotools.metrics import _validation

def test_raise_for_non_vector():
    """1-D inputs pass validation; 2-D inputs raise NonVectorError."""
    matrix = np.array([[1, 2, 3, 4], [1, 1, 1, 4]])
    vector = [1, 2, 3, 4]

    # A plain 1-D sequence is accepted without raising.
    _validation.raise_for_non_vector(vector)

    # A 2-D array is rejected.
    with pytest.raises(_validation.NonVectorError):
        _validation.raise_for_non_vector(matrix)

def test_raise_for_inconsistent_shapes():
    """Same-shape inputs pass; mismatched shapes raise InconsistentShapesError."""
    matrix = np.array([[1, 2, 3, 4], [1, 1, 1, 4]])
    vector_a = [1, 2, 3, 4]
    vector_b = [5, 6, 7, 8]

    # Two vectors of equal length are accepted without raising.
    _validation.raise_for_inconsistent_shapes(vector_a, vector_b)

    # A (2, 4) matrix paired with a length-4 vector is rejected.
    with pytest.raises(_validation.InconsistentShapesError):
        _validation.raise_for_inconsistent_shapes(matrix, vector_a)