Skip to content

Commit

Permalink
[SPARK-48842][DOCS] Document non-determinism of max_by and min_by
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Document non-determinism of max_by and min_by

### Why are the changes needed?
I have been confused by this non-determinism twice; each time it looked like a correctness bug to me.
So I think we need to document it.

### Does this PR introduce _any_ user-facing change?
doc change only

### How was this patch tested?
ci

### Was this patch authored or co-authored using generative AI tooling?
no

Closes apache#47266 from zhengruifeng/py_doc_max_by.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
  • Loading branch information
zhengruifeng committed Jul 12, 2024
1 parent 0e940e2 commit 5bbe9c8
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 0 deletions.
6 changes: 6 additions & 0 deletions R/pkg/R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,9 @@ setMethod("max",
#' @details
#' \code{max_by}: Returns the value associated with the maximum value of ord.
#'
#' Note: The function is non-deterministic. When several rows share the maximum
#' value of ord, any of the associated values of `x` may be returned.
#'
#' @rdname column_aggregate_functions
#' @aliases max_by max_by,Column-method
#' @note max_by since 3.3.0
Expand Down Expand Up @@ -1633,6 +1636,9 @@ setMethod("min",
#' @details
#' \code{min_by}: Returns the value associated with the minimum value of ord.
#'
#' Note: The function is non-deterministic. When several rows share the minimum
#' value of ord, any of the associated values of `x` may be returned.
#'
#' @rdname column_aggregate_functions
#' @aliases min_by min_by,Column-method
#' @note min_by since 3.3.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,10 @@ object functions {
/**
* Aggregate function: returns the value associated with the maximum value of ord.
*
* @note
* The function is non-deterministic: when several rows share the maximum value of `ord`,
* any of the associated values of `e` may be returned.
*
* @group agg_funcs
* @since 3.4.0
*/
Expand Down Expand Up @@ -932,6 +936,10 @@ object functions {
/**
* Aggregate function: returns the value associated with the minimum value of ord.
*
* @note
* The function is non-deterministic: when several rows share the minimum value of `ord`,
* any of the associated values of `e` may be returned.
*
* @group agg_funcs
* @since 3.4.0
*/
Expand Down
10 changes: 10 additions & 0 deletions python/pyspark/sql/functions/builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,6 +1271,11 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
.. versionchanged:: 3.4.0
Supports Spark Connect.

Notes
-----
The function is non-deterministic: when several rows share the maximum value of
`ord`, any of the associated values of `col` may be returned.

Parameters
----------
col : :class:`~pyspark.sql.Column` or str
Expand Down Expand Up @@ -1352,6 +1357,11 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
.. versionchanged:: 3.4.0
Supports Spark Connect.

Notes
-----
The function is non-deterministic: when several rows share the minimum value of
`ord`, any of the associated values of `col` may be returned.

Parameters
----------
col : :class:`~pyspark.sql.Column` or str
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ abstract class MaxMinBy extends DeclarativeAggregate with BinaryLike[Expression]
> SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
b
""",
note = """
The function is non-deterministic: when several rows share the maximum value
of `y`, any of the associated values of `x` may be returned.
""",
group = "agg_funcs",
since = "3.0.0")
case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {
Expand All @@ -122,6 +126,10 @@ case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMin
> SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
a
""",
note = """
The function is non-deterministic: when several rows share the minimum value
of `y`, any of the associated values of `x` may be returned.
""",
group = "agg_funcs",
since = "3.0.0")
case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {
Expand Down
6 changes: 6 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,9 @@ object functions {
/**
* Aggregate function: returns the value associated with the maximum value of ord.
*
* @note The function is non-deterministic: when several rows share the maximum value of
* `ord`, any of the associated values of `e` may be returned.
*
* @group agg_funcs
* @since 3.3.0
*/
Expand Down Expand Up @@ -952,6 +955,9 @@ object functions {
/**
* Aggregate function: returns the value associated with the minimum value of ord.
*
* @note The function is non-deterministic: when several rows share the minimum value of
* `ord`, any of the associated values of `e` may be returned.
*
* @group agg_funcs
* @since 3.3.0
*/
Expand Down

0 comments on commit 5bbe9c8

Please sign in to comment.