From 5bbe9c850aaaf31327b81d893ed513033a129e08 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 12 Jul 2024 12:41:07 +0800 Subject: [PATCH] [SPARK-48842][DOCS] Document non-determinism of max_by and min_by ### What changes were proposed in this pull request? Document non-determinism of max_by and min_by ### Why are the changes needed? I have been confused by this non-determinism twice; it looked like a correctness bug to me. So I think we need to document it ### Does this PR introduce _any_ user-facing change? doc change only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #47266 from zhengruifeng/py_doc_max_by. Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- R/pkg/R/functions.R | 6 ++++++ .../main/scala/org/apache/spark/sql/functions.scala | 8 ++++++++ python/pyspark/sql/functions/builtin.py | 10 ++++++++++ .../catalyst/expressions/aggregate/MaxByAndMinBy.scala | 8 ++++++++ .../main/scala/org/apache/spark/sql/functions.scala | 6 ++++++ 5 files changed, 38 insertions(+) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a7e337d3f9af2..b91124f96a6fa 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1558,6 +1558,9 @@ setMethod("max", #' @details #' \code{max_by}: Returns the value associated with the maximum value of ord. #' +#' Note: The function is non-deterministic so the output order can be different +#' for those associated with the same values of `x`. +#' #' @rdname column_aggregate_functions #' @aliases max_by max_by,Column-method #' @note max_by since 3.3.0 @@ -1633,6 +1636,9 @@ setMethod("min", #' @details #' \code{min_by}: Returns the value associated with the minimum value of ord. #' +#' Note: The function is non-deterministic so the output order can be different +#' for those associated with the same values of `x`.
+#' #' @rdname column_aggregate_functions #' @aliases min_by min_by,Column-method #' @note min_by since 3.3.0 diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 92e7bc9da5904..81f25b3d743f0 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -884,6 +884,10 @@ object functions { /** * Aggregate function: returns the value associated with the maximum value of ord. * + * @note + * The function is non-deterministic so the output order can be different for those associated + * with the same values of `e`. + * * @group agg_funcs * @since 3.4.0 */ @@ -932,6 +936,10 @@ object functions { /** * Aggregate function: returns the value associated with the minimum value of ord. * + * @note + * The function is non-deterministic so the output order can be different for those associated + * with the same values of `e`. + * * @group agg_funcs * @since 3.4.0 */ diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 1ca522313f246..446ff2b1be93d 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -1271,6 +1271,11 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + Notes + ----- + The function is non-deterministic so the output order can be different for those + associated with the same values of `col`. + Parameters ---------- col : :class:`~pyspark.sql.Column` or str @@ -1352,6 +1357,11 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + Notes + ----- + The function is non-deterministic so the output order can be different for those + associated with the same values of `col`.
+ Parameters ---------- col : :class:`~pyspark.sql.Column` or str diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala index 56941c9de4510..b33142ed29cc5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala @@ -99,6 +99,10 @@ abstract class MaxMinBy extends DeclarativeAggregate with BinaryLike[Expression] > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y); b """, + note = """ + The function is non-deterministic so the output order can be different for + those associated with the same values of `x`. + """, group = "agg_funcs", since = "3.0.0") case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy { @@ -122,6 +126,10 @@ case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMin > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y); a """, + note = """ + The function is non-deterministic so the output order can be different for + those associated with the same values of `x`. + """, group = "agg_funcs", since = "3.0.0") case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 882918eb78c7f..5b4d27fc65d01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -902,6 +902,9 @@ object functions { /** * Aggregate function: returns the value associated with the maximum value of ord.
* + * @note The function is non-deterministic so the output order can be different for + * those associated with the same values of `e`. + * * @group agg_funcs * @since 3.3.0 */ @@ -952,6 +955,9 @@ object functions { /** * Aggregate function: returns the value associated with the minimum value of ord. * + * @note The function is non-deterministic so the output order can be different for + * those associated with the same values of `e`. + * * @group agg_funcs * @since 3.3.0 */