diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a7e337d3f9af2..b91124f96a6fa 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1558,6 +1558,9 @@ setMethod("max", #' @details #' \code{max_by}: Returns the value associated with the maximum value of ord. #' +#' Note: The function is non-deterministic so the output order can be different +#' for those associated with the same values of `x`. +#' #' @rdname column_aggregate_functions #' @aliases max_by max_by,Column-method #' @note max_by since 3.3.0 @@ -1633,6 +1636,9 @@ setMethod("min", #' @details #' \code{min_by}: Returns the value associated with the minimum value of ord. #' +#' Note: The function is non-deterministic so the output order can be different +#' for those associated with the same values of `x`. +#' #' @rdname column_aggregate_functions #' @aliases min_by min_by,Column-method #' @note min_by since 3.3.0 diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 92e7bc9da5904..81f25b3d743f0 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -884,6 +884,10 @@ object functions { /** * Aggregate function: returns the value associated with the maximum value of ord. * + * @note + * The function is non-deterministic so the output order can be different for those associated + * with the same values of `e`. + * * @group agg_funcs * @since 3.4.0 */ @@ -932,6 +936,10 @@ object functions { /** * Aggregate function: returns the value associated with the minimum value of ord. * + * @note + * The function is non-deterministic so the output order can be different for those associated + * with the same values of `e`.
+ * * @group agg_funcs * @since 3.4.0 */ diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 1ca522313f246..446ff2b1be93d 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -1271,6 +1271,11 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + Notes + ----- + The function is non-deterministic so the output order can be different for those + associated with the same values of `col`. + Parameters ---------- col : :class:`~pyspark.sql.Column` or str @@ -1352,6 +1357,11 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + Notes + ----- + The function is non-deterministic so the output order can be different for those + associated with the same values of `col`. + Parameters ---------- col : :class:`~pyspark.sql.Column` or str diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala index 56941c9de4510..b33142ed29cc5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala @@ -99,6 +99,10 @@ abstract class MaxMinBy extends DeclarativeAggregate with BinaryLike[Expression] > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y); b """, + note = """ + The function is non-deterministic so the output order can be different for + those associated with the same values of `x`.
+ """, group = "agg_funcs", since = "3.0.0") case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy { @@ -122,6 +126,10 @@ case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMin > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y); a """, + note = """ + The function is non-deterministic so the output order can be different for + those associated with the same values of `x`. + """, group = "agg_funcs", since = "3.0.0") case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 882918eb78c7f..5b4d27fc65d01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -902,6 +902,9 @@ object functions { /** * Aggregate function: returns the value associated with the maximum value of ord. * + * @note The function is non-deterministic so the output order can be different for + * those associated with the same values of `e`. + * * @group agg_funcs * @since 3.3.0 */ @@ -952,6 +955,9 @@ object functions { /** * Aggregate function: returns the value associated with the minimum value of ord. * + * @note The function is non-deterministic so the output order can be different for + * those associated with the same values of `e`. + * * @group agg_funcs * @since 3.3.0 */