diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index e66d08400b46b..246427a87b918 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -857,10 +857,10 @@ def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameL ... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]}) >>> df.groupby("A").sum().sort_index() - B C + B C D A - 1 1 6 - 2 1 8 + 1 1 6 ab + 2 1 8 aa >>> df.groupby("D").sum().sort_index() A B C @@ -900,17 +900,17 @@ def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameL unsupported = [ col.name for col in self._agg_columns - if not isinstance(col.spark.data_type, (NumericType, BooleanType)) + if not isinstance(col.spark.data_type, (NumericType, BooleanType, StringType)) ] if len(unsupported) > 0: log_advice( - "GroupBy.sum() can only support numeric and bool columns even if" + "GroupBy.sum() can only support numeric, bool and string columns even if " f"numeric_only=False, skip unsupported columns: {unsupported}" ) return self._reduce_for_stat_function( F.sum, - accepted_spark_types=(NumericType, BooleanType), + accepted_spark_types=(NumericType, BooleanType, StringType), bool_to_numeric=True, min_count=min_count, ) @@ -3534,7 +3534,21 @@ def _reduce_for_stat_function( for label in psdf._internal.column_labels: psser = psdf._psser_for(label) input_scol = psser._dtype_op.nan_to_null(psser).spark.column - output_scol = sfun(input_scol) + if sfun.__name__ == "sum" and isinstance( + psdf._internal.spark_type_for(label), StringType + ): + input_scol_name = psser._internal.data_spark_column_names[0] + # Sort data with natural order column to ensure order of data + sorted_array = F.array_sort( + F.collect_list(F.struct(NATURAL_ORDER_COLUMN_NAME, input_scol)) + ) + + # Using transform to extract strings + output_scol = F.concat_ws( + "", F.transform(sorted_array, lambda x: x.getField(input_scol_name)) + ) + else: + output_scol = sfun(input_scol) if 
min_count > 0: output_scol = F.when( @@ -3591,7 +3605,9 @@ def _prepare_reduce( ): agg_columns.append(psser) sdf = self._psdf._internal.spark_frame.select( - *groupkey_scols, *[psser.spark.column for psser in agg_columns] + *groupkey_scols, + *[psser.spark.column for psser in agg_columns], + NATURAL_ORDER_COLUMN_NAME, ) internal = InternalFrame( spark_frame=sdf, diff --git a/python/pyspark/pandas/tests/groupby/test_groupby.py b/python/pyspark/pandas/tests/groupby/test_groupby.py index 543ceff86256f..e162bed756bd6 100644 --- a/python/pyspark/pandas/tests/groupby/test_groupby.py +++ b/python/pyspark/pandas/tests/groupby/test_groupby.py @@ -59,9 +59,6 @@ def test_groupby_simple(self): }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9], ) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - # TODO(SPARK-43295): Make DataFrameGroupBy.sum support for string type columns - pdf = pdf[["a", "b", "c", "e"]] psdf = ps.from_pandas(pdf) for as_index in [True, False]: @@ -180,9 +177,6 @@ def sort(df): index=[0, 1, 3, 5, 6, 8, 9, 9, 9], ) psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - # TODO(SPARK-43295): Make DataFrameGroupBy.sum support for string type columns - pdf = pdf[[10, 20, 30]] for as_index in [True, False]: if as_index: diff --git a/python/pyspark/pandas/tests/groupby/test_stat.py b/python/pyspark/pandas/tests/groupby/test_stat.py index 44bb3b0070914..bc78e02c90e39 100644 --- a/python/pyspark/pandas/tests/groupby/test_stat.py +++ b/python/pyspark/pandas/tests/groupby/test_stat.py @@ -113,7 +113,7 @@ def test_basic_stat_funcs(self): # self._test_stat_func(lambda groupby_obj: groupby_obj.sum(), check_exact=False) self.assert_eq( psdf.groupby("A").sum().sort_index(), - pdf.groupby("A").sum(numeric_only=True).sort_index(), + pdf.groupby("A").sum().sort_index(), check_exact=False, ) diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py index 
23d1b04dd3d33..5a8b1e3792016 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py @@ -66,7 +66,7 @@ def sort(df): self.assert_eq( sort(psdf1.groupby(psdf2.a, as_index=as_index).sum()), - sort(pdf1.groupby(pdf2.a, as_index=as_index).sum(numeric_only=True)), + sort(pdf1.groupby(pdf2.a, as_index=as_index).sum()), almost=as_index, ) @@ -93,7 +93,7 @@ def test_groupby_multiindex_columns(self): self.assert_eq( psdf1.groupby(psdf2[("x", "a")]).sum().sort_index(), - pdf1.groupby(pdf2[("x", "a")]).sum(numeric_only=True).sort_index(), + pdf1.groupby(pdf2[("x", "a")]).sum().sort_index(), ) self.assert_eq( @@ -102,7 +102,7 @@ def test_groupby_multiindex_columns(self): .sort_values(("y", "c")) .reset_index(drop=True), pdf1.groupby(pdf2[("x", "a")], as_index=False) - .sum(numeric_only=True) + .sum() .sort_values(("y", "c")) .reset_index(drop=True), )