Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache/master' into withOrigin-optimiza…
Browse files Browse the repository at this point in the history
…tions
  • Loading branch information
JoshRosen committed Jun 6, 2024
2 parents d0d8db9 + 0f21df0 commit b463c3f
Show file tree
Hide file tree
Showing 10,947 changed files with 1,121,889 additions and 282,714 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
6 changes: 6 additions & 0 deletions .asf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,9 @@ github:
merge: false
squash: true
rebase: true

notifications:
pullrequests: reviews@spark.apache.org
issues: reviews@spark.apache.org
commits: commits@spark.apache.org
jira_options: link label
11 changes: 10 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Thanks for sending a pull request! Here are some tips for you:
7. If you want to add a new configuration, please read the guideline first for naming configurations in
'core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala'.
8. If you want to add or modify an error type or message, please read the guideline first in
'core/src/main/resources/error/README.md'.
'common/utils/src/main/resources/error/README.md'.
-->

### What changes were proposed in this pull request?
Expand Down Expand Up @@ -47,3 +47,12 @@ If it was tested in a way different from regular unit tests, please clarify how
If tests were not added, please describe why they were not added and/or why it was difficult to add.
If benchmark tests were added, please run the benchmarks in GitHub Actions for the consistent environment, and the instructions could accord to: https://spark.apache.org/developer-tools.html#github-workflow-benchmarks.
-->


### Was this patch authored or co-authored using generative AI tooling?
<!--
If generative AI tooling has been used in the process of authoring this patch, please include the
phrase: 'Generated-by: ' followed by the name of the tool and its version.
If no, write 'No'.
Please refer to the [ASF Generative Tooling Guidance](https://www.apache.org/legal/generative-tooling.html) for details.
-->
308 changes: 197 additions & 111 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,136 +17,222 @@
# under the License.
#

#
# Pull Request Labeler Github Action Configuration: https://github.com/marketplace/actions/labeler
#
# Note that we currently cannot use the negation operator (i.e. `!`) for miniglob matches as they
# would match any file that doesn't touch them. What's needed is the concept of `any`, which takes a
# list of constraints / globs and then matches all of the constraints for either `any` of the files or
# `all` of the files in the change set.
#
# However, `any`/`all` are not supported in a released version and testing off of the `main` branch
# resulted in some other errors when testing.
#
# An issue has been opened upstream requesting that a release be cut that has support for all/any:
# - https://github.com/actions/labeler/issues/111
#
# While we wait for this issue to be handled upstream, we can remove
# the negated / `!` matches for now and at least have labels again.
#
INFRA:
- ".github/**/*"
- "appveyor.yml"
- "tools/**/*"
- "dev/create-release/**/*"
- ".asf.yaml"
- ".gitattributes"
- ".gitignore"
- "dev/github_jira_sync.py"
- "dev/merge_spark_pr.py"
- "dev/run-tests-jenkins*"
- changed-files:
- any-glob-to-any-file: [
'.github/**/*',
'tools/**/*',
'dev/create-release/**/*',
'.asf.yaml',
'.gitattributes',
'.gitignore',
'dev/merge_spark_pr.py',
'dev/run-tests-jenkins*'
]

BUILD:
# Can be supported when a stable release with correct all/any is released
#- any: ['dev/**/*', '!dev/github_jira_sync.py', '!dev/merge_spark_pr.py', '!dev/.rat-excludes']
- "dev/**/*"
- "build/**/*"
- "project/**/*"
- "assembly/**/*"
- "**/*pom.xml"
- "bin/docker-image-tool.sh"
- "bin/find-spark-home*"
- "scalastyle-config.xml"
# These can be added in the above `any` clause (and the /dev/**/* glob removed) when
# `any`/`all` support is released
# - "!dev/github_jira_sync.py"
# - "!dev/merge_spark_pr.py"
# - "!dev/run-tests-jenkins*"
# - "!dev/.rat-excludes"
- changed-files:
- all-globs-to-any-file: [
'dev/**/*',
'!dev/merge_spark_pr.py',
'!dev/run-tests-jenkins*'
]
- any-glob-to-any-file: [
'build/**/*',
'project/**/*',
'assembly/**/*',
'**/*pom.xml',
'bin/docker-image-tool.sh',
'bin/find-spark-home*',
'scalastyle-config.xml'
]

DOCS:
- "docs/**/*"
- "**/README.md"
- "**/CONTRIBUTING.md"
- changed-files:
- any-glob-to-any-file: [
'docs/**/*',
'**/README.md',
'**/CONTRIBUTING.md',
'python/docs/**/*'
]

EXAMPLES:
- "examples/**/*"
- "bin/run-example*"
# CORE needs to be updated when all/any are released upstream.
- changed-files:
- any-glob-to-any-file: [
'examples/**/*',
'bin/run-example*'
]

CORE:
# - any: ["core/**/*", "!**/*UI.scala", "!**/ui/**/*"] # If any file matches all of the globs defined in the list started by `any`, label is applied.
- "core/**/*"
- "common/kvstore/**/*"
- "common/network-common/**/*"
- "common/network-shuffle/**/*"
- "python/pyspark/**/*.py"
- "python/pyspark/tests/**/*.py"
- changed-files:
- all-globs-to-any-file: [
'core/**/*',
'!**/*UI.scala',
'!**/ui/**/*'
]
- any-glob-to-any-file: [
'common/kvstore/**/*',
'common/network-common/**/*',
'common/network-shuffle/**/*',
'python/pyspark/*.py',
'python/pyspark/tests/**/*.py'
]

SPARK SUBMIT:
- "bin/spark-submit*"
- changed-files:
- any-glob-to-any-file: [
'bin/spark-submit*'
]

SPARK SHELL:
- "repl/**/*"
- "bin/spark-shell*"
- changed-files:
- any-glob-to-any-file: [
'repl/**/*',
'bin/spark-shell*'
]

SQL:
#- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming.py", "!python/pyspark/sql/tests/test_streaming.py"]
- "**/sql/**/*"
- "common/unsafe/**/*"
#- "!python/pyspark/sql/avro/**/*"
#- "!python/pyspark/sql/streaming.py"
#- "!python/pyspark/sql/tests/test_streaming.py"
- "bin/spark-sql*"
- "bin/beeline*"
- "sbin/*thriftserver*.sh"
- "**/*SQL*.R"
- "**/DataFrame.R"
- "**/*WindowSpec.R"
- "**/*catalog.R"
- "**/*column.R"
- "**/*functions.R"
- "**/*group.R"
- "**/*schema.R"
- "**/*types.R"
- changed-files:
- all-globs-to-any-file: [
'**/sql/**/*',
'!python/pyspark/sql/avro/**/*',
'!python/pyspark/sql/streaming/**/*',
'!python/pyspark/sql/tests/streaming/test_streaming*.py'
]
- any-glob-to-any-file: [
'common/unsafe/**/*',
'common/sketch/**/*',
'common/variant/**/*',
'bin/spark-sql*',
'bin/beeline*',
'sbin/*thriftserver*.sh',
'**/*SQL*.R',
'**/DataFrame.R',
'**/*WindowSpec.R',
'**/*catalog.R',
'**/*column.R',
'**/*functions.R',
'**/*group.R',
'**/*schema.R',
'**/*types.R'
]

AVRO:
- "external/avro/**/*"
- "python/pyspark/sql/avro/**/*"
- changed-files:
- any-glob-to-any-file: [
'connector/avro/**/*',
'python/pyspark/sql/avro/**/*'
]

DSTREAM:
- "streaming/**/*"
- "data/streaming/**/*"
- "external/kinesis*"
- "external/kafka*"
- "python/pyspark/streaming/**/*"
- changed-files:
- any-glob-to-any-file: [
'streaming/**/*',
'data/streaming/**/*',
'connector/kinesis-asl/**/*',
'connector/kinesis-asl-assembly/**/*',
'connector/kafka-0-10/**/*',
'connector/kafka-0-10-assembly/**/*',
'connector/kafka-0-10-token-provider/**/*',
'python/pyspark/streaming/**/*'
]

GRAPHX:
- "graphx/**/*"
- "data/graphx/**/*"
- changed-files:
- any-glob-to-any-file: [
'graphx/**/*',
'data/graphx/**/*'
]

ML:
- "**/ml/**/*"
- "**/*mllib_*.R"
- changed-files:
- any-glob-to-any-file: [
'**/ml/**/*',
'**/*mllib_*.R'
]

MLLIB:
- "**/spark/mllib/**/*"
- "mllib-local/**/*"
- "python/pyspark/mllib/**/*"
- changed-files:
- any-glob-to-any-file: [
'**/spark/mllib/**/*',
'mllib-local/**/*',
'python/pyspark/mllib/**/*'
]

STRUCTURED STREAMING:
- "**/sql/**/streaming/**/*"
- "external/kafka-0-10-sql/**/*"
- "python/pyspark/sql/streaming.py"
- "python/pyspark/sql/tests/test_streaming.py"
- "**/*streaming.R"
- changed-files:
- any-glob-to-any-file: [
'**/sql/**/streaming/**/*',
'connector/kafka-0-10-sql/**/*',
'python/pyspark/sql/streaming/**/*',
'python/pyspark/sql/tests/streaming/test_streaming*.py',
'**/*streaming.R'
]

PYTHON:
- "bin/pyspark*"
- "**/python/**/*"
- changed-files:
- any-glob-to-any-file: [
'bin/pyspark*',
'**/python/**/*'
]

PANDAS API ON SPARK:
- changed-files:
- any-glob-to-any-file: [
'python/pyspark/pandas/**/*'
]

R:
- "**/r/**/*"
- "**/R/**/*"
- "bin/sparkR*"
- changed-files:
- any-glob-to-any-file: [
'**/r/**/*',
'**/R/**/*',
'bin/sparkR*'
]

YARN:
- "resource-managers/yarn/**/*"
MESOS:
- "resource-managers/mesos/**/*"
- "sbin/*mesos*.sh"
- changed-files:
- any-glob-to-any-file: [
'resource-managers/yarn/**/*'
]

KUBERNETES:
- "resource-managers/kubernetes/**/*"
- changed-files:
- any-glob-to-any-file: [
'resource-managers/kubernetes/**/*'
]

WINDOWS:
- "**/*.cmd"
- "R/pkg/tests/fulltests/test_Windows.R"
- changed-files:
- any-glob-to-any-file: [
'**/*.cmd',
'R/pkg/tests/fulltests/test_Windows.R'
]

WEB UI:
- "**/ui/**/*"
- "**/*UI.scala"
- changed-files:
- any-glob-to-any-file: [
'**/ui/**/*',
'**/*UI.scala'
]

DEPLOY:
- "sbin/**/*"
- changed-files:
- any-glob-to-any-file: [
'sbin/**/*'
]

CONNECT:
- changed-files:
- any-glob-to-any-file: [
'connector/connect/**/*',
'python/pyspark/sql/**/connect/**/*',
'python/pyspark/ml/**/connect/**/*'
]

PROTOBUF:
- changed-files:
- any-glob-to-any-file: [
'connector/protobuf/**/*',
'python/pyspark/sql/protobuf/**/*'
]
Loading

0 comments on commit b463c3f

Please sign in to comment.