From 24e50061ff83f660b2f7f7c32e8f828e861ca561 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Mon, 13 Jul 2020 08:31:39 -0700 Subject: [PATCH] [SPARK-32292][SPARK-32252][INFRA] Run the relevant tests only in GitHub Actions This PR mainly proposes to run only relevant tests just like Jenkins PR builder does. Currently, GitHub Actions always run full tests which wastes the resources. In addition, this PR also fixes 3 more issues very closely related together while I am here. 1. The main idea here is: It reuses the existing logic embedded in `dev/run-tests.py` which Jenkins PR builder use in order to run only the related test cases. 2. While I am here, I fixed SPARK-32292 too to run the doc tests. It was because other references were not available when it is cloned via `checkoutv2`. With `fetch-depth: 0`, the history is available. 3. In addition, it fixes the `dev/run-tests.py` to match with `python/run-tests.py` in terms of its options. Environment variables such as `TEST_ONLY_XXX` were moved as proper options. For example, ```bash dev/run-tests.py --modules sql,core ``` which is consistent with `python/run-tests.py`, for example, ```bash python/run-tests.py --modules pyspark-core,pyspark-ml ``` 4. Lastly, also fixed the formatting issue in module specification in the matrix: ```diff - network_common, network_shuffle, repl, launcher + network-common, network-shuffle, repl, launcher, ``` which incorrectly runs build/test the modules. By running only related tests, we can hugely save the resources and avoid unrelated flaky tests, etc. Also, now it runs the doctest of `dev/run-tests.py` properly, the usages are similar between `dev/run-tests.py` and `python/run-tests.py`, and run `network-common`, `network-shuffle`, `launcher` and `examples` modules too. No, dev-only. Manually tested in my own forked Spark: https://github.com/HyukjinKwon/spark/pull/7 https://github.com/HyukjinKwon/spark/pull/8 https://github.com/HyukjinKwon/spark/pull/9 https://github.com/HyukjinKwon/spark/pull/10 https://github.com/HyukjinKwon/spark/pull/11 https://github.com/HyukjinKwon/spark/pull/12 Closes #29086 from HyukjinKwon/SPARK-32292. Authored-by: Hyukjin Kwon Signed-off-by: Dongjoon Hyun --- .github/workflows/master.yml | 16 +++-- dev/run-tests.py | 109 +++++++++++++++++++++++++++-------- 2 files changed, 96 insertions(+), 29 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 3396c981e0a1f..844f3e569f790 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -29,7 +29,7 @@ jobs: modules: - |- core, unsafe, kvstore, avro, - network_common, network_shuffle, repl, launcher + network-common, network-shuffle, repl, launcher, examples, sketch, graphx - |- catalyst, hive-thriftserver @@ -75,16 +75,20 @@ jobs: excluded-tags: org.apache.spark.tags.ExtendedSQLTest comment: "- other tests" env: - TEST_ONLY_MODULES: ${{ matrix.modules }} - TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }} - TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }} + MODULES_TO_TEST: ${{ matrix.modules }} + EXCLUDED_TAGS: ${{ matrix.excluded-tags }} + INCLUDED_TAGS: ${{ matrix.included-tags }} HADOOP_PROFILE: ${{ matrix.hadoop }} HIVE_PROFILE: ${{ matrix.hive }} # GitHub Actions' default miniconda to use in pip packaging test. CONDA_PREFIX: /usr/share/miniconda + GITHUB_PREV_SHA: ${{ github.event.before }} steps: - name: Checkout Spark repository uses: actions/checkout@v2 + # In order to fetch changed files + with: + fetch-depth: 0 # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - name: Cache Scala, SBT, Maven and Zinc uses: actions/cache@v1 @@ -161,9 +165,9 @@ jobs: - name: "Run tests: ${{ matrix.modules }}" run: | # Hive tests become flaky when running in parallel as it's too intensive. - if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi + if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi mkdir -p ~/.m2 - ./dev/run-tests --parallelism 2 + ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" rm -rf ~/.m2/repository/org/apache/spark # Static analysis, and documentation build diff --git a/dev/run-tests.py b/dev/run-tests.py index 18afe59e30dc8..eb678c64657fa 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -101,12 +101,14 @@ def setup_test_environ(environ): os.environ[k] = v -def determine_modules_to_test(changed_modules): +def determine_modules_to_test(changed_modules, deduplicated=True): """ Given a set of modules that have changed, compute the transitive closure of those modules' dependent modules in order to determine the set of modules that should be tested. Returns a topologically-sorted list of modules (ties are broken by sorting on module names). + If ``deduplicated`` is disabled, the modules are returned without tacking the deduplication + by dependencies into account. >>> [x.name for x in determine_modules_to_test([modules.root])] ['root'] @@ -122,11 +124,29 @@ def determine_modules_to_test(changed_modules): ... # doctest: +NORMALIZE_WHITESPACE ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml'] + >>> sorted([x.name for x in determine_modules_to_test( + ... [modules.sparkr, modules.sql], deduplicated=False)]) + ... # doctest: +NORMALIZE_WHITESPACE + ['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml', + 'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10'] + >>> sorted([x.name for x in determine_modules_to_test( + ... [modules.sql, modules.core], deduplicated=False)]) + ... # doctest: +NORMALIZE_WHITESPACE + ['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver', + 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib', + 'pyspark-sql', 'pyspark-streaming', 'repl', 'root', + 'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', + 'streaming-kinesis-asl'] """ modules_to_test = set() for module in changed_modules: - modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + modules_to_test = modules_to_test.union( + determine_modules_to_test(module.dependent_modules, deduplicated)) modules_to_test = modules_to_test.union(set(changed_modules)) + + if not deduplicated: + return modules_to_test + # If we need to run all of the tests, then we should short-circuit and return 'root' if modules.root in modules_to_test: return [modules.root] @@ -538,6 +558,24 @@ def parse_opts(): "-p", "--parallelism", type=int, default=8, help="The number of suites to test in parallel (default %(default)d)" ) + parser.add_argument( + "-m", "--modules", type=str, + default=None, + help="A comma-separated list of modules to test " + "(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules])) + ) + parser.add_argument( + "-e", "--excluded-tags", type=str, + default=None, + help="A comma-separated list of tags to exclude in the tests, " + "e.g., org.apache.spark.tags.ExtendedHiveTest " + ) + parser.add_argument( + "-i", "--included-tags", type=str, + default=None, + help="A comma-separated list of tags to include in the tests, " + "e.g., org.apache.spark.tags.ExtendedHiveTest " + ) args, unknown = parser.parse_known_args() if unknown: @@ -588,43 +626,74 @@ def main(): # /home/jenkins/anaconda2/envs/py36/bin os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH") else: - # else we're running locally and can use local settings + # else we're running locally or Github Actions. build_tool = "sbt" hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7") hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") - test_env = "local" + if "GITHUB_ACTIONS" in os.environ: + test_env = "github_actions" + else: + test_env = "local" print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, "and Hive profile", hive_version, "under environment", test_env) extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) changed_modules = None + test_modules = None changed_files = None - should_only_test_modules = "TEST_ONLY_MODULES" in os.environ + should_only_test_modules = opts.modules is not None included_tags = [] + excluded_tags = [] if should_only_test_modules: - str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")] + str_test_modules = [m.strip() for m in opts.modules.split(",")] test_modules = [m for m in modules.all_modules if m.name in str_test_modules] - # Directly uses test_modules as changed modules to apply tags and environments - # as if all specified test modules are changed. + + # If we're running the tests in Github Actions, attempt to detect and test + # only the affected modules. + if test_env == "github_actions": + if os.environ["GITHUB_BASE_REF"] != "": + # Pull requests + changed_files = identify_changed_files_from_git_commits( + os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"]) + else: + # Build for each commit. + changed_files = identify_changed_files_from_git_commits( + os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"]) + + modules_to_test = determine_modules_to_test( + determine_modules_for_files(changed_files), deduplicated=False) + + if modules.root not in modules_to_test: + # If root module is not found, only test the intersected modules. + # If root module is found, just run the modules as specified initially. + test_modules = list(set(modules_to_test).intersection(test_modules)) + changed_modules = test_modules - str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None) - str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None) - excluded_tags = [] - if str_excluded_tags: - excluded_tags = [t.strip() for t in str_excluded_tags.split(",")] - included_tags = [] - if str_included_tags: - included_tags = [t.strip() for t in str_included_tags.split(",")] + if len(changed_modules) == 0: + print("[info] There are no modules to test, exiting without testing.") + return + + # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and + # detect modules to test. elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) + test_modules = determine_modules_to_test(changed_modules) excluded_tags = determine_tags_to_exclude(changed_modules) + # If there is no changed module found, tests all. if not changed_modules: changed_modules = [modules.root] - excluded_tags = [] + if not test_modules: + test_modules = determine_modules_to_test(changed_modules) + + if opts.excluded_tags: + excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")]) + if opts.included_tags: + included_tags.extend([t.strip() for t in opts.included_tags.split(",")]) + print("[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules)) @@ -639,8 +708,6 @@ def main(): should_run_java_style_checks = False if not should_only_test_modules: - test_modules = determine_modules_to_test(changed_modules) - # license checks run_apache_rat_checks() @@ -701,10 +768,6 @@ def main(): def _test(): - if "TEST_ONLY_MODULES" in os.environ: - # TODO(SPARK-32252): Enable doctests back in Github Actions. - return - import doctest failure_count = doctest.testmod()[0] if failure_count: