Test only affected #7

Merged (24 commits) on Jul 13, 2020

Changes from all commits
139 changes: 5 additions & 134 deletions .github/workflows/master.yml
@@ -75,156 +75,27 @@ jobs:
excluded-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- other tests"
env:
TEST_ONLY_MODULES: ${{ matrix.modules }}
TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
with:
path: build
key: build-${{ hashFiles('**/pom.xml') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-maven-
- name: Cache Ivy local repository
uses: actions/cache@v2
with:
path: ~/.ivy2/cache
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
- name: Install JDK ${{ matrix.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ matrix.java }}
# PySpark
- name: Install PyPy3
# SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
# Note that order of Python installations here matters because default python3 is
# overridden by pypy3.
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: pypy3
architecture: x64
- name: Install Python 2.7
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 2.7
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.6
architecture: x64
- name: Install Python packages
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
python3 -m pip install numpy pyarrow pandas scipy
python3 -m pip list
python2 -m pip install numpy pyarrow pandas scipy
python2 -m pip list
pypy3 -m pip install numpy pandas
pypy3 -m pip list
# SparkR
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
if: contains(matrix.modules, 'sparkr')
with:
r-version: 3.6
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
sudo apt-get install -y libcurl4-openssl-dev
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
run: |
# Hive tests become flaky when running in parallel as it's too intensive.
if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark

# Static analysis, and documentation build
lint:
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
restore-keys: |
docs-maven-
- name: Install JDK 1.8
uses: actions/setup-java@v1
with:
java-version: 1.8
- name: Install Python 3.6
uses: actions/setup-python@v2
with:
python-version: 3.6
architecture: x64
- name: Install Python linter dependencies
run: |
pip3 install flake8 sphinx numpy
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
r-version: 3.6
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
./R/install-dev.sh
- name: Install Ruby 2.7 for documentation generation
uses: actions/setup-ruby@v1
with:
ruby-version: 2.7
- name: Install dependencies for documentation generation
run: |
sudo apt-get install -y libcurl4-openssl-dev pandoc
pip install sphinx mkdocs numpy
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
run: ./dev/lint-scala
- name: Java linter
run: ./dev/lint-java
- name: Python linter
run: ./dev/lint-python
- name: R linter
run: ./dev/lint-r
- name: License test
run: ./dev/check-license
- name: Dependencies test
run: ./dev/test-dependencies.sh
- name: Run documentation build
run: |
cd docs
jekyll build
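The `if:` condition on the Python setup steps earlier in this workflow diff only installs Python interpreters for module sets that actually run Python tests. A minimal sketch of the same predicate, assuming hypothetical `matrix.modules` strings (the real values come from the job's strategy matrix):

```python
def needs_python(modules: str) -> bool:
    # Mirrors the workflow condition:
    #   contains(matrix.modules, 'pyspark')
    #   || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
    # Plain "sql" runs still need Python (e.g. IntegratedUDFTestUtils), while
    # "sql-"-prefixed modules do not.
    return "pyspark" in modules or ("sql" in modules and "sql-" not in modules)

# Hypothetical matrix.modules values, for illustration only:
print(needs_python("pyspark-sql, pyspark-mllib"))  # True
print(needs_python("sql"))                         # True
print(needs_python("sql-kafka-0-10"))              # False
print(needs_python("hive"))                        # False
```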
2 changes: 1 addition & 1 deletion R/README.md
@@ -1,4 +1,4 @@
# R on Spark
# R on Spark.

SparkR is an R package that provides a light-weight frontend to use Spark from R.

150 changes: 98 additions & 52 deletions dev/run-tests.py
@@ -79,17 +79,20 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
True
"""
if target_branch is None and target_ref is None:
raise AttributeError("must specify either target_branch or target_ref")
elif target_branch is not None and target_ref is not None:
if target_branch is not None and target_ref is not None:
raise AttributeError("must specify either target_branch or target_ref, not both")
if target_branch is not None:
diff_target = target_branch
diff_target = [target_branch]
run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
elif target_ref is not None:
diff_target = [target_ref]
else:
diff_target = target_ref
raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
universal_newlines=True)
# If both are not specified, just show the diff from the commit only.
diff_target = []
raw_output = subprocess.check_output(
['git', 'diff', '--no-commit-id', '--name-only', patch_sha] + diff_target,
universal_newlines=True)
print(raw_output)
# Remove any empty strings
return [f for f in raw_output.split('\n') if f]
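A small standalone sketch of how the revised helper now builds the `git diff` command. `build_diff_argv` is a hypothetical stand-in for the command-construction part of `identify_changed_files_from_git_commits`, and the SHA and branch name are made up:

```python
def build_diff_argv(patch_sha, target_branch=None, target_ref=None):
    # At most one of target_branch / target_ref may be given.
    if target_branch is not None and target_ref is not None:
        raise AttributeError("must specify either target_branch or target_ref, not both")
    if target_branch is not None:
        diff_target = [target_branch]
    elif target_ref is not None:
        diff_target = [target_ref]
    else:
        # Neither specified: diff against the commit itself only.
        diff_target = []
    return ["git", "diff", "--no-commit-id", "--name-only", patch_sha] + diff_target

# Hypothetical inputs, for illustration only:
print(build_diff_argv("abc1234", target_branch="master"))
# ['git', 'diff', '--no-commit-id', '--name-only', 'abc1234', 'master']
print(build_diff_argv("abc1234"))
# ['git', 'diff', '--no-commit-id', '--name-only', 'abc1234']
```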

@@ -539,6 +542,24 @@ def parse_opts():
"-p", "--parallelism", type=int, default=8,
help="The number of suites to test in parallel (default %(default)d)"
)
parser.add_argument(
"-m", "--modules", type=str,
default=None,
help="A comma-separated list of modules to test "
"(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
)
parser.add_argument(
"-e", "--excluded-tags", type=str,
default=None,
help="A comma-separated list of tags to exclude in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)
parser.add_argument(
"-i", "--included-tags", type=str,
default=None,
help="A comma-separated list of tags to include in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)

args, unknown = parser.parse_known_args()
if unknown:
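A self-contained sketch of how the new flags are consumed. The parser below is a reduced, hypothetical copy of `parse_opts()`, and the module and tag values are examples only:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--parallelism", type=int, default=8)
parser.add_argument("-m", "--modules", type=str, default=None)
parser.add_argument("-e", "--excluded-tags", type=str, default=None)
parser.add_argument("-i", "--included-tags", type=str, default=None)

# Same shape as the workflow step:
#   ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" \
#       --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
args = parser.parse_args([
    "--parallelism", "2",
    "--modules", "hive,sql",
    "--excluded-tags", "org.apache.spark.tags.ExtendedHiveTest",
])
print([m.strip() for m in args.modules.split(",")])  # ['hive', 'sql']
print(args.excluded_tags)  # org.apache.spark.tags.ExtendedHiveTest
```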
@@ -589,43 +610,63 @@ def main():
# /home/jenkins/anaconda2/envs/py36/bin
os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
else:
# else we're running locally and can use local settings
# else we're running locally or Github Actions.
build_tool = "sbt"
hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
test_env = "local"
if "GITHUB_ACTIONS" in os.environ:
test_env = "github_actions"
else:
test_env = "local"

print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
"and Hive profile", hive_version, "under environment", test_env)
extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

changed_modules = None
test_modules = None
changed_files = None
should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
should_only_test_modules = opts.modules is not None
included_tags = []
excluded_tags = []
if should_only_test_modules:
str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
str_test_modules = [m.strip() for m in opts.modules.split(",")]
test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
# Directly uses test_modules as changed modules to apply tags and environments
# as if all specified test modules are changed.

# If we're running the tests in Github Actions, attempt to detect and test
# only the affected modules.
if test_env == "github_actions":
changed_files = identify_changed_files_from_git_commits(
os.environ["GITHUB_SHA"], target_branch=os.environ.get("GITHUB_BASE_REF", None))
print("changed_files : %s" % changed_files)
test_modules = list(set(determine_modules_to_test(
determine_modules_for_files(changed_files))).intersection(test_modules))
print("test_modules : %s" % test_modules)

changed_modules = test_modules
str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
excluded_tags = []
if str_excluded_tags:
excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
included_tags = []
if str_included_tags:
included_tags = [t.strip() for t in str_included_tags.split(",")]

# If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
# detect modules to test.
elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
target_branch = os.environ["ghprbTargetBranch"]
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
changed_modules = determine_modules_for_files(changed_files)
test_modules = determine_modules_to_test(changed_modules)
excluded_tags = determine_tags_to_exclude(changed_modules)

# If there is no changed module found, tests all.
if not changed_modules:
changed_modules = [modules.root]
excluded_tags = []
if not test_modules:
test_modules = determine_modules_to_test(changed_modules)

str_excluded_tags = opts.excluded_tags
str_included_tags = opts.included_tags
if str_excluded_tags:
excluded_tags.extend([t.strip() for t in str_excluded_tags.split(",")])
if str_included_tags:
included_tags.extend([t.strip() for t in str_included_tags.split(",")])

print("[info] Found the following changed modules:",
", ".join(x.name for x in changed_modules))

@@ -640,8 +681,6 @@ def main():

should_run_java_style_checks = False
if not should_only_test_modules:
test_modules = determine_modules_to_test(changed_modules)

# license checks
run_apache_rat_checks()

@@ -672,37 +711,44 @@ def main():
# if "DOCS" in changed_modules and test_env == "amplab_jenkins":
# build_spark_documentation()

if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
run_build_tests()

# spark build
build_apache_spark(build_tool, extra_profiles)

# backwards compatibility checks
if build_tool == "sbt":
# Note: compatibility tests only supported in sbt for now
detect_binary_inop_with_mima(extra_profiles)
# Since we did not build assembly/package before running dev/mima, we need to
# do it here because the tests still rely on it; see SPARK-13294 for details.
build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

# run the test suites
run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
if modules_with_python_tests:
# We only run PySpark tests with coverage report in one specific job with
# Spark master with SBT in Jenkins.
is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
run_python_tests(
modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
run_python_packaging_tests()
if any(m.should_run_r_tests for m in test_modules):
run_sparkr_tests()
print(changed_modules)
print(test_modules)
print([m for m in test_modules if m.python_test_goals])
print([m.should_run_r_tests for m in test_modules])
print(excluded_tags)
print(included_tags)

# if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
# run_build_tests()
#
# # spark build
# build_apache_spark(build_tool, extra_profiles)
#
# # backwards compatibility checks
# if build_tool == "sbt":
# # Note: compatibility tests only supported in sbt for now
# detect_binary_inop_with_mima(extra_profiles)
# # Since we did not build assembly/package before running dev/mima, we need to
# # do it here because the tests still rely on it; see SPARK-13294 for details.
# build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
#
# # run the test suites
# run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
#
# modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
# if modules_with_python_tests:
# # We only run PySpark tests with coverage report in one specific job with
# # Spark master with SBT in Jenkins.
# is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
# run_python_tests(
# modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
# run_python_packaging_tests()
# if any(m.should_run_r_tests for m in test_modules):
# run_sparkr_tests()


def _test():
if "TEST_ONLY_MODULES" in os.environ:
if "GITHUB_ACTIONS" in os.environ:
# TODO(SPARK-32252): Enable doctests back in Github Actions.
return

2 changes: 1 addition & 1 deletion python/pyspark/worker.py
@@ -16,7 +16,7 @@
#

"""
Worker that receives input from Piped RDD.
Worker that receives input from Piped RDD
"""
from __future__ import print_function
from __future__ import absolute_import