Test only affected #7

Merged (24 commits) on Jul 13, 2020

Changes from all commits
139 changes: 5 additions & 134 deletions .github/workflows/master.yml
@@ -75,156 +75,27 @@ jobs:
excluded-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- other tests"
env:
TEST_ONLY_MODULES: ${{ matrix.modules }}
TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
with:
path: build
key: build-${{ hashFiles('**/pom.xml') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-maven-
- name: Cache Ivy local repository
uses: actions/cache@v2
with:
path: ~/.ivy2/cache
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
- name: Install JDK ${{ matrix.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ matrix.java }}
# PySpark
- name: Install PyPy3
# SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
# Note that order of Python installations here matters because default python3 is
# overridden by pypy3.
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: pypy3
architecture: x64
- name: Install Python 2.7
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 2.7
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.6
architecture: x64
- name: Install Python packages
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
python3 -m pip install numpy pyarrow pandas scipy
python3 -m pip list
python2 -m pip install numpy pyarrow pandas scipy
python2 -m pip list
pypy3 -m pip install numpy pandas
pypy3 -m pip list
# SparkR
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
if: contains(matrix.modules, 'sparkr')
with:
r-version: 3.6
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
sudo apt-get install -y libcurl4-openssl-dev
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
run: |
# Hive tests become flaky when running in parallel as it's too intensive.
if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark

# Static analysis, and documentation build
lint:
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
restore-keys: |
docs-maven-
- name: Install JDK 1.8
uses: actions/setup-java@v1
with:
java-version: 1.8
- name: Install Python 3.6
uses: actions/setup-python@v2
with:
python-version: 3.6
architecture: x64
- name: Install Python linter dependencies
run: |
pip3 install flake8 sphinx numpy
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
r-version: 3.6
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
./R/install-dev.sh
- name: Install Ruby 2.7 for documentation generation
uses: actions/setup-ruby@v1
with:
ruby-version: 2.7
- name: Install dependencies for documentation generation
run: |
sudo apt-get install -y libcurl4-openssl-dev pandoc
pip install sphinx mkdocs numpy
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
run: ./dev/lint-scala
- name: Java linter
run: ./dev/lint-java
- name: Python linter
run: ./dev/lint-python
- name: R linter
run: ./dev/lint-r
- name: License test
run: ./dev/check-license
- name: Dependencies test
run: ./dev/test-dependencies.sh
- name: Run documentation build
run: |
cd docs
jekyll build
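The `if:` condition on the Python setup steps earlier in this workflow diff only installs Python interpreters for module sets that actually run Python tests. A minimal sketch of the same predicate, assuming hypothetical `matrix.modules` strings (the real values come from the job's strategy matrix):

```python
def needs_python(modules: str) -> bool:
    # Mirrors the workflow condition:
    #   contains(matrix.modules, 'pyspark')
    #   || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
    # Plain "sql" runs still need Python (e.g. IntegratedUDFTestUtils), while
    # "sql-"-prefixed modules do not.
    return "pyspark" in modules or ("sql" in modules and "sql-" not in modules)

# Hypothetical matrix.modules values, for illustration only:
print(needs_python("pyspark-sql, pyspark-mllib"))  # True
print(needs_python("sql"))                         # True
print(needs_python("sql-kafka-0-10"))              # False
print(needs_python("hive"))                        # False
```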
2 changes: 1 addition & 1 deletion R/README.md
@@ -1,4 +1,4 @@
# R on Spark
# R on Spark.

SparkR is an R package that provides a light-weight frontend to use Spark from R.

150 changes: 98 additions & 52 deletions dev/run-tests.py
@@ -79,17 +79,20 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
True
"""
if target_branch is None and target_ref is None:
raise AttributeError("must specify either target_branch or target_ref")
elif target_branch is not None and target_ref is not None:
if target_branch is not None and target_ref is not None:
raise AttributeError("must specify either target_branch or target_ref, not both")
if target_branch is not None:
diff_target = target_branch
diff_target = [target_branch]
run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
elif target_ref is not None:
diff_target = [target_ref]
else:
diff_target = target_ref
raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
universal_newlines=True)
# If both are not specified, just show the diff from the commit only.
diff_target = []
raw_output = subprocess.check_output(
['git', 'diff', '--no-commit-id', '--name-only', patch_sha] + diff_target,
universal_newlines=True)
print(raw_output)
# Remove any empty strings
return [f for f in raw_output.split('\n') if f]
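A small standalone sketch of how the revised helper now builds the `git diff` command. `build_diff_argv` is a hypothetical stand-in for the command-construction part of `identify_changed_files_from_git_commits`, and the SHA and branch name are made up:

```python
def build_diff_argv(patch_sha, target_branch=None, target_ref=None):
    # At most one of target_branch / target_ref may be given.
    if target_branch is not None and target_ref is not None:
        raise AttributeError("must specify either target_branch or target_ref, not both")
    if target_branch is not None:
        diff_target = [target_branch]
    elif target_ref is not None:
        diff_target = [target_ref]
    else:
        # Neither specified: diff against the commit itself only.
        diff_target = []
    return ["git", "diff", "--no-commit-id", "--name-only", patch_sha] + diff_target

# Hypothetical inputs, for illustration only:
print(build_diff_argv("abc1234", target_branch="master"))
# ['git', 'diff', '--no-commit-id', '--name-only', 'abc1234', 'master']
print(build_diff_argv("abc1234"))
# ['git', 'diff', '--no-commit-id', '--name-only', 'abc1234']
```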

@@ -539,6 +542,24 @@ def parse_opts():
"-p", "--parallelism", type=int, default=8,
help="The number of suites to test in parallel (default %(default)d)"
)
parser.add_argument(
"-m", "--modules", type=str,
default=None,
help="A comma-separated list of modules to test "
"(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
)
parser.add_argument(
"-e", "--excluded-tags", type=str,
default=None,
help="A comma-separated list of tags to exclude in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)
parser.add_argument(
"-i", "--included-tags", type=str,
default=None,
help="A comma-separated list of tags to include in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)

args, unknown = parser.parse_known_args()
if unknown:
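A self-contained sketch of how the new flags are consumed. The parser below is a reduced, hypothetical copy of `parse_opts()`, and the module and tag values are examples only:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--parallelism", type=int, default=8)
parser.add_argument("-m", "--modules", type=str, default=None)
parser.add_argument("-e", "--excluded-tags", type=str, default=None)
parser.add_argument("-i", "--included-tags", type=str, default=None)

# Same shape as the workflow step:
#   ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" \
#       --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
args = parser.parse_args([
    "--parallelism", "2",
    "--modules", "hive,sql",
    "--excluded-tags", "org.apache.spark.tags.ExtendedHiveTest",
])
print([m.strip() for m in args.modules.split(",")])  # ['hive', 'sql']
print(args.excluded_tags)  # org.apache.spark.tags.ExtendedHiveTest
```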
@@ -589,43 +610,63 @@ def main():
# /home/jenkins/anaconda2/envs/py36/bin
os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
else:
# else we're running locally and can use local settings
# else we're running locally or Github Actions.
build_tool = "sbt"
hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
test_env = "local"
if "GITHUB_ACTIONS" in os.environ:
test_env = "github_actions"
else:
test_env = "local"

print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
"and Hive profile", hive_version, "under environment", test_env)
extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

changed_modules = None
test_modules = None
changed_files = None
should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
should_only_test_modules = opts.modules is not None
included_tags = []
excluded_tags = []
if should_only_test_modules:
str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
str_test_modules = [m.strip() for m in opts.modules.split(",")]
test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
# Directly uses test_modules as changed modules to apply tags and environments
# as if all specified test modules are changed.

# If we're running the tests in Github Actions, attempt to detect and test
# only the affected modules.
if test_env == "github_actions":
changed_files = identify_changed_files_from_git_commits(
os.environ["GITHUB_SHA"], target_branch=os.environ.get("GITHUB_BASE_REF", None))
print("changed_files : %s" % changed_files)
test_modules = list(set(determine_modules_to_test(
determine_modules_for_files(changed_files))).intersection(test_modules))
print("test_modules : %s" % test_modules)

changed_modules = test_modules
str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
excluded_tags = []
if str_excluded_tags:
excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
included_tags = []
if str_included_tags:
included_tags = [t.strip() for t in str_included_tags.split(",")]

# If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
# detect modules to test.
elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
target_branch = os.environ["ghprbTargetBranch"]
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
changed_modules = determine_modules_for_files(changed_files)
test_modules = determine_modules_to_test(changed_modules)
excluded_tags = determine_tags_to_exclude(changed_modules)

# If there is no changed module found, tests all.
if not changed_modules:
changed_modules = [modules.root]
excluded_tags = []
if not test_modules:
test_modules = determine_modules_to_test(changed_modules)

str_excluded_tags = opts.excluded_tags
str_included_tags = opts.included_tags
if str_excluded_tags:
excluded_tags.extend([t.strip() for t in str_excluded_tags.split(",")])
if str_included_tags:
included_tags.extend([t.strip() for t in str_included_tags.split(",")])

print("[info] Found the following changed modules:",
", ".join(x.name for x in changed_modules))

@@ -640,8 +681,6 @@ def main():

should_run_java_style_checks = False
if not should_only_test_modules:
test_modules = determine_modules_to_test(changed_modules)

# license checks
run_apache_rat_checks()

@@ -672,37 +711,44 @@ def main():
# if "DOCS" in changed_modules and test_env == "amplab_jenkins":
# build_spark_documentation()

if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
run_build_tests()

# spark build
build_apache_spark(build_tool, extra_profiles)

# backwards compatibility checks
if build_tool == "sbt":
# Note: compatibility tests only supported in sbt for now
detect_binary_inop_with_mima(extra_profiles)
# Since we did not build assembly/package before running dev/mima, we need to
# do it here because the tests still rely on it; see SPARK-13294 for details.
build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

# run the test suites
run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
if modules_with_python_tests:
# We only run PySpark tests with coverage report in one specific job with
# Spark master with SBT in Jenkins.
is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
run_python_tests(
modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
run_python_packaging_tests()
if any(m.should_run_r_tests for m in test_modules):
run_sparkr_tests()
print(changed_modules)
print(test_modules)
print([m for m in test_modules if m.python_test_goals])
print([m.should_run_r_tests for m in test_modules])
print(excluded_tags)
print(included_tags)

# if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
# run_build_tests()
#
# # spark build
# build_apache_spark(build_tool, extra_profiles)
#
# # backwards compatibility checks
# if build_tool == "sbt":
# # Note: compatibility tests only supported in sbt for now
# detect_binary_inop_with_mima(extra_profiles)
# # Since we did not build assembly/package before running dev/mima, we need to
# # do it here because the tests still rely on it; see SPARK-13294 for details.
# build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
#
# # run the test suites
# run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
#
# modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
# if modules_with_python_tests:
# # We only run PySpark tests with coverage report in one specific job with
# # Spark master with SBT in Jenkins.
# is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
# run_python_tests(
# modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
# run_python_packaging_tests()
# if any(m.should_run_r_tests for m in test_modules):
# run_sparkr_tests()


def _test():
if "TEST_ONLY_MODULES" in os.environ:
if "GITHUB_ACTIONS" in os.environ:
# TODO(SPARK-32252): Enable doctests back in Github Actions.
return

2 changes: 1 addition & 1 deletion python/pyspark/worker.py
@@ -16,7 +16,7 @@
#

"""
Worker that receives input from Piped RDD.
Worker that receives input from Piped RDD
"""
from __future__ import print_function
from __future__ import absolute_import