Run standalone tests in batches #13673

Merged (17 commits) on Jul 18, 2022
39 changes: 32 additions & 7 deletions tests/tests_pytorch/run_standalone_tests.sh
@@ -18,7 +18,7 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
-defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no'
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --no-header'
 
 # find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
 grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')
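
As an aside on the discovery step above: grep is used instead of pytest collection because scanning the sources for the literal marker is much faster than importing and collecting the whole suite. A rough Python equivalent of that discovery pass is sketched below; the function name and the simplified substring match are illustrative, not part of the PR, and the script additionally expands each matching file into individual test parametrizations.

# A minimal sketch of the grep-based discovery, assuming only that standalone
# tests are marked with the literal text `standalone=True` in the source.
from pathlib import Path

def find_standalone_test_files(root: str = ".") -> list:
    matches = []
    for path in Path(root).rglob("*.py"):
        # substring check stands in for grep --word-regexp 'standalone=True'
        if "standalone=True" in path.read_text(errors="ignore"):
            matches.append(str(path))
    return sorted(matches)
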
@@ -40,22 +40,47 @@ parametrizations_arr=($parametrizations)
 # tests to skip - space separated
 blocklist='profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx utilities/test_warnings.py'
 report=''
+test_batch_size=6
 
+rm -f standalone_test_output.txt  # in case it exists, remove it
+function show_batched_output {
+  if [ -f standalone_test_output.txt ]; then  # if exists
+    cat standalone_test_output.txt
+    rm standalone_test_output.txt
+  fi
+}
+trap show_batched_output EXIT  # show the output on exit
+
 for i in "${!parametrizations_arr[@]}"; do
   parametrization=${parametrizations_arr[$i]}
 
   # check blocklist
   if echo $blocklist | grep -F "${parametrization}"; then
     report+="Skipped\t$parametrization\n"
-    continue
+    # do not continue the loop because we might need to wait for batched jobs
+  else
+    echo "Running $parametrization"
+    # execute the test in the background
+    # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
+    # output to std{out,err} because the outputs would be garbled together
+    python ${defaults} "$parametrization" &>> standalone_test_output.txt &
+    # save the PID in an array
+    pids[${i}]=$!
+    # add row to the final report
+    report+="Ran\t$parametrization\n"
   fi
 
-  # run the test
-  echo "Running $parametrization"
-  python ${defaults} "$parametrization"
-
-  report+="Ran\t$parametrization\n"
+  if ((($i + 1) % $test_batch_size == 0)); then
+    # wait for running tests
+    for pid in ${pids[*]}; do wait $pid; done
+    unset pids  # empty the array
+    show_batched_output
+  fi
 done
+# wait for leftover tests
+for pid in ${pids[*]}; do wait $pid; done
+show_batched_output
+echo "Batched mode finished. Continuing with the rest of standalone tests."
 
 if nvcc --version; then
   nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
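
The new loop implements a small batch scheduler in bash: each test is launched in the background with its output appended to a shared log file (concurrent writers would otherwise garble the console), its PID is recorded, and after every test_batch_size iterations the script waits on the whole batch and flushes the log; the EXIT trap flushes whatever remains if the script dies early. A rough Python sketch of the same pattern follows; run_in_batches, launch, and flush_log are illustrative names under those assumptions, not part of the PR.

# A minimal sketch of the same batching pattern in Python.
import subprocess
import sys
from pathlib import Path

LOG = Path("standalone_test_output.txt")

def launch(test: str) -> subprocess.Popen:
    # append the child's output to the shared log: concurrent tests must not
    # write to the console directly or their outputs would be garbled together
    with LOG.open("ab") as log:
        return subprocess.Popen(
            [sys.executable, "-m", "pytest", test, "--no-header"],
            stdout=log,
            stderr=subprocess.STDOUT,
        )

def flush_log() -> None:
    # counterpart of the script's `show_batched_output`: print, then reset
    if LOG.exists():
        print(LOG.read_text(), end="")
        LOG.unlink()

def run_in_batches(tests, batch_size=6, blocklist=()):
    report, procs = [], []
    for i, test in enumerate(tests):
        if any(blocked in test for blocked in blocklist):
            report.append(("Skipped", test))
        else:
            procs.append(launch(test))
            report.append(("Ran", test))
        # the batch-boundary check runs on every iteration, skipped tests included
        if (i + 1) % batch_size == 0:
            for proc in procs:
                proc.wait()  # block until the whole batch is done
            procs.clear()
            flush_log()
    for proc in procs:
        proc.wait()  # wait for leftover tests from a partial final batch
    flush_log()
    return report

As in the script, the batch-boundary check runs on every iteration, including skipped ones, which is why the original continue statement had to be replaced by an if/else.
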
10 changes: 2 additions & 8 deletions tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -26,7 +26,7 @@
 from torch.utils.data import DataLoader
 from torchmetrics import Accuracy
 
-from pytorch_lightning import LightningDataModule, LightningModule, seed_everything, Trainer
+from pytorch_lightning import LightningDataModule, LightningModule, Trainer
 from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
 from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
 from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin
@@ -712,7 +712,6 @@ def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config
 @pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)])
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
-    seed_everything(1)
     if automatic_optimization:
         model = ModelParallelClassificationModel()
     else:
@@ -734,9 +733,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
     trainer.fit(model, datamodule=dm)
 
     results = trainer.test(datamodule=dm)
-    assert results[0]["test_acc"] > 0.7
     saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
-    assert saved_results[0]["test_acc"] > 0.7
     assert saved_results == results
 
     if automatic_optimization:
@@ -752,9 +749,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
         enable_progress_bar=False,
         enable_model_summary=False,
     )
-
-    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
-    assert results[0]["test_acc"] > 0.7
+    trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
 
 
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
@@ -861,7 +856,6 @@ def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) ->
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
     """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
-    seed_everything(42)
 
     class VerificationCallback(Callback):
         def __init__(self):
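
The test changes follow one pattern: the absolute-threshold assertions (test_acc > 0.7), and the seed_everything calls they presumably depended on, are dropped in favor of a relative check that metrics from the restored best checkpoint equal the metrics from the in-memory model. Below is a minimal self-contained sketch of that pattern, assuming the BoringModel forward and dataloaders from pytorch_lightning.demos behave as in the Lightning demos; the subclass name and test function are illustrative, not the PR's test itself.

# A minimal sketch of the checkpoint-consistency check, not the PR's test.
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.demos.boring_classes import BoringModel

class LoggingBoringModel(BoringModel):
    def test_step(self, batch, batch_idx):
        # log a metric so trainer.test() returns a non-empty result dict
        self.log("test_loss", self(batch).sum())

def test_checkpoint_roundtrip_metrics(tmp_path):
    ck = ModelCheckpoint(dirpath=tmp_path, save_last=True)
    trainer = pl.Trainer(
        default_root_dir=tmp_path,
        max_epochs=1,
        callbacks=[ck],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    model = LoggingBoringModel()
    trainer.fit(model)

    results = trainer.test(model)  # metrics from the in-memory weights
    saved_results = trainer.test(model, ckpt_path=ck.last_model_path)
    # a lossless save/load round trip must reproduce the exact same metrics
    assert saved_results == results

No fixed seed is required for this style of assertion: whatever the trained weights turn out to be, testing the restored checkpoint must reproduce the in-memory metrics exactly.
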