diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh
index 01d16b95364bd..76bf460915421 100644
--- a/tests/tests_pytorch/run_standalone_tests.sh
+++ b/tests/tests_pytorch/run_standalone_tests.sh
@@ -18,7 +18,7 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
-defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no'
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --no-header'
 
 # find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
 grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')
@@ -40,6 +40,16 @@ parametrizations_arr=($parametrizations)
 # tests to skip - space separated
 blocklist='profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx utilities/test_warnings.py'
 report=''
+test_batch_size=6
+
+rm -f standalone_test_output.txt  # in case it exists, remove it
+function show_batched_output {
+  if [ -f standalone_test_output.txt ]; then  # if exists
+    cat standalone_test_output.txt
+    rm standalone_test_output.txt
+  fi
+}
+trap show_batched_output EXIT  # show the output on exit
 
 for i in "${!parametrizations_arr[@]}"; do
   parametrization=${parametrizations_arr[$i]}
@@ -47,15 +57,30 @@ for i in "${!parametrizations_arr[@]}"; do
   # check blocklist
   if echo $blocklist | grep -F "${parametrization}"; then
     report+="Skipped\t$parametrization\n"
-    continue
+    # do not continue the loop because we might need to wait for batched jobs
+  else
+    echo "Running $parametrization"
+    # execute the test in the background
+    # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
+    # output to std{out,err} because the outputs would be garbled together
+    python ${defaults} "$parametrization" &>> standalone_test_output.txt &
+    # save the PID in an array
+    pids[${i}]=$!
+    # add row to the final report
+    report+="Ran\t$parametrization\n"
   fi
 
-  # run the test
-  echo "Running $parametrization"
-  python ${defaults} "$parametrization"
-
-  report+="Ran\t$parametrization\n"
+  if ((($i + 1) % $test_batch_size == 0)); then
+    # wait for running tests
+    for pid in ${pids[*]}; do wait $pid; done
+    unset pids  # empty the array
+    show_batched_output
+  fi
 done
 
+# wait for leftover tests
+for pid in ${pids[*]}; do wait $pid; done
+show_batched_output
+echo "Batched mode finished. Continuing with the rest of standalone tests."
 if nvcc --version; then
   nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
index 41faee02f315d..d77319249b23d 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -26,7 +26,7 @@
 from torch.utils.data import DataLoader
 from torchmetrics import Accuracy
 
-from pytorch_lightning import LightningDataModule, LightningModule, seed_everything, Trainer
+from pytorch_lightning import LightningDataModule, LightningModule, Trainer
 from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
 from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
 from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin
@@ -712,7 +712,6 @@ def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config
 @pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)])
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
-    seed_everything(1)
     if automatic_optimization:
         model = ModelParallelClassificationModel()
     else:
@@ -734,9 +733,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization
     trainer.fit(model, datamodule=dm)
 
     results = trainer.test(datamodule=dm)
-    assert results[0]["test_acc"] > 0.7
     saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
-    assert saved_results[0]["test_acc"] > 0.7
     assert saved_results == results
 
     if automatic_optimization:
@@ -752,9 +749,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization
         enable_progress_bar=False,
         enable_model_summary=False,
     )
-
-    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
-    assert results[0]["test_acc"] > 0.7
+    trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
 
 
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
@@ -861,7 +856,6 @@ def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) ->
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
     """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
-    seed_everything(42)
 
     class VerificationCallback(Callback):
         def __init__(self):
@@ -1109,7 +1103,7 @@ def test_dataloader(self):
 @pytest.mark.parametrize("max_epoch", [2])
 @pytest.mark.parametrize("limit_train_batches", [2])
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
-def test_scheduler_step_count(mock_step, max_epoch, limit_train_batches, interval):
+def test_scheduler_step_count(mock_step, tmpdir, max_epoch, limit_train_batches, interval):
     """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is set
     to step or epoch."""
 
@@ -1124,7 +1118,7 @@ def configure_optimizers(self):
 
     model = TestModel()
     trainer = Trainer(
-        default_root_dir=os.getcwd(),
+        default_root_dir=tmpdir,
         limit_train_batches=limit_train_batches,
         limit_val_batches=0,
         max_epochs=max_epoch,
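
Note (not part of the patch): the change to run_standalone_tests.sh above boils down to a batch-and-wait pattern: start each standalone test as a background job, remember its PID, wait for the whole group once test_batch_size jobs have been launched, and flush a shared log file between batches so the background output does not interleave on stdout. The sketch below only illustrates that pattern; the `tests` array, the `batch_size` variable, and the bare `python -m pytest` invocation are placeholder assumptions, not the script's actual values.

```bash
#!/bin/bash
# Minimal sketch of the batch-and-wait pattern (illustrative only).
batch_size=6                     # assumption: mirrors test_batch_size in the patch
log=standalone_test_output.txt   # shared log so background jobs don't garble stdout

rm -f "$log"
trap 'cat "$log" 2>/dev/null; rm -f "$log"' EXIT   # dump any remaining buffered output on exit

tests=("$@")                     # hypothetical: test ids passed as script arguments
for i in "${!tests[@]}"; do
  # run each test in the background, appending its output to the shared log
  python -m pytest "${tests[$i]}" &>> "$log" &
  pids[$i]=$!                    # remember the job's PID
  if (((i + 1) % batch_size == 0)); then
    # a batch is full: block until every job in it has finished
    for pid in "${pids[@]}"; do wait "$pid"; done
    unset pids
    cat "$log"; : > "$log"       # flush and truncate the buffered output
  fi
done
# wait for any leftover jobs from the final, partially filled batch;
# the EXIT trap prints whatever output they produced
for pid in "${pids[@]}"; do wait "$pid"; done
```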