Run standalone tests in batches (#13673)
carmocca committed Jul 18, 2022
1 parent 0449e86 commit d058190
Showing 2 changed files with 36 additions and 17 deletions.
39 changes: 32 additions & 7 deletions tests/tests_pytorch/run_standalone_tests.sh
@@ -18,7 +18,7 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
-defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no'
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --no-header'
 
 # find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
 grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')
@@ -40,22 +40,47 @@ parametrizations_arr=($parametrizations)
 # tests to skip - space separated
 blocklist='profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx utilities/test_warnings.py'
 report=''
+test_batch_size=6
+
+rm -f standalone_test_output.txt  # in case it exists, remove it
+function show_batched_output {
+  if [ -f standalone_test_output.txt ]; then  # if exists
+    cat standalone_test_output.txt
+    rm standalone_test_output.txt
+  fi
+}
+trap show_batched_output EXIT  # show the output on exit
 
 for i in "${!parametrizations_arr[@]}"; do
   parametrization=${parametrizations_arr[$i]}
 
   # check blocklist
   if echo $blocklist | grep -F "${parametrization}"; then
     report+="Skipped\t$parametrization\n"
-    continue
+    # do not `continue` the loop because we might need to wait for batched jobs
+  else
+    echo "Running $parametrization"
+    # execute the test in the background
+    # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
+    # output to std{out,err} because the outputs would be garbled together
+    python ${defaults} "$parametrization" &>> standalone_test_output.txt &
+    # save the PID in an array
+    pids[${i}]=$!
+    # add row to the final report
+    report+="Ran\t$parametrization\n"
+  fi
 
-  # run the test
-  echo "Running $parametrization"
-  python ${defaults} "$parametrization"
-
-  report+="Ran\t$parametrization\n"
+  if ((($i + 1) % $test_batch_size == 0)); then
+    # wait for running tests
+    for pid in ${pids[*]}; do wait $pid; done
+    unset pids  # empty the array
+    show_batched_output
+  fi
 done
+# wait for leftover tests
+for pid in ${pids[*]}; do wait $pid; done
+show_batched_output
+echo "Batched mode finished. Continuing with the rest of standalone tests."
 
 if nvcc --version; then
   nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
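For readers less familiar with shell job control, the core change is a batch-and-wait pattern: each test is launched as a background job with its output redirected to a shared log (so parallel runs cannot interleave on stdout/stderr), and after every `test_batch_size` launches the script waits for the whole batch and flushes the log. Below is a minimal Python sketch of the same idea; the command list is a hypothetical stand-in, not part of the commit.

import subprocess

# Hypothetical stand-ins: the real script discovers pytest parametrizations via grep.
commands = [["python", "-c", f"print('test {i} done')"] for i in range(14)]
test_batch_size = 6  # mirrors the batch size chosen in the script

procs = []
for i, cmd in enumerate(commands):
    # Launch in the background, capturing output so concurrent runs cannot
    # garble each other (the script's `&>> standalone_test_output.txt`).
    procs.append(subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True))
    if (i + 1) % test_batch_size == 0:
        # Wait for the whole batch, then flush its buffered output
        # (the script's `wait $pid` loop followed by `show_batched_output`).
        for p in procs:
            print(p.communicate()[0], end="")
        procs.clear()
# Wait for the leftover, partially filled batch.
for p in procs:
    print(p.communicate()[0], end="")

Capping concurrency at a fixed batch size bounds GPU memory and CPU contention while still overlapping the slow multi-process standalone tests.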
14 changes: 4 additions & 10 deletions tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -26,7 +26,7 @@
 from torch.utils.data import DataLoader
 from torchmetrics import Accuracy
 
-from pytorch_lightning import LightningDataModule, LightningModule, seed_everything, Trainer
+from pytorch_lightning import LightningDataModule, LightningModule, Trainer
 from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
 from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
 from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin
@@ -712,7 +712,6 @@ def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config
 @pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)])
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
-    seed_everything(1)
     if automatic_optimization:
         model = ModelParallelClassificationModel()
     else:
@@ -734,9 +733,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization
     trainer.fit(model, datamodule=dm)
 
     results = trainer.test(datamodule=dm)
-    assert results[0]["test_acc"] > 0.7
     saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
-    assert saved_results[0]["test_acc"] > 0.7
     assert saved_results == results
 
     if automatic_optimization:
@@ -752,9 +749,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization
         enable_progress_bar=False,
         enable_model_summary=False,
     )
-
-    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
-    assert results[0]["test_acc"] > 0.7
+    trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
 
 
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
@@ -861,7 +856,6 @@ def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) ->
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
     """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
-    seed_everything(42)
 
     class VerificationCallback(Callback):
         def __init__(self):
@@ -1109,7 +1103,7 @@ def test_dataloader(self):
 @pytest.mark.parametrize("max_epoch", [2])
 @pytest.mark.parametrize("limit_train_batches", [2])
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
-def test_scheduler_step_count(mock_step, max_epoch, limit_train_batches, interval):
+def test_scheduler_step_count(mock_step, tmpdir, max_epoch, limit_train_batches, interval):
     """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is
     set to step or epoch."""
 
@@ -1124,7 +1118,7 @@ def configure_optimizers(self):
 
     model = TestModel()
     trainer = Trainer(
-        default_root_dir=os.getcwd(),
+        default_root_dir=tmpdir,
         limit_train_batches=limit_train_batches,
        limit_val_batches=0,
         max_epochs=max_epoch,
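The switch from `default_root_dir=os.getcwd()` to `default_root_dir=tmpdir` matters once tests run concurrently in batches: artifacts written into the shared working directory could collide across tests, while pytest's `tmpdir` fixture hands every test its own temporary directory. A minimal sketch of that isolation, with an illustrative file name (not from the commit):

import os

def test_artifacts_are_isolated(tmpdir):
    # pytest injects a unique per-test temporary directory, so artifacts
    # such as checkpoints cannot collide with those of other tests running
    # in the same batch, nor pollute the repository checkout.
    path = os.path.join(str(tmpdir), "checkpoint.ckpt")
    with open(path, "w") as f:
        f.write("state")
    assert os.path.isfile(path)

Likewise, after dropping the `seed_everything` calls and the accuracy thresholds, the remaining `assert saved_results == results` still checks checkpoint fidelity without depending on a particular seed or on reaching a given accuracy.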
