diff --git a/.flake8 b/.flake8 index 3837b2b01b..6efeae09f5 100644 --- a/.flake8 +++ b/.flake8 @@ -43,6 +43,8 @@ per-file-ignores = libensemble/gen_funcs/persistent_aposmm.py:E402, E501 libensemble/tests/regression_tests/test_persistent_aposmm*:E402 libensemble/tests/regression_tests/dont_run_test_persistent_aposmm*:E402 + libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py:E402 + libensemble/tests/regression_tests/dontrun_test_persistent_gp_multitask_ax.py:E402 libensemble/tests/regression_tests/test_uniform_sampling_then_persistent_localopt_runs.py:E402 libensemble/tests/functionality_tests/test_active_persistent_worker_abort.py:E402 libensemble/tests/deprecated_tests/test_old_aposmm*:E402 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1e209ee1c5..9161f5375c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,6 +42,17 @@ jobs: comms-type: l mpi-version: 'mpich' do-balsam: true + - os: windows-latest + python-version: '3.10' + comms-type: l + mpi-version: 'msmpi' + do-balsam: false + - os: windows-latest + python-version: '3.10' + comms-type: m + mpi-version: 'msmpi' + do-balsam: false + env: HYDRA_LAUNCHER: 'fork' TERM: xterm-256color @@ -68,6 +79,13 @@ jobs: python --version pip install -I --upgrade certifi + - name: Windows - Add clang path to $PATH env + shell: bash + if: matrix.os == 'windows-latest' + run: | + echo "PATH=$PATH:C:\msys64\mingw64\bin" >> $GITHUB_ENV + echo "PATH=$PATH:C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\amd64" >> $GITHUB_ENV + - name: Install Ubuntu compilers if: matrix.os == 'ubuntu-latest' run: conda install gcc_linux-64 @@ -90,20 +108,30 @@ jobs: mkdir ../sdk; tar xf MacOSX10.14.sdk.tar.xz -C ../sdk conda install clang_osx-64=9.0.1 - - name: Install MPI, mpi4py from conda - if: matrix.python-version != '3.10' + - name: Setup MPI (${{ matrix.mpi-version }}) + uses: mpi4py/setup-mpi@v1 + if: matrix.os == 'windows-latest' + with: + mpi: ${{ 
matrix.mpi-version }} + + - name: Install mpi4py on Windows + if: matrix.os == 'windows-latest' + run: pip install mpi4py + + - name: Install mpi4py and MPI from conda + if: matrix.python-version != '3.10' && matrix.os != 'windows-latest' run: | conda install ${{ matrix.mpi-version }} conda install mpi4py - - name: Install MPI, mpi4py from pip - if: matrix.python-version == '3.10' + - name: Install mpi4py from pip, MPI from conda + if: matrix.python-version == '3.10' && matrix.os != 'windows-latest' run: | conda install ${{ matrix.mpi-version }} pip install mpi4py - name: Install generator dependencies - if: contains('3.7_3.8_3.9_3.10', matrix.python-version) && matrix.do-balsam == false + if: contains('3.7_3.8_3.9_3.10', matrix.python-version) && matrix.do-balsam == false && matrix.os != 'windows-latest' run: | python -m pip install --upgrade pip conda install nlopt @@ -111,13 +139,22 @@ jobs: conda install superlu_dist conda install hypre conda install mumps-mpi - # pip install petsc - # pip install petsc4py + conda install petsc + conda install petsc4py pip install DFO-LS pip install mpmath + pip install ax-platform python -m pip install --upgrade git+https://github.com/mosesyhc/surmise.git@development/PCGPwM + - name: Install some generator dependencies on Windows + if: matrix.os == 'windows-latest' + run: | + python -m pip install --upgrade pip + conda install nlopt + conda install scipy + pip install mpmath + - name: Install generator dependencies for Ubuntu tests if: matrix.os == 'ubuntu-latest' && matrix.do-balsam == false run: | @@ -132,21 +169,24 @@ jobs: cd heffte/build pwd cmake -D CMAKE_BUILD_TYPE=Release -D BUILD_SHARED_LIBS=ON -D CMAKE_INSTALL_PREFIX=./ -D Heffte_ENABLE_AVX=ON -D Heffte_ENABLE_FFTW=ON ../ - make + make -j 4 make install cp ./benchmarks/speed3d_c2c ../../libensemble/tests/regression_tests/ # end heffte build and dependencies - pip install dragonfly-opt - pip install torch - pip install gpytorch + # pip install dragonfly-opt + pip 
install git+https://github.com/dragonfly/dragonfly.git + pip install ax-platform - name: Install other testing dependencies + if: matrix.do-balsam == false run: | pip install -r install/testing_requirements.txt pip install psutil pip install pyyaml + pip install funcx + pip install balsam - name: Install Tasmanian on Ubuntu if: matrix.os == 'ubuntu-latest' && matrix.do-balsam == false @@ -158,6 +198,7 @@ jobs: env: BALSAM_DB_PATH: $HOME/test-balsam run: | + pip install -r install/testing_requirements.txt wget https://github.com/argonne-lcf/balsam/archive/refs/tags/0.5.0.tar.gz mkdir ../balsam; tar xf 0.5.0.tar.gz -C ../balsam; python install/configure_balsam_install.py @@ -176,11 +217,8 @@ jobs: - name: Find MPI, Install libEnsemble, flake8, ulimit adjust run: | - python install/find_mpi.py - mpiexec --version pip install -e . flake8 libensemble - ulimit -Sn 10000 - name: Activate API unit test if using mpich if: matrix.mpi-version == 'mpich' @@ -192,10 +230,10 @@ jobs: run: | ./libensemble/tests/run-tests.sh -e -A "-W error" -z -${{ matrix.comms-type }} - #- name: Run simple tests, Ubuntu, Python 3.11 - # if: matrix.python-version == '3.11' - # run: | - # ./libensemble/tests/run-tests.sh -A "-W error" -z -${{ matrix.comms-type }} + - name: Run simple tests, Windows + if: matrix.os == 'windows-latest' + run: | + ./libensemble/tests/run-tests.sh -A "-W error" -z -${{ matrix.comms-type }} - name: Run extensive tests, macOS if: matrix.os == 'macos-latest' diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 815baa34bc..de3e46c31b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -19,9 +19,9 @@ New capabilities: * Added configuration options for `libE_stats.txt` file. #743 * Support for `spawn` and `forkserver` multiprocessing start methods. #797 - * Note that macOS no longer switches to using `fork`. macOS (since Python 3.8) and Windows - default to using `spawn`. When using `spawn`, we recommend placing calling script code in - an ``if __name__ == "__main__":`` block. 
The multiprocessing interface can be used to switch methods (https://docs.python.org/3/library/multiprocessing.html#multiprocessing.set_start_method). + * Note that macOS no longer switches to using `fork`. macOS (since Python 3.8) and Windows default to + using `spawn`. When using `spawn`, we recommend placing calling script code in an ``if __name__ == "__main__":`` block. + The multiprocessing interface can be used to switch methods (https://docs.python.org/3/library/multiprocessing.html#multiprocessing.set_start_method). Updates to example functions: diff --git a/README.rst b/README.rst index c09becb1be..dd16366995 100644 --- a/README.rst +++ b/README.rst @@ -31,8 +31,8 @@ Introduction to libEnsemble libEnsemble is a Python_ toolkit for coordinating workflows of asynchronous and dynamic ensembles of calculations. -libEnsemble can help users take advantage of massively parallel resources to solve design, -decision, and inference problems and expand the class of problems that can benefit from +libEnsemble helps users take advantage of massively parallel resources to solve design, +decision, and inference problems and expands the class of problems that can benefit from increased parallelism. libEnsemble aims for: @@ -58,7 +58,7 @@ With a basic familiarity of Python and NumPy_, users can easily incorporate any other mathematics, machine-learning, or resource-management libraries into libEnsemble workflows. -libEnsemble employs a manager/worker scheme that runs on MPI, multiprocessing, +libEnsemble employs a manager/worker scheme that communicates via MPI, multiprocessing, or TCP. Workers control and monitor any level of work using the aforementioned generator and simulator functions, from small subnode tasks to huge many-node computations. 
@@ -66,12 +66,15 @@ libEnsemble includes an Executor interface so application-launching functions ar portable, resilient, and flexible; it also automatically detects available nodes and cores, and can dynamically assign resources to workers. +libEnsemble performs best on Unix-like systems like Linux and macOS. See the +:ref:`FAQ` for more information. + .. before_dependencies_rst_tag Dependencies ~~~~~~~~~~~~ -Required dependencies: +**Required dependencies**: * Python_ 3.7 or above * NumPy_ @@ -83,7 +86,7 @@ When using ``mpi4py`` for libEnsemble communications: * A functional MPI 1.x/2.x/3.x implementation, such as MPICH_, built with shared/dynamic libraries * mpi4py_ v2.0.0 or above -Optional dependencies: +**Optional dependencies**: * Balsam_ @@ -103,6 +106,12 @@ a function-as-a-service platform to which workers can submit remote generator or simulator function instances. This feature can help distribute an ensemble across systems and heterogeneous resources. +* `psi-j-python`_ + +As of v0.9.2+dev, libEnsemble features a set of command-line utilities for submitting +libEnsemble jobs to almost any system or scheduler via a `PSI/J`_ Python interface. tqdm_ +is also required. + The example simulation and generation functions and tests require the following: * SciPy_ @@ -229,7 +238,8 @@ Resources **Further Information:** - Documentation is provided by ReadtheDocs_. -- An overview of libEnsemble's structure and capabilities is given in this manuscript_ and poster_ +- Contributions_ to libEnsemble are welcome. +- An overview of libEnsemble's structure and capabilities is given in this manuscript_ and poster_. - Examples of production user functions and complete workflows can be viewed, downloaded, and contributed to in the libEnsemble `Community Examples repository`_. **Citation:** @@ -243,7 +253,7 @@ Resources author = {Stephen Hudson and Jeffrey Larson and Stefan M. 
Wild and David Bindel and John-Luke Navarro}, institution = {Argonne National Laboratory}, - number = {Revision 0.9.2}, + number = {Revision 0.9.2+dev}, year = {2022}, url = {https://buildmedia.readthedocs.org/media/pdf/libensemble/latest/libensemble.pdf} } @@ -305,6 +315,7 @@ See a complete list of `example user scripts`_. .. _Community Examples repository: https://github.com/Libensemble/libe-community-examples .. _Conda: https://docs.conda.io/en/latest/ .. _conda-forge: https://conda-forge.org/ +.. _Contributions: https://github.com/Libensemble/libensemble/blob/main/CONTRIBUTING.rst .. _Coveralls: https://coveralls.io/github/Libensemble/libensemble?branch=main .. _DEAP: https://deap.readthedocs.io/en/master/overview.html .. _DFO-LS: https://github.com/numericalalgorithmsgroup/dfols @@ -330,6 +341,8 @@ See a complete list of `example user scripts`_. .. _petsc4py: https://bitbucket.org/petsc/petsc4py .. _PETSc/TAO: http://www.mcs.anl.gov/petsc .. _poster: https://figshare.com/articles/libEnsemble_A_Python_Library_for_Dynamic_Ensemble-Based_Computations/12559520 +.. _PSI/J: https://exaworks.org/psij +.. _psi-j-python: https://github.com/ExaWorks/psi-j-python .. _psutil: https://pypi.org/project/psutil/ .. _PyPI: https://pypi.org .. _pytest-cov: https://pypi.org/project/pytest-cov/ @@ -348,6 +361,7 @@ See a complete list of `example user scripts`_. .. _tarball: https://github.com/Libensemble/libensemble/releases/latest .. _Tasmanian: https://tasmanian.ornl.gov/ .. _Theta: https://www.alcf.anl.gov/alcf-resources/theta +.. _tqdm: https://tqdm.github.io/ .. _user guide: https://libensemble.readthedocs.io/en/latest/programming_libE.html .. _VTMOP: https://github.com/Libensemble/libe-community-examples#vtmop .. _WarpX: https://warpx.readthedocs.io/en/latest/ diff --git a/docs/FAQ.rst b/docs/FAQ.rst index da70c7f5bd..e29896c241 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -221,16 +221,67 @@ This effectively puts libEnsemble in silent mode. 
See the :ref:`Logger Configuration` docs for more information. -macOS-Specific Errors ---------------------- +macOS and Windows Errors +------------------------ + +.. _faqwindows: + +**Can I run libEnsemble on Windows?** + +Although we run many libEnsemble workflows successfully on Windows using both MPI and local comms, we do not +rigorously support Windows, and recommend Unix-like systems as a preference. Windows tends to produce more +platform-specific issues that are difficult to reproduce and troubleshoot. + +Feel free to check our `GitHub Actions`_ page to see what tests we run regularly on Windows. + +.. _`GitHub Actions`: https://github.com/Libensemble/libensemble/actions + +**Windows - How can I run libEnsemble with MPI comms?** + +We have run workflows with MPI comms. However, as most MPI distributions have either dropped Windows support +(MPICH and Open MPI) or are no longer being maintained (``msmpi``), we cannot guarantee success. + +If users wish to try, we recommend experimenting with the many Unix-like emulators, containers, virtual machines, +and other such systems. The `Installing PETSc On Microsoft Windows`_ documentation contains valuable information. + +Otherwise, install ``msmpi`` and ``mpi4py`` from conda and experiment, or use ``local`` comms. + +.. _`Installing PETSc On Microsoft Windows`: https://petsc.org/release/install/windows/#recommended-installation-methods + +**Windows - 'A required privilege is not held by the client'** + +Assuming you were trying to use the ``sim_dir_symlink_files`` or ``gen_dir_symlink_files`` options, this indicates that to +allow libEnsemble to create symlinks, you need to run your current ``cmd`` shell as administrator. + +**"RuntimeError: An attempt has been made to start a new process... this probably means that you are not using fork... +" if __name__ == '__main__': freeze_support() ...** + +You need to place your main calling script code underneath an ``if __name__ == "__main__":`` block.
+ +Explanation: Python chooses one of three methods to start new processes when using multiprocessing +(``--comms local`` with libEnsemble). These are ``'fork'``, ``'spawn'``, and ``'forkserver'``. ``'fork'`` +is the default on Unix, and in our experience is quicker and more reliable, but ``'spawn'`` is the default +on Windows and macOS (see the `Python multiprocessing docs`_). + +Prior to libEnsemble v0.9.2, if libEnsemble detected macOS, it would automatically switch the multiprocessing +method to ``'fork'``. We decided to stop doing this to avoid overriding defaults and compatibility issues with +some libraries. + +If you'd prefer to use ``'fork'`` or not reformat your code, you can set the multiprocessing start method via +the following, placed near the top of your calling script:: + + import multiprocessing + multiprocessing.set_start_method('fork', force=True) + +.. _`Python multiprocessing docs`: https://docs.python.org/3/library/multiprocessing.html -**"Fatal error in MPI_Init_thread: Other MPI error, error stack: ... gethostbyname failed"** +**macOS - "Fatal error in MPI_Init_thread: Other MPI error, error stack: ... gethostbyname failed"** Resolve this by appending ``127.0.0.1 [your hostname]`` to /etc/hosts. Unfortunately, ``127.0.0.1 localhost`` isn't satisfactory for preventing this error. -**How do I stop the Firewall Security popups when running with the Executor?** +**macOS - How do I stop the Firewall Security popups when running with the Executor?** There are several ways to address this nuisance, but all involve trial and error.
An easy (but insecure) solution is temporarily disabling the firewall through diff --git a/docs/conf.py b/docs/conf.py index 76c7028e92..6972705401 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,6 +79,7 @@ def __getattr__(cls, name): sys.path.append(os.path.abspath("../libensemble/utils")) sys.path.append(os.path.abspath("../libensemble/tools")) sys.path.append(os.path.abspath("../libensemble/executors")) +sys.path.append(os.path.abspath("../libensemble/executors/balsam_executors")) sys.path.append(os.path.abspath("../libensemble/resources")) # print(sys.path) diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index f31cabb0a1..85af8594f6 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -33,7 +33,7 @@ libEnsemble is primarily customized by setting options within a ``libE_specs`` d processes are then terminated. multiprocessing default: 1 'kill_canceled_sims' [bool]: Will libE try to kill sims that user functions mark 'cancel_requested' as True. - If False, the manager avoid this moderate overhead. + If False, the manager avoids this moderate overhead. Default: True Directory management options: @@ -147,7 +147,9 @@ libEnsemble is primarily customized by setting options within a ``libE_specs`` d Distributed mode means workers share nodes with applications. Default: False 'zero_resource_workers' [list of ints]: - List of workers that require no resources. + List of workers that require no resources. For when a fixed mapping of workers + to resources is required. Otherwise, use "num_resource_sets". + For use with supported allocation functions. 'resource_info' [dict]: Provide resource information that will override automatically detected resources. 
The allowable fields are given below in 'Overriding Auto-detection' diff --git a/docs/data_structures/work_dict.rst b/docs/data_structures/work_dict.rst index 35af11cdff..64a479c772 100644 --- a/docs/data_structures/work_dict.rst +++ b/docs/data_structures/work_dict.rst @@ -3,27 +3,33 @@ work dictionary =============== -The work dictionary contains integer keys ``i`` and dictionary values to be -given to worker ``i``. ``Work[i]`` has the following form:: +The work dictionary contains metadata that is used by the manager to send a packet +of work to a worker. The dictionary uses integer keys ``i`` and values that determine +the data given to worker ``i``. ``Work[i]`` has the following form:: Work[i]: [dict]: Required keys: - 'persis_info' [dict]: Any persistent info to be sent to worker 'i' 'H_fields' [list]: The field names of the history 'H' to be sent to worker 'i' + 'persis_info' [dict]: Any persistent info to be sent to worker 'i' 'tag' [int]: 'EVAL_SIM_TAG'/'EVAL_GEN_TAG' if worker 'i' is to call sim/gen_func - 'libE_info' [dict]: Info sent to/from worker to help manager update the 'H' + 'libE_info' [dict]: Info sent to/from worker to help manager update the 'H' array - Optional keys are: + libE_info contains the following: 'H_rows' [list of ints]: History rows to send to worker 'i' - 'blocking' [list of ints]: Workers to be blocked by this calculation - 'persistent' [bool]: True if worker 'i' will enter persistent mode + 'rset_team' [list of ints]: The resource sets to be assigned (if dynamic scheduling is used) + 'persistent' [bool]: True if worker 'i' will enter persistent mode (Default: False) + +The work dictionary is typically set using the ``gen_work`` or ``sim_work`` +:doc:`helper functions<../function_guides/allocator>` in the allocation function. +``H_fields``, for example, is usually packed from either ``sim_specs["in"]``, ``gen_specs["in"]`` +or the equivalent "persis_in" variants. .. 
seealso:: For allocation functions giving work dictionaries using persistent workers, see `start_only_persistent.py`_ or `start_persistent_local_opt_gens.py`_. For a use case where the allocation and generator functions combine to do - simulation evaluations with different resources (blocking some workers), see + simulation evaluations with different resources, see `test_uniform_sampling_with_variable_resources.py`_. .. _start_only_persistent.py: https://github.com/Libensemble/libensemble/blob/develop/libensemble/alloc_funcs/start_only_persistent.py diff --git a/docs/executor/executor.rst b/docs/executor/executor.rst index 6d12a387a0..9d9dd6520d 100644 --- a/docs/executor/executor.rst +++ b/docs/executor/executor.rst @@ -24,7 +24,10 @@ To run MPI applications and use detected resources, use an alternative Executor class, as shown above. .. autoclass:: Executor - :members: __init__, register_app, submit + :members: + :exclude-members: serial_setup, sim_default_app, gen_default_app, get_app, default_app, set_resources, get_task, set_workerID, set_worker_info, new_tasks_timing + + .. automethod:: __init__ .. _task_tag: diff --git a/docs/executor/mpi_executor.rst b/docs/executor/mpi_executor.rst index 60fc1cc782..98b4a30eff 100644 --- a/docs/executor/mpi_executor.rst +++ b/docs/executor/mpi_executor.rst @@ -7,6 +7,7 @@ MPI Executor - MPI apps .. autoclass:: MPIExecutor :show-inheritance: :inherited-members: + :exclude-members: serial_setup, sim_default_app, gen_default_app, get_app, default_app, set_resources, get_task, set_workerID, set_worker_info, new_tasks_timing .. 
automethod:: __init__ diff --git a/docs/executor/overview.rst b/docs/executor/overview.rst index 069b1de5f3..ba0c15926e 100644 --- a/docs/executor/overview.rst +++ b/docs/executor/overview.rst @@ -76,7 +76,7 @@ In user simulation function:: # Has manager sent a finish signal exctr.manager_poll() - if exctr.manager_signal == 'finish': + if exctr.manager_signal in [MAN_SIGNAL_KILL, MAN_SIGNAL_FINISH]: task.kill() my_cleanup() diff --git a/docs/function_guides/generator.rst b/docs/function_guides/generator.rst index 57d301b82a..4f775e7c6f 100644 --- a/docs/function_guides/generator.rst +++ b/docs/function_guides/generator.rst @@ -52,13 +52,17 @@ any other libraries to serve their needs. Persistent Generators --------------------- -While normal generators return after completing their calculation, persistent -generators receive Work units, perform computations, and communicate results -directly to the manager in a loop, not returning until explicitly instructed by -the manager. The calling worker becomes a dedicated :ref:`persistent worker`. +While non-persistent generators return after completing their calculation, persistent +generators receive work units, perform computations, and communicate results +directly to the manager in a loop. A persistent generator returns either when +explicitly instructed by the manager, or by exiting its main loop based on some +condition. The allocation function can determine what to do once a persistent +generator finishes, such as ending the ensemble. + +The calling worker becomes a dedicated :ref:`persistent worker`. A ``gen_f`` is initiated as persistent by the ``alloc_f``. -Most users prefer persistent generators since they do not need to be +Many users prefer persistent generators since they do not need to be re-initialized every time their past work is completed and evaluated by a simulation, and can evaluate returned simulation results over the course of an entire libEnsemble routine as a single function instance. 
The :doc:`APOSMM<../examples/aposmm>` @@ -129,31 +133,23 @@ By default, a persistent worker (generator in this case) models the manager/work communications of a regular worker (i.e., the generator is expected to alternately receive and send data in a *ping pong* fashion). To have an irregular communication pattern, a worker can be initiated in *active receive* mode by the allocation -function (see :ref:`start_only_persistent`). +function (see :ref:`start_only_persistent`). In this mode, +the persistent worker will always be considered ready to receive more data +(e.g., evaluation results). It can also send to the manager at any time. The user is responsible for ensuring there are no communication deadlocks in this mode. Note that in manager/worker message exchanges, only the worker-side -receive is blocking. +receive is blocking by default (a non-blocking option is available). Cancelling Simulations ---------------------- Previously submitted simulations can be cancelled by sending a message to the manager. -To do this as a separate communication, a persistent generator should be -in *active receive* mode to prevent a deadlock. -To send out cancellations of previously submitted simulations, the generator -can initiate a history array with just the ``sim_id`` and ``cancel_requested`` fields. -Then fill in the ``sim_id``'s to cancel and set the ``cancel_requested`` field to ``True``. -In the following example, ``sim_ids_to_cancel`` is a list of integers. +To do this, a ``PersistentSupport`` helper function is provided. -.. code-block:: python - - # Send only these fields to existing H rows and libEnsemble will slot in the change. - H_o = np.zeros(len(sim_ids_to_cancel), dtype=[('sim_id', int), ('cancel_requested', bool)]) - H_o['sim_id'] = sim_ids_to_cancel - H_o['cancel_requested'] = True - my_support.send(H_o) +.. currentmodule:: libensemble.tools.persistent_support.PersistentSupport +..
autofunction:: request_cancel_sim_ids If a generated point is cancelled by the generator before it has been given to a worker for evaluation, then it will never be given. If it has already returned from the @@ -165,6 +161,29 @@ by a user function, otherwise it will be ignored. The :doc:`Borehole Calibration tutorial<../tutorials/calib_cancel_tutorial>` gives an example of the capability to cancel pending simulations. +Modification of existing points +------------------------------- + +To change existing fields of the history array, the generator can initialize an output +array where the *dtype* contains the ``sim_id`` and the fields to be modified (in +place of ``gen_specs["out"]``), and then send to the manager as with regular +communications. Any such message received by the manager will modify the given fields +for the given *sim_ids*. If the changes do not correspond with newly generated points, +then the generator needs to communicate to the manager that it is not ready +to receive completed evaluations. Send to the manager with the ``keep_state`` argument +set to *True*. + +For example, the cancellation function ``request_cancel_sim_ids`` could be replicated by +the following (where ``sim_ids_to_cancel`` is a list of integers): + +.. code-block:: python + + # Send only these fields to existing H rows and libEnsemble will slot in the change. + H_o = np.zeros(len(sim_ids_to_cancel), dtype=[('sim_id', int), ('cancel_requested', bool)]) + H_o['sim_id'] = sim_ids_to_cancel + H_o['cancel_requested'] = True + ps.send(H_o, keep_state=True) + Generator initiated shutdown ---------------------------- @@ -176,5 +195,5 @@ the ensemble as soon a persistent generator returns. The usual return values sho Examples -------- -Examples of normal and persistent generator functions +Examples of non-persistent and persistent generator functions can be found :doc:`here<../examples/gen_funcs>`. 
diff --git a/docs/images/central_balsam.png b/docs/images/central_balsam.png deleted file mode 100644 index d185082970..0000000000 Binary files a/docs/images/central_balsam.png and /dev/null differ diff --git a/docs/images/centralized_new.png b/docs/images/centralized_new.png deleted file mode 100644 index 66dce35b24..0000000000 Binary files a/docs/images/centralized_new.png and /dev/null differ diff --git a/docs/images/diagram_xml/centralized_new.xml b/docs/images/diagram_xml/centralized_new.xml deleted file mode 100644 index e117931c0f..0000000000 --- a/docs/images/diagram_xml/centralized_new.xml +++ /dev/null @@ -1,2 +0,0 @@ -  \ No newline at end of file diff --git a/docs/images/diagram_xml/distributed_new.xml b/docs/images/diagram_xml/distributed_new.xml deleted file mode 100644 index 39509b0ebe..0000000000 --- a/docs/images/diagram_xml/distributed_new.xml +++ /dev/null @@ -1,2 +0,0 @@ -  \ No newline at end of file diff --git a/docs/images/diagram_xml/funcx.xml b/docs/images/diagram_xml/funcx.xml deleted file mode 100644 index 5017489f65..0000000000 --- a/docs/images/diagram_xml/funcx.xml +++ /dev/null @@ -1 +0,0 @@  \ No newline at end of file diff --git a/docs/images/distributed_new.png b/docs/images/distributed_new.png deleted file mode 100644 index 367bf1539b..0000000000 Binary files a/docs/images/distributed_new.png and /dev/null differ diff --git a/docs/images/funcx.png b/docs/images/funcx.png deleted file mode 100644 index fc3da631cb..0000000000 Binary files a/docs/images/funcx.png and /dev/null differ diff --git a/docs/images/libE_logo_smaller.png b/docs/images/libE_logo_smaller.png deleted file mode 100644 index 7a5df04606..0000000000 Binary files a/docs/images/libE_logo_smaller.png and /dev/null differ diff --git a/docs/images/logo_manager_worker.png b/docs/images/logo_manager_worker.png deleted file mode 100644 index 823ea93538..0000000000 Binary files a/docs/images/logo_manager_worker.png and /dev/null differ diff --git a/docs/images/using_new.png 
b/docs/images/using_new.png deleted file mode 100644 index 05807aa0da..0000000000 Binary files a/docs/images/using_new.png and /dev/null differ diff --git a/docs/images/white.png b/docs/images/white.png deleted file mode 100644 index d04873f1ad..0000000000 Binary files a/docs/images/white.png and /dev/null differ diff --git a/docs/introduction_latex.rst b/docs/introduction_latex.rst index c552e18146..95d081f28c 100644 --- a/docs/introduction_latex.rst +++ b/docs/introduction_latex.rst @@ -30,6 +30,7 @@ We now present further information on running and testing libEnsemble. .. _Community Examples repository: https://github.com/Libensemble/libe-community-examples .. _Conda: https://docs.conda.io/en/latest/ .. _conda-forge: https://conda-forge.org/ +.. _Contributions: https://github.com/Libensemble/libensemble/blob/main/CONTRIBUTING.rst .. _Coveralls: https://coveralls.io/github/Libensemble/libensemble?branch=main .. _DEAP: https://deap.readthedocs.io/en/master/overview.html .. _DFO-LS: https://github.com/numericalalgorithmsgroup/dfols @@ -54,6 +55,8 @@ We now present further information on running and testing libEnsemble. .. _petsc4py: https://bitbucket.org/petsc/petsc4py .. _PETSc/TAO: http://www.mcs.anl.gov/petsc .. _poster: https://figshare.com/articles/libEnsemble_A_Python_Library_for_Dynamic_Ensemble-Based_Computations/12559520 +.. _PSI/J: https://exaworks.org/psij +.. _psi-j-python: https://github.com/ExaWorks/psi-j-python .. _psutil: https://pypi.org/project/psutil/ .. _PyPI: https://pypi.org .. _pytest-cov: https://pypi.org/project/pytest-cov/ @@ -72,6 +75,7 @@ We now present further information on running and testing libEnsemble. .. _tarball: https://github.com/Libensemble/libensemble/releases/latest .. _Tasmanian: https://tasmanian.ornl.gov/ .. _Theta: https://www.alcf.anl.gov/alcf-resources/theta +.. _tqdm: https://tqdm.github.io/ .. _user guide: https://libensemble.readthedocs.io/en/latest/programming_libE.html .. 
_VTMOP: https://informs-sim.org/wsc20papers/311.pdf .. _WarpX: https://warpx.readthedocs.io/en/latest/ diff --git a/docs/platforms/example_scripts.rst b/docs/platforms/example_scripts.rst index 3163aa52f0..f6e9989685 100644 --- a/docs/platforms/example_scripts.rst +++ b/docs/platforms/example_scripts.rst @@ -5,6 +5,10 @@ Below are example submission scripts used to configure and launch libEnsemble on a variety of high-powered systems. See :doc:`here` for more information about the respective systems and configuration. +Alternatively to interacting with the scheduler or configuring submission scripts, +libEnsemble now features a portable set of :ref:`command-line utilities` +for submitting workflows to almost any system or scheduler. + Slurm - Basic ------------- diff --git a/docs/platforms/perlmutter.rst b/docs/platforms/perlmutter.rst index c2f97bdbd3..96998949f7 100644 --- a/docs/platforms/perlmutter.rst +++ b/docs/platforms/perlmutter.rst @@ -114,7 +114,7 @@ four resource sets (the example generator does not need dedicated resources): .. code-block:: python - libE_specs['zero_resource_workers'] = [1] + libE_specs['num_resource_sets'] = 4 The MPIExecutor is also initiated in the calling script, ensuring that ``srun`` is picked up:: diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index ecd8d436dc..3b719c26b0 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -139,7 +139,7 @@ Zero-resource workers Users with persistent ``gen_f`` functions may notice that the persistent workers are still automatically assigned system resources. This can be resolved by -using :ref:`zero resource workers`. +:ref:`fixing the number of resource sets`. Overriding Auto-detection ------------------------- @@ -153,6 +153,8 @@ libE_specs option. When using the MPI Executor, it is possible to override the detected information using the `custom_info` argument. 
See the :doc:`MPI Executor<../executor/mpi_executor>` for more. + .. _funcx_ref: + funcX - Remote User functions ----------------------------- @@ -161,9 +163,9 @@ internet access (laptops, login nodes, other servers, etc.), workers can be inst launch generator or simulator user function instances to separate resources from themselves via funcX_, a distributed, high-performance function-as-a-service platform: - .. image:: ../images/funcx.png + .. image:: ../images/funcxmodel.png :alt: running_with_funcx - :scale: 40 + :scale: 50 :align: center This is useful for running ensembles across machines and heterogeneous resources, but diff --git a/docs/resource_manager/overview.rst b/docs/resource_manager/overview.rst index 7a2dff656e..93f8b02009 100644 --- a/docs/resource_manager/overview.rst +++ b/docs/resource_manager/overview.rst @@ -8,32 +8,28 @@ libEnsemble comes with built-in resource management. This entails the :ref:`detection of available resources` (e.g., nodelists and core counts), and the allocation of resources to workers. -By default, the provisioned resources are divided by the number of workers (excluding -any workers given in the ``zero_resource_workers`` libE_specs option). libEnsemble's -:doc:`MPI Executor<../executor/mpi_executor>` is aware of these supplied resources, -and if not given any of ``num_nodes``, ``num_procs``, or ``procs_per_node`` in the submit -function, it will try to use all nodes and CPU cores available to the worker. +By default, the provisioned resources are divided by the number of workers. +libEnsemble's :doc:`MPI Executor<../executor/mpi_executor>` is aware of +these supplied resources, and if not given any of ``num_nodes``, ``num_procs``, +or ``procs_per_node`` in the submit function, it will try to use all nodes and +CPU cores available to the worker. Detected resources can be overridden using the libE_specs option :ref:`resource_info`. 
-Resource management can be disabled by setting -``libE_specs['disable_resource_manager'] = True``. This will prevent libEnsemble -from doing any resource detection or management. - Variable resource assignment ---------------------------- In slightly more detail, the resource manager divides resources into **resource sets**. One resource set is the smallest unit of resources that can be assigned (and dynamically reassigned) to workers. By default, the provisioned resources are -divided by the number of workers (excluding any workers given in the ``zero_resource_workers`` -``libE_specs`` option). However, it can also be set directly by the ``num_resource_sets`` -``libE_specs`` option. If the latter is set, the dynamic resource assignment algorithm -will always be used. +divided by the number of workers (excluding any workers given in the +``zero_resource_workers`` ``libE_specs`` option). However, it can also be set +directly by the ``num_resource_sets`` ``libE_specs`` option. If the latter is set, +the dynamic resource assignment algorithm will always be used. If there are more resource sets than nodes, then the resource sets on each node -will be given a slot number, enumerated from zero. For example, if there are three slots -on a node, they will have slot numbers 0, 1, and 2. +will be given a slot number, enumerated from zero. For example, if there are three +slots on a node, they will have slot numbers 0, 1, and 2. The resource manager will not split a resource set over nodes, rather the resource sets on each node will be the integer division of resource sets over nodes, with @@ -41,7 +37,10 @@ the remainder dealt out from the first node. Even breakdowns are generally preferable, however. For example, say a given system has four GPUs per node, and the user has run -libEnsemble on two nodes, with eight workers. The default division of resources would be: +libEnsemble on two nodes, with eight workers. The default division of resources +would be: + +.. 
_rsets-diagram: .. image:: ../images/variable_resources1.png @@ -66,7 +65,7 @@ In the calling script, use a ``gen_specs['out']`` field called ``resource_sets`` ('x', float, n)] } -For an example calling script, see The libEnsemble regression test +For an example calling script, see the regression test `test_persistent_sampling_CUDA_variable_resources.py`_ In the generator, the ``resource_sets`` field must be set to a value for each point @@ -122,10 +121,9 @@ For example, in *six_hump_camel_CUDA_variable_resources*, the environment variab :emphasize-lines: 3 resources = Resources.resources.worker_resources - if resources.even_slots: # Need same slots on each node - resources.set_env_to_slots("CUDA_VISIBLE_DEVICES") # Use convenience function. - num_nodes = resources.local_node_count - cores_per_node = resources.slot_count # One CPU per GPU + resources.set_env_to_slots("CUDA_VISIBLE_DEVICES") # Use convenience function. + num_nodes = resources.local_node_count + cores_per_node = resources.slot_count # One CPU per GPU In the figure above, this would result in worker one setting:: @@ -196,17 +194,19 @@ Persistent generator You have *one* persistent generator and want *eight* workers for running concurrent simulations. In this case you can run with *nine* workers. -Either use one zero resource worker, if the generator should always be the same worker: +Either explicitly set eight resource sets (recommended): .. code-block:: python - libE_specs['zero_resource_workers'] = [1] + libE_specs['num_resource_sets'] = 8 -Or explicitly set eight resource sets: +Or if the generator should always be the same worker, use one zero resource worker: .. code-block:: python - libE_specs['num_resource_sets'] = 8 + libE_specs['zero_resource_workers'] = [1] + +For the second option, an allocation function supporting zero resource workers must be used. 
Using the two-node example above, the initial worker mapping in this example will be: diff --git a/docs/resource_manager/resources_index.rst b/docs/resource_manager/resources_index.rst index a1dae387d1..c1eabef0da 100644 --- a/docs/resource_manager/resources_index.rst +++ b/docs/resource_manager/resources_index.rst @@ -5,13 +5,17 @@ Resource Manager libEnsemble comes with built-in resource management. This entails the detection of available resources (e.g. nodelists and core counts), and the allocation of resources to workers. +Resource management can be disabled by setting +``libE_specs['disable_resource_manager'] = True``. This will prevent libEnsemble +from doing any resource detection or management. + .. toctree:: :maxdepth: 2 :titlesonly: :caption: Resource Manager: + Zero-resource workers (e.g.,~ Persistent gen does not need resources) overview resource_detection - zero_resource_workers scheduler_module - worker_resources + Worker Resources Module (query resources for current worker) diff --git a/docs/resource_manager/zero_resource_workers.rst b/docs/resource_manager/zero_resource_workers.rst index 546e4ea83b..c676fa9a88 100644 --- a/docs/resource_manager/zero_resource_workers.rst +++ b/docs/resource_manager/zero_resource_workers.rst @@ -4,34 +4,81 @@ Zero-resource workers ~~~~~~~~~~~~~~~~~~~~~ Users with persistent ``gen_f`` functions may notice that the persistent workers -are still automatically assigned system resources. This can be wasteful if those -workers only run ``gen_f`` routines in-place and don't use the Executor to submit -applications to allocated nodes: +are still automatically assigned resources. This can be wasteful if those workers +only run ``gen_f`` functions in-place (i.e.,~ they do not use the Executor +to submit applications to allocated nodes). 
Suppose the user is using the +:meth:`parse_args()` function and runs:: + + python run_ensemble_persistent_gen.py --comms local --nworkers 3 + +If three nodes are available in the node allocation, the result may look like the +following. .. image:: ../images/persis_wasted_node.png :alt: persis_wasted_node :scale: 40 :align: center -This can be resolved by using the libE_specs option ``zero_resource_workers``: -.. code-block:: python +To avoid the wasted node above, add an extra worker:: - libE_specs['zero_resource_workers'] = [1] + python run_ensemble_persistent_gen.py --comms local --nworkers 4 + +and in the calling script (*run_ensemble_persistent_gen.py*), explicitly set the +number of resource sets to the number of workers that will be running simulations. -in the calling script. Set the parameter ``zero_resource_workers`` to a list of -worker IDs that should not have system resources assigned. +.. code-block:: python -Worker 1 will not be allocated resources. Note that additional worker -processes can be added to take advantage of the free resources (if using the -same resource set) for simulation instances: + nworkers, is_manager, libE_specs, _ = parse_args() + libE_specs['num_resource_sets'] = nworkers - 1 + +When the ``num_resource_sets`` option is used, libEnsemble will use the dynamic +resource scheduler, and any worker may assign work to any node. This works well +for most users. .. image:: ../images/persis_add_worker.png :alt: persis_add_worker :scale: 40 :align: center -An alternative, when resource sets are being used, it to set the ``num_resource_sets`` -libE_specs option explicitly to the required value.
The difference with declaring -``zero_resource_workers`` is that a fixed worker will have zero resources (this must -be supported by the allocation function, see :ref:`start_only_persistent`) +**Optional**: An alternative way to express the above would be to use the command +line:: + + python run_ensemble_persistent_gen.py --comms local --nsim_workers 3 + +This would automatically set the ``num_resource_sets`` option and add a single +worker for the persistent generator - a common use-case. + +In general, the number of resource sets should be set to enable the maximum +concurrency desired by the ensemble, taking into account generators and simulators. +The users can set generator resources by setting ``persis_info['gen_resources']`` +to an integer value, representing the number of resource sets to give to the +generator. The default is zero. + +The available nodes are always divided by the number of resource sets, and there +may be multiple nodes or a partition of a node in each resource set. If the split +is uneven, resource sets are not split between nodes. E.g., if there are two nodes +and five resource sets, one node will have three resource sets, and the other will +have two. + +Placing zero-resource functions on a fixed worker +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the generator must always be on worker one, then instead of using +``num_resource_sets``, use the ``zero_resource_workers`` *libE_specs* option: + + +.. code-block:: python + + libE_specs['zero_resource_workers'] = [1] + + +in the calling script and worker one will not be allocated resources. In general, +set the parameter ``zero_resource_workers`` to a list of worker IDs that should not +have resources assigned. + +This approach can be useful if running in +:doc:`distributed mode<../platforms/platforms_index>`.
+ +The use of the ``zero_resource_workers`` *libE_specs* option must be supported by +the allocation function (see :ref:`start_only_persistent`). diff --git a/docs/running_libE.rst b/docs/running_libE.rst index 5a04b2a08e..ab3213a755 100644 --- a/docs/running_libE.rst +++ b/docs/running_libE.rst @@ -106,6 +106,24 @@ The ``libE_specs`` options for TCP are:: 'authkey' [String]: Authkey. +Reverse-ssh interface +^^^^^^^^^^^^^^^^^^^^^ + +By specifying ``--comms ssh`` on the command line, libEnsemble workers can +be launched to remote ssh-accessible systems without needing to specify ``'port'`` or ``'authkey'``. This allows users +to colocate workers, simulation or generator functions, and any applications they submit on the same machine. Such user +functions can also be persistent, unlike when launching remote functions via :ref:`funcX`. + +The working directory and Python to run on the remote system need to be specified. Running a calling script may resemble:: + + python myscript.py --comms ssh --workers machine1 machine2 --worker_pwd /home/workers --worker_python /home/.conda/.../python + +.. note:: + Setting up public-key authentication on the worker host systems is recommended to avoid entering passwords. + +.. note:: + This interface assumes that all remote machines share a filesystem. We'll be adjusting this in the future. + Limitations of TCP mode ^^^^^^^^^^^^^^^^^^^^^^^ @@ -116,6 +134,147 @@ Further command line options See the **parse_args()** function in :doc:`Convenience Tools` for further command line options. +.. _liberegister: + +liberegister / libesubmit +------------------------- + +libEnsemble now features a pair of command-line utilities for preparing and launching libEnsemble workflows onto almost +any machine and any scheduler, using a `PSI/J`_ Python implementation. This is an alternative approach +to maintaining system or scheduler-specific batch submission scripts.
+ +- `liberegister` + +Creates an initial, platform-independent PSI/J serialization of a libEnsemble submission. Run this utility on +a calling script in a familiar manner:: + + liberegister my_calling_script.py --comms local --nworkers 4 + +This produces an initial `my_calling_script.json` serialization conforming to PSI/J's specification: + +.. container:: toggle + + .. container:: header + + `my_calling_script.json` + + .. code-block:: JSON + + { + "version": 0.1, + "type": "JobSpec", + "data": { + "name": "libe-job", + "executable": "python", + "arguments": [ + "my_calling_script.py", + "--comms", + "local", + "--nworkers", + "4" + ], + "directory": null, + "inherit_environment": true, + "environment": { + "PYTHONNOUSERSITE": "1" + }, + "stdin_path": null, + "stdout_path": null, + "stderr_path": null, + "resources": { + "node_count": 1, + "process_count": null, + "process_per_node": null, + "cpu_cores_per_process": null, + "gpu_cores_per_process": null, + "exclusive_node_use": true + }, + "attributes": { + "duration": "30", + "queue_name": null, + "project_name": null, + "reservation_id": null, + "custom_attributes": {} + }, + "launcher": null + } + } + +- `libesubmit` + +Further parameterizes a serialization, and submits a corresponding Job to the specified scheduler. +Running ``qsub``, ``sbatch``, etc. on some batch submission script is not needed. For instance:: + + libesubmit my_calling_script.json -q debug -A project -s slurm --nnodes 8 + +Results in:: + + *** libEnsemble 0.9.2+dev *** + Imported PSI/J serialization: my_calling_script.json. Preparing submission... + Calling script: my_calling_script.py + ...found! Proceeding. 
+ Submitting Job!: Job[id=ce4ead75-a3a4-42a3-94ff-c44b3b2c7e61, native_id=None, executor=None, status=JobStatus[NEW, time=1658167808.5125017]] + + $ squeue --long --users=user + Mon Jul 18 13:10:15 2022 + JOBID PARTITION NAME USER STATE TIME TIME_LIMI NODES NODELIST(REASON) + 2508936 debug ce4ead75 user PENDING 0:00 30:00 8 (Priority) + +This also produces a Job-specific representation, e.g: + +.. container:: toggle + + .. container:: header + + `8ba9de56.my_calling_script.json` + + .. code-block:: JSON + + { + "version": 0.1, + "type": "JobSpec", + "data": { + "name": "libe-job", + "executable": "/Users/jnavarro/miniconda3/envs/libe/bin/python3.8", + "arguments": [ + "my_calling_script.py", + "--comms", + "local", + "--nworkers", + "4" + ], + "directory": "/home/user/libensemble/scratch", + "inherit_environment": true, + "environment": { + "PYTHONNOUSERSITE": "1" + }, + "stdin_path": null, + "stdout_path": "8ba9de56.my_calling_script.out", + "stderr_path": "8ba9de56.my_calling_script.err", + "resources": { + "node_count": 8, + "process_count": null, + "process_per_node": null, + "cpu_cores_per_process": null, + "gpu_cores_per_process": null, + "exclusive_node_use": true + }, + "attributes": { + "duration": "30", + "queue_name": "debug", + "project_name": "project", + "reservation_id": null, + "custom_attributes": {} + }, + "launcher": null + } + } + +If libesubmit is run on a ``.json`` serialization from liberegister and can't find the +specified calling script, it'll help search for matching candidate scripts. + +.. _PSI/J: https://exaworks.org/psij + Persistent Workers ------------------ .. _persis_worker: diff --git a/docs/scipy2020.rst b/docs/scipy2020.rst index 8fbd0c98e7..ef9249ef24 100644 --- a/docs/scipy2020.rst +++ b/docs/scipy2020.rst @@ -8,11 +8,6 @@ :width: 33 % :align: right -.. 
image:: images/white.png - :align: center - :width: 33 % - :height: 1.2 in - ========================================================================= **libEnsemble**: A Python Library for Dynamic Ensemble-Based Computations ========================================================================= @@ -51,11 +46,6 @@ input and a ``sim_f`` function that performs and monitors simulations. The user parameterizes these functions and initiates libEnsemble in a *calling script*. Examples and templates of such scripts and functions are included in the library. -.. image:: images/using_new.png - :alt: Using libEnsemble - :scale: 30 % - :align: center - For example, the ``gen_f`` may contain an optimization routine to generate new simulation parameters on-the-fly based on results from previous ``sim_f`` simulations. @@ -82,11 +72,6 @@ many-node simulations. The *manager* allocates workers to asynchronously execute ``gen_f`` generation functions and ``sim_f`` simulation functions based on produced output, directed by a provided ``alloc_f`` allocation function. -.. image:: images/logo_manager_worker.png - :alt: Managers and Workers - :align: center - :scale: 40 % - Flexible Run Mechanisms ----------------------- @@ -97,14 +82,14 @@ run and launch tasks (user applications) on available nodes. * **Distributed**: Workers are distributed across allocated nodes and launch tasks in-place. Workers share nodes with their applications. -.. image:: images/distributed_new.png +.. image:: images/distributed_new_detailed.png :alt: Distributed :align: center :scale: 30 % * **Centralized**: Workers run on one or more dedicated nodes and launch tasks to the remaining allocated nodes. -.. image:: images/centralized_new.png +.. 
image:: images/centralized_new_detailed.png :alt: Centralized :align: center :scale: 30 % @@ -132,7 +117,7 @@ Executor can interface with the **Balsam** library, which functions as a proxy job launcher that maintains and submits jobs from a database on front end launch nodes. -.. image:: images/central_balsam.png +.. image:: images/centralized_new_detailed_balsam.png :alt: Central Balsam :align: center :scale: 40 % diff --git a/docs/tutorials/calib_cancel_tutorial.rst b/docs/tutorials/calib_cancel_tutorial.rst index 718d6e4398..1ffd31922c 100644 --- a/docs/tutorials/calib_cancel_tutorial.rst +++ b/docs/tutorials/calib_cancel_tutorial.rst @@ -25,12 +25,8 @@ Overview of the Calibration Problem The generator function featured in this tutorial can be found in ``gen_funcs/persistent_surmise_calib.py`` and uses the `surmise`_ library for its -calibration surrogate model interface. - -.. note:: - Note that this repository is a fork of - the main surmise repository, but it retains support for the "PCGPwM" emulation - method used in the generator. surmise is under active development. +calibration surrogate model interface. The surmise library uses the "PCGPwM" +emulation method in this example. Say there is a computer model :math:`f(\theta, x)` to be calibrated. To calibrate is to find some parameter :math:`\theta_0` such that :math:`f(\theta_0, x)` closely @@ -105,27 +101,25 @@ cancelled ("obviated"). If so, the generator then calls ``cancel_columns()``:: ... c_obviate = info['obviatesugg'] # suggested if len(c_obviate) > 0: - cancel_columns(obs_offset, c_obviate, n_x, pending, comm) + cancel_columns(obs_offset, c_obviate, n_x, pending, ps) ``obs_offset`` is an offset that excludes the observations when mapping points in surmise data structures to ``sim_id``'s, ``c_obviate`` is a selection of columns to cancel, ``n_x`` is the number of ``x`` values, and ``pending`` is used -to check that points marked for cancellation have not already returned. 
``comm`` is a -communicator object from :doc:`libE_info<../data_structures/work_dict>` used to send -and receive messages from the Manager. +to check that points marked for cancellation have not already returned. ``ps`` is the +instantiation of the *PersistentSupport* class that is set up for persistent generators, and +provides an interface for communication with the manager. Within ``cancel_columns()``, each column in ``c_obviate`` is iterated over, and if a point is ``pending`` and thus has not yet been evaluated by a simulation, its ``sim_id`` is appended to a list to be sent to the Manager for cancellation. -A new, separate local :doc:`History array<../history_output_logging>` is defined with the -selected ``'sim_id'`` s and the ``'cancel_requested'`` field set to ``True``. This array is -then sent to the Manager using the ``send_mgr_worker_msg`` persistent generator -helper function. Each of these helper functions is described :ref:`here`. -The entire ``cancel_columns()`` routine is listed below: +Cancellation is requested using the helper function ``request_cancel_sim_ids`` provided +by the *PersistentSupport* class. Each of these helper functions is described +:ref:`here`. The entire ``cancel_columns()`` routine is listed below: .. code-block:: python - def cancel_columns(obs_offset, c, n_x, pending, comm): + def cancel_columns(obs_offset, c, n_x, pending, ps): """Cancel columns""" sim_ids_to_cancel = [] columns = np.unique(c) @@ -137,11 +131,7 @@ The entire ``cancel_columns()`` routine is listed below: sim_ids_to_cancel.append(sim_id_cancel) pending[i, c] = 0 - # Send only these fields to existing H rows and libEnsemble will slot in the change. 
- H_o = np.zeros(len(sim_ids_to_cancel), dtype=[('sim_id', int), ('cancel_requested', bool)]) - H_o['sim_id'] = sim_ids_to_cancel - H_o['cancel_requested'] = True - send_mgr_worker_msg(comm, H_o) + ps.request_cancel_sim_ids(sim_ids_to_cancel) In future calls to the allocation function by the manager, points that would have been distributed for simulation work but are now marked with "cancel_requested" will not @@ -174,11 +164,10 @@ This is calculated from other parameters in the calling script. By default, workers (including persistent workers), are only allocated work when they're in an :doc:`idle or non-active state<../data_structures/worker_array>`. -However, since this generator must asynchronously update its model and -cancel pending evaluations, the worker running this generator remains -in an *active receive* state, until it becomes non-persistent. This means -both the manager and persistent worker (generator in this case) must be -prepared for irregular sending /receiving of data. +However, since this generator must asynchronously update its model, the worker +running this generator remains in an *active receive* state, until it becomes +non-persistent. This means both the manager and persistent worker (generator in +this case) must be prepared for irregular sending/receiving of data. .. Manager - Cancellation, History Updates, and Allocation .. ------------------------------------------------------- @@ -240,7 +229,7 @@ prepared for irregular sending /receiving of data. .. # Poll task for finish and poll manager for kill signals .. while(not task.finished): .. exctr.manager_poll() -.. if exctr.manager_signal == 'kill': +.. if exctr.manager_signal == MAN_SIGNAL_KILL: .. task.kill() .. calc_status = MAN_SIGNAL_KILL .. break @@ -256,7 +245,7 @@ prepared for irregular sending /receiving of data. .. While the launched task isn't finished, the simulator function periodically polls .. both the task's statuses and for signals from the manager via .. 
the :ref:`executor.manager_poll()` function. -.. Immediately after ``exctr.manager_signal`` is confirmed as ``'kill'``, the current +.. Immediately after ``exctr.manager_signal`` is confirmed as ``MAN_SIGNAL_KILL``, the current .. task is killed and the function returns with the .. ``MAN_SIGNAL_KILL`` :doc:`calc_status<../data_structures/calc_status>`. .. This status will be logged in ``libE_stats.txt``. diff --git a/docs/tutorials/forces_gpu_tutorial.rst b/docs/tutorials/forces_gpu_tutorial.rst index 0ce1054cda..bae79dcc6d 100644 --- a/docs/tutorials/forces_gpu_tutorial.rst +++ b/docs/tutorials/forces_gpu_tutorial.rst @@ -22,9 +22,12 @@ GPU build lines in build_forces.sh_ or similar for your platform. The libEnsemble scripts in this example are available under forces_gpu_ in the libEnsemble repository. -Note that at time of writing the calling script ``run_libe_forces.py`` is identical -to that in ``forces_simple``. The ``forces_simf`` file has slight modifications to -assign GPUs. +Note that at time of writing the calling script **run_libe_forces.py** is functionally +the same as that in *forces_simple*, but contains some commented out lines that can +be used for a variable resources example. The *forces_simf.py* file has slight modifications +to assign GPUs. + +Videos demonstrate running this example on Perlmutter_ and Spock_. Simulation function ------------------- @@ -106,6 +109,11 @@ and the line:: will set the environment variable ``CUDA_VISIBLE_DEVICES`` to match the assigned slots (partitions on the node). +.. note:: + **slots** refers to the ``resource sets`` enumerated on a node (starting with + zero). If a resource set has more than one node, then each node is considered to + have slot zero. [:ref:`diagram`] + Note that if you are on a system that automatically assigns free GPUs on the node, then setting ``CUDA_VISIBLE_DEVICES`` is not necessary unless you want to ensure workers are strictly bound to GPUs. 
For example, on many **SLURM** systems, you @@ -132,17 +140,14 @@ eight workers. For example:: python run_libe_forces.py --comms local --nworkers 8 -If you are running one persistent generator which does not require -resources, then assign nine workers, and set the following in your -calling script:: - - libE_specs['zero_resource_workers'] = [1] - -Or - if you do not care which worker runs the generator, you could fix the -*resource_sets*:: +Note that if you are running one persistent generator which does not require +resources, then assign nine workers, and fix the number of *resource_sets* in +your calling script:: libE_specs['num_resource_sets'] = 8 +See :ref:`zero resource workers` for more ways to express this. + Changing number of GPUs per worker ---------------------------------- @@ -157,14 +162,20 @@ Varying resources ----------------- The same code can be used when varying worker resources. In this case, you may -choose to set one worker per GPU (as we did originally). Then add ``resource_sets`` -as a ``gen_specs['out']`` in your calling script. Simply assign the -``resource_sets`` field of :doc:`H<../data_structures/history_array>` for each point -generated. +add an integer field called ``resource_sets`` as a ``gen_specs['out']`` in your +calling script. + +In the generator function, assign the ``resource_sets`` field of +:doc:`H<../data_structures/history_array>` for each point generated. For example +if a larger simulation requires two MPI tasks (and two GPUs), set the ``resource_sets`` +field to *2* for that sim_id in the generator function. + +The calling script run_libe_forces.py_ contains alternative commented out lines for +a variable resource example. Search for "Uncomment for var resources". -In this case the above code would still work, assigning one CPU processor and -one GPU to each rank. If you want to have one rank with multiple GPUs, then -change source lines 29/30 accordingly.
+In this case, the simulator function will still work, assigning one CPU processor +and one GPU to each MPI rank. If you want to have one rank with multiple GPUs, +then change source lines 29/30 accordingly. Further guidance on varying resource to workers can be found under the :doc:`resource manager<../resource_manager/resources_index>`. @@ -229,3 +240,6 @@ resource conflicts on each node. .. _forces_gpu: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_gpu .. _forces.c: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_app/forces.c .. _build_forces.sh: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh +.. _Perlmutter: https://www.youtube.com/watch?v=Av8ctYph7-Y +.. _Spock: https://www.youtube.com/watch?v=XHXcslDORjU +.. _run_libe_forces.py: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_gpu/run_libe_forces.py diff --git a/examples/tutorials/aposmm/tutorial_six_hump_camel.py b/examples/tutorials/aposmm/tutorial_six_hump_camel.py index 3d8a2bb348..fb9ea363c2 100644 --- a/examples/tutorials/aposmm/tutorial_six_hump_camel.py +++ b/examples/tutorials/aposmm/tutorial_six_hump_camel.py @@ -3,7 +3,6 @@ def six_hump_camel(H, persis_info, sim_specs, _): """Six-Hump Camel sim_f.""" - batch = len(H["x"]) # Num evaluations each sim_f call. 
H_o = np.zeros(batch, dtype=sim_specs["out"]) # Define output array H diff --git a/install/testing_requirements.txt b/install/testing_requirements.txt index c33b06649e..e3d464f3d5 100644 --- a/install/testing_requirements.txt +++ b/install/testing_requirements.txt @@ -5,3 +5,4 @@ pytest-cov==2.12.1 pytest-timeout==1.4.2 mock==4.0.3 coveralls==3.2.0 +python-dateutil \ No newline at end of file diff --git a/libensemble/comms/logs.py b/libensemble/comms/logs.py index 997f312641..628f5bb59a 100644 --- a/libensemble/comms/logs.py +++ b/libensemble/comms/logs.py @@ -120,7 +120,6 @@ def init_worker_logger(logr, lev): def worker_logging_config(comm, worker_id=None): """Add a comm handler with worker ID filter to the indicated logger.""" - logconfig = LogConfig.config logger = logging.getLogger(logconfig.name) slogger = logging.getLogger(logconfig.stats_name) @@ -142,7 +141,6 @@ def worker_logging_config(comm, worker_id=None): def manager_logging_config(): """Add file-based logging at manager.""" - stat_timer = Timer() stat_timer.start() diff --git a/libensemble/executors/balsam_executors/balsam_executor.py b/libensemble/executors/balsam_executors/balsam_executor.py index 969b10ba35..6650f32829 100644 --- a/libensemble/executors/balsam_executors/balsam_executor.py +++ b/libensemble/executors/balsam_executors/balsam_executor.py @@ -12,7 +12,7 @@ In order to initiate a Balsam executor, the calling script should contain :: - from libensemble.executors import BalsamExecutor + from libensemble.executors.balsam_executors import BalsamExecutor exctr = BalsamExecutor() Key differences to consider between this executor and libEnsemble's others is @@ -21,7 +21,7 @@ This process may resemble:: - from libensemble.executors import BalsamExecutor + from libensemble.executors.balsam_executors import BalsamExecutor from balsam.api import ApplicationDefinition class HelloApp(ApplicationDefinition): @@ -79,7 +79,6 @@ class HelloApp(ApplicationDefinition): ExecutorException, TimeoutExpired, 
jassert, - STATES, ) from libensemble.executors import Executor @@ -120,7 +119,6 @@ def __init__( def _get_time_since_balsam_submit(self): """Return time since balsam task entered ``RUNNING`` state""" - event_query = EventLog.objects.filter(job_id=self.process.id, to_state="RUNNING") if not len(event_query): return 0 @@ -133,7 +131,6 @@ def _get_time_since_balsam_submit(self): def calc_task_timing(self): """Calculate timing information for this task""" - # Get runtime from Balsam self.runtime = self._get_time_since_balsam_submit() @@ -162,13 +159,10 @@ def _set_complete(self, dry_run=False): ]: self.success = True self.state = "FINISHED" - elif balsam_state in STATES: # In my states - self.state = balsam_state else: - logger.warning("Task finished, but in unrecognized " "Balsam state {}".format(balsam_state)) - self.state = "UNKNOWN" + self.state = balsam_state - logger.info("Task {} ended with state {}".format(self.name, self.state)) + logger.info("Task {} ended with state {}".format(self.name, self.state)) def poll(self): """Polls and updates the status attributes of the supplied task. Requests @@ -202,12 +196,7 @@ def poll(self): elif balsam_state in ["RUN_ERROR", "RUN_TIMEOUT", "FAILED"]: self.state = "FAILED" - - else: - raise ExecutorException( - "Task state returned from Balsam is not in known list of " - "Balsam states. Task state is {}".format(balsam_state) - ) + self._set_complete() def wait(self, timeout=None): """Waits on completion of the task or raises ``TimeoutExpired``. @@ -220,9 +209,7 @@ def wait(self, timeout=None): timeout: int or float, optional Time in seconds after which a TimeoutExpired exception is raised. If not set, then simply waits until completion. - Note that the task is not automatically killed if libEnsemble - timeouts from reaching exit_criteria["wallclock_max"]. - + Note that the task is not automatically killed on timeout. 
""" if self.dry_run: @@ -239,6 +226,9 @@ def wait(self, timeout=None): "POSTPROCESSED", "STAGED_OUT", "JOB_FINISHED", + "RUN_ERROR", + "RUN_TIMEOUT", + "FAILED", ]: time.sleep(0.2) self.process.refresh_from_db() @@ -251,7 +241,6 @@ def wait(self, timeout=None): def kill(self): """Cancels the supplied task. Killing is unsupported at this time.""" - self.process.delete() logger.info("Killing task {}".format(self.name)) @@ -270,7 +259,6 @@ class BalsamExecutor(Executor): def __init__(self): """Instantiate a new ``BalsamExecutor`` instance.""" - super().__init__() self.workflow_name = "libe_workflow" @@ -280,11 +268,11 @@ def serial_setup(self): """Balsam serial setup includes emptying database and adding applications""" pass - def add_app(self, name, site, exepath, desc): + def add_app(self, *args): """Sync application with Balsam service""" pass - def register_app(self, BalsamApp, app_name, calc_type=None, desc=None, precedent=None): + def register_app(self, BalsamApp, app_name=None, calc_type=None, desc=None, precedent=None): """Registers a Balsam ``ApplicationDefinition`` to libEnsemble. This class instance *must* have a ``site`` and ``command_template`` specified. See the Balsam docs for information on other optional fields. @@ -398,7 +386,7 @@ def submit_allocation( return allocation - def revoke_allocation(self, allocation): + def revoke_allocation(self, allocation, timeout=60): """ Terminates a Balsam ``BatchJob`` machine allocation remotely. Balsam apps should no longer be submitted to this allocation. Best to run after libEnsemble @@ -409,16 +397,27 @@ def revoke_allocation(self, allocation): allocation: ``BatchJob`` object a ``BatchJob`` with a corresponding machine allocation that should be cancelled. + + timeout: int, optional + Timeout and warn user after this many seconds of attempting to revoke an allocation. 
""" allocation.refresh_from_db() + start = time.time() + while not allocation.scheduler_id: time.sleep(1) allocation.refresh_from_db() + if time.time() - start > timeout: + logger.warning( + "Unable to terminate Balsam BatchJob. You may need to login to the machine and manually remove it." + ) + return False batchjob = BatchJob.objects.get(scheduler_id=allocation.scheduler_id) batchjob.state = "pending_deletion" batchjob.save() + return True def set_resources(self, resources): self.resources = resources @@ -523,10 +522,6 @@ def submit( if machinefile is not None: logger.warning("machinefile arg ignored - not supported in Balsam") - jassert( - num_procs or num_nodes or procs_per_node, - "No procs/nodes provided - aborting", - ) task = BalsamTask(app, app_args, workdir, None, None, self.workerID) diff --git a/libensemble/executors/balsam_executors/legacy_balsam_executor.py b/libensemble/executors/balsam_executors/legacy_balsam_executor.py index cf1a7b3aee..ede946a3de 100644 --- a/libensemble/executors/balsam_executors/legacy_balsam_executor.py +++ b/libensemble/executors/balsam_executors/legacy_balsam_executor.py @@ -65,7 +65,6 @@ def read_stderr(self): def _get_time_since_balsam_submit(self): """Return time since balsam task entered RUNNING state""" - # If wait_on_start then can could calculate runtime same a base executor # but otherwise that will return time from task submission. Get from Balsam. @@ -79,7 +78,6 @@ def _get_time_since_balsam_submit(self): def calc_task_timing(self): """Calculate timing information for this task""" - # Get runtime from Balsam self.runtime = self._get_time_since_balsam_submit() @@ -152,8 +150,7 @@ def wait(self, timeout=None): timeout: int or float, optional Time in seconds after which a TimeoutExpired exception is raised. If not set, then simply waits until completion. - Note that the task is not automatically killed if libEnsemble - timeouts from reaching exit_criteria["wallclock_max"]. 
+ Note that the task is not automatically killed on timeout. """ if self.dry_run: @@ -177,7 +174,6 @@ def wait(self, timeout=None): def kill(self, wait_time=None): """Kills or cancels the supplied task""" - dag.kill(self.process) # Could have Wait here and check with Balsam its killed - diff --git a/libensemble/executors/executor.py b/libensemble/executors/executor.py index 5fd19ae0d9..d23b3ead27 100644 --- a/libensemble/executors/executor.py +++ b/libensemble/executors/executor.py @@ -18,8 +18,7 @@ from libensemble.message_numbers import ( UNSET_TAG, - MAN_SIGNAL_FINISH, - MAN_SIGNAL_KILL, + MAN_KILL_SIGNALS, WORKER_DONE, TASK_FAILED, WORKER_KILL_ON_TIMEOUT, @@ -244,8 +243,7 @@ def wait(self, timeout=None): timeout: int or float, optional Time in seconds after which a TimeoutExpired exception is raised. If not set, then simply waits until completion. - Note that the task is not automatically killed if libEnsemble - timeouts from reaching exit_criteria["wallclock_max"]. + Note that the task is not automatically killed on timeout. """ if self.dry_run: @@ -270,8 +268,7 @@ def result(self, timeout=None): timeout: int or float, optional Time in seconds after which a TimeoutExpired exception is raised. If not set, then simply waits until completion. - Note that the task is not automatically killed if libEnsemble - timeouts from reaching exit_criteria["wallclock_max"]. + Note that the task is not automatically killed on timeout. """ self.wait(timeout=timeout) @@ -286,8 +283,7 @@ def exception(self, timeout=None): timeout: int or float, optional Time in seconds after which a TimeoutExpired exception is raised. If not set, then simply waits until completion. - Note that the task is not automatically killed if libEnsemble - timeouts from reaching exit_criteria["wallclock_max"]. + Note that the task is not automatically killed on timeout. 
""" self.wait(timeout=timeout) @@ -350,6 +346,7 @@ class Executor: **Object Attributes:** :ivar list list_of_tasks: A list of tasks created in this executor + :ivar int manager_signal: The most recent manager signal received since manager_poll() was called. """ executor = None @@ -386,7 +383,7 @@ def __init__(self): This is typically created in the user calling script. """ - self.manager_signal = "none" + self.manager_signal = None self.default_apps = {"sim": None, "gen": None} self.apps = {} @@ -485,7 +482,7 @@ def manager_poll(self): The executor manager_signal attribute will be updated. """ - self.manager_signal = "none" # Reset + self.manager_signal = None # Reset # Check for messages; disregard anything but a stop signal if not self.comm.mail_flag(): @@ -495,16 +492,24 @@ def manager_poll(self): return # Process the signal and push back on comm (for now) - logger.info("Worker received kill signal {} from manager".format(man_signal)) - if man_signal == MAN_SIGNAL_FINISH: - self.manager_signal = "finish" - elif man_signal == MAN_SIGNAL_KILL: - self.manager_signal = "kill" + self.manager_signal = man_signal + + if man_signal in MAN_KILL_SIGNALS: + # Only kill signals exist currently + logger.info("Worker received kill signal {} from manager".format(man_signal)) else: logger.warning("Received unrecognized manager signal {} - ignoring".format(man_signal)) + self.comm.push_to_buffer(mtag, man_signal) return man_signal + def manager_kill_received(self): + """Return True if received kill signal from the manager""" + man_signal = self.manager_poll() + if man_signal in MAN_KILL_SIGNALS: + return True + return False + def polling_loop(self, task, timeout=None, delay=0.1, poll_manager=False): """Optional, blocking, generic task status polling loop. Operates until the task finishes, times out, or is optionally killed via a manager signal. 
On completion, returns a @@ -542,7 +547,7 @@ def polling_loop(self, task, timeout=None, delay=0.1, poll_manager=False): if poll_manager: man_signal = self.manager_poll() - if self.manager_signal != "none": + if self.manager_signal in MAN_KILL_SIGNALS: task.kill() calc_status = man_signal break diff --git a/libensemble/executors/mpi_executor.py b/libensemble/executors/mpi_executor.py index 7b0697b583..49a2fe3e1a 100644 --- a/libensemble/executors/mpi_executor.py +++ b/libensemble/executors/mpi_executor.py @@ -23,7 +23,13 @@ class MPIExecutor(Executor): - """The MPI executor can create, poll and kill runnable MPI tasks""" + """The MPI executor can create, poll and kill runnable MPI tasks + + **Object Attributes:** + + :ivar list list_of_tasks: A list of tasks created in this executor + :ivar int manager_signal: The most recent manager signal received since manager_poll() was called. + """ def __init__(self, custom_info={}): """Instantiate a new MPIExecutor instance. diff --git a/libensemble/executors/mpi_runner.py b/libensemble/executors/mpi_runner.py index aa6f68c663..30b6b2bf28 100644 --- a/libensemble/executors/mpi_runner.py +++ b/libensemble/executors/mpi_runner.py @@ -18,6 +18,7 @@ def get_runner(mpi_runner_type, runner_name=None): "aprun": APRUN_MPIRunner, "srun": SRUN_MPIRunner, "jsrun": JSRUN_MPIRunner, + "msmpi": MSMPI_MPIRunner, "custom": MPIRunner, } mpi_runner = mpi_runners[mpi_runner_type] @@ -199,6 +200,23 @@ def __init__(self, run_command="aprun"): ] +class MSMPI_MPIRunner(MPIRunner): + def __init__(self, run_command="mpiexec"): + self.run_command = run_command + self.subgroup_launch = False + self.mfile_support = False + self.arg_nprocs = ("-n", "-np") + self.arg_nnodes = ("--LIBE_NNODES_ARG_EMPTY",) + self.arg_ppn = ("-cores",) + self.mpi_command = [ + self.run_command, + "-env {env}", + "-n {num_procs}", + "-cores {procs_per_node}", + "{extra_args}", + ] + + class SRUN_MPIRunner(MPIRunner): def __init__(self, run_command="srun"): self.run_command = 
run_command diff --git a/libensemble/gen_funcs/old_aposmm.py b/libensemble/gen_funcs/old_aposmm.py index 6c93af2d7f..6510528e42 100644 --- a/libensemble/gen_funcs/old_aposmm.py +++ b/libensemble/gen_funcs/old_aposmm.py @@ -884,7 +884,6 @@ def look_in_history(x, Run_H, vector_return=False): def calc_rk(n, n_s, rk_const, lhs_divisions=0): """ Calculate the critical distance r_k """ - if lhs_divisions == 0: r_k = rk_const*(log(n_s)/n_s)**(1/n) else: diff --git a/libensemble/gen_funcs/persistent_aposmm.py b/libensemble/gen_funcs/persistent_aposmm.py index b82f61ce10..b0a9ae6421 100644 --- a/libensemble/gen_funcs/persistent_aposmm.py +++ b/libensemble/gen_funcs/persistent_aposmm.py @@ -567,7 +567,6 @@ def decide_where_to_start_localopt(H, n, n_s, rk_const, ld=0, mu=0, nu=0): def calc_rk(n, n_s, rk_const, lhs_divisions=0): """ Calculate the critical distance r_k """ - if lhs_divisions == 0: r_k = rk_const*(log(n_s)/n_s)**(1/n) else: diff --git a/libensemble/gen_funcs/persistent_sampling.py b/libensemble/gen_funcs/persistent_sampling.py index 2bfabc1bcd..0cdacfe0a2 100644 --- a/libensemble/gen_funcs/persistent_sampling.py +++ b/libensemble/gen_funcs/persistent_sampling.py @@ -7,7 +7,9 @@ "persistent_uniform", "uniform_random_sample_with_variable_resources", "persistent_request_shutdown", + "uniform_nonblocking", "batched_history_matching", + "persistent_uniform_with_cancellations", ] @@ -204,3 +206,34 @@ def batched_history_matching(H, persis_info, gen_specs, libE_info): Sigma = np.cov(H_o["x"][best_inds].T) return H_o, persis_info, FINISHED_PERSISTENT_GEN_TAG + + +def persistent_uniform_with_cancellations(H, persis_info, gen_specs, libE_info): + + ub = gen_specs["user"]["ub"] + lb = gen_specs["user"]["lb"] + n = len(lb) + b = gen_specs["user"]["initial_batch_size"] + + # Start cancelling points from half initial batch onward + cancel_from = b // 2 # Should get at least this many points back + + ps = PersistentSupport(libE_info, EVAL_GEN_TAG) + + # Send batches until 
manager sends stop tag + tag = None + while tag not in [STOP_TAG, PERSIS_STOP]: + + H_o = np.zeros(b, dtype=gen_specs["out"]) + H_o["x"] = persis_info["rand_stream"].uniform(lb, ub, (b, n)) + tag, Work, calc_in = ps.send_recv(H_o) + + if hasattr(calc_in, "__len__"): + b = len(calc_in) + + # Cancel as many points as got back + cancel_ids = list(range(cancel_from, cancel_from + b)) + cancel_from += b + ps.request_cancel_sim_ids(cancel_ids) + + return H_o, persis_info, FINISHED_PERSISTENT_GEN_TAG diff --git a/libensemble/gen_funcs/persistent_surmise_calib.py b/libensemble/gen_funcs/persistent_surmise_calib.py index 31842fb3fb..4e8945f7f5 100644 --- a/libensemble/gen_funcs/persistent_surmise_calib.py +++ b/libensemble/gen_funcs/persistent_surmise_calib.py @@ -97,11 +97,7 @@ def cancel_columns(obs_offset, c, n_x, pending, ps): sim_ids_to_cancel.append(sim_id_cancel) pending[i, c] = 0 - # Send only these fields to existing H rows and libEnsemble will slot in the change. - H_o = np.zeros(len(sim_ids_to_cancel), dtype=[("sim_id", int), ("cancel_requested", bool)]) - H_o["sim_id"] = sim_ids_to_cancel - H_o["cancel_requested"] = True - ps.send(H_o) + ps.request_cancel_sim_ids(sim_ids_to_cancel) def assign_priority(n_x, n_thetas): diff --git a/libensemble/gen_funcs/surmise_calib_support.py b/libensemble/gen_funcs/surmise_calib_support.py index b75c8c3c62..c4d475cad3 100644 --- a/libensemble/gen_funcs/surmise_calib_support.py +++ b/libensemble/gen_funcs/surmise_calib_support.py @@ -1,5 +1,4 @@ """Contains supplemental methods for gen function in persistent_surmise_calib.py.""" - import numpy as np import scipy.stats as sps diff --git a/libensemble/history.py b/libensemble/history.py index deabef2982..232a8e3698 100644 --- a/libensemble/history.py +++ b/libensemble/history.py @@ -200,7 +200,7 @@ def update_history_x_in(self, gen_worker, D, safe_mode, gen_started_time): # Ensure there aren't any gaps in the generated sim_id values: assert np.all( np.in1d(np.arange(self.index, 
np.max(D["sim_id"]) + 1), D["sim_id"]) - ), "The generator function has produced sim_id that are not in order." + ), "The generator function has produced sim_ids that are not in order." num_new = len(np.setdiff1d(D["sim_id"], self.H["sim_id"])) diff --git a/libensemble/libE.py b/libensemble/libE.py index 69afedec36..3df4497834 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -102,7 +102,7 @@ On macOS (since Python 3.8) and Windows, the default multiprocessing start method is ``'spawn'`` and you must place most calling script code (or just ``libE()`` / ``Ensemble().run()`` at a minimum) in -an ``if __name__ == "__main__:" block. +an ``if __name__ == "__main__:"`` block. Therefore a calling script that is universal across all platforms and comms-types may resemble: @@ -142,7 +142,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) -Alternatively, you may set the multiprocesing start method to ``'fork'`` via the following: +Alternatively, you may set the multiprocessing start method to ``'fork'`` via the following: from multiprocessing import set_start_method set_start_method("fork") @@ -297,12 +297,12 @@ def manager( on_cleanup=None, ): """Generic manager routine run.""" - logger.info("Logger initializing: [workerID] precedes each line. 
[0] = Manager") logger.info("libE version v{}".format(__version__)) if "out" in gen_specs and ("sim_id", int) in gen_specs["out"]: - logger.manager_warning(_USER_SIM_ID_WARNING) + if "libensemble.gen_funcs" not in gen_specs["gen_f"].__module__: + logger.manager_warning(_USER_SIM_ID_WARNING) save_H = libE_specs.get("save_H_and_persis_on_abort", True) @@ -364,7 +364,6 @@ def comms_abort(mpi_comm): def libE_mpi_defaults(libE_specs): """Fill in default values for MPI-based communicators.""" - from mpi4py import MPI if "mpi_comm" not in libE_specs: @@ -375,7 +374,6 @@ def libE_mpi_defaults(libE_specs): def libE_mpi(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0): """MPI version of the libE main routine""" - libE_specs, mpi_comm_null = libE_mpi_defaults(libE_specs) if libE_specs["mpi_comm"] == mpi_comm_null: @@ -415,7 +413,6 @@ def libE_mpi(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE def libE_mpi_manager(mpi_comm, sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0): """Manager routine runs on rank 0.""" - from libensemble.comms.mpi import MainMPIComm hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0) @@ -455,7 +452,6 @@ def on_abort(): def libE_mpi_worker(libE_comm, sim_specs, gen_specs, libE_specs): """Worker routines run on ranks > 0.""" - from libensemble.comms.mpi import MainMPIComm comm = MainMPIComm(libE_comm) @@ -468,7 +464,6 @@ def libE_mpi_worker(libE_comm, sim_specs, gen_specs, libE_specs): def start_proc_team(nworkers, sim_specs, gen_specs, libE_specs, log_comm=True): """Launch a process worker team.""" - resources = Resources.resources executor = Executor.executor @@ -493,7 +488,6 @@ def kill_proc_team(wcomms, timeout): def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0): """Main routine for thread/process launch of libE.""" - nworkers = libE_specs["nworkers"] check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs, 
exit_criteria, H0) @@ -559,7 +553,6 @@ def libE_tcp_default_ID(): def libE_tcp(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0): """Main routine for TCP multiprocessing launch of libE.""" - check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria, H0) is_worker = True if "workerID" in libE_specs else False @@ -612,7 +605,6 @@ def libE_tcp_start_team(manager, nworkers, workers, ip, port, authkey, launchf): def libE_tcp_mgr(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0): """Main routine for TCP multiprocessing launch of libE at manager.""" - hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0) # Set up a worker launcher @@ -660,7 +652,6 @@ def cleanup(): def libE_tcp_worker(sim_specs, gen_specs, libE_specs): """Main routine for TCP worker launched by libE.""" - ip = libE_specs["ip"] port = libE_specs["port"] authkey = libE_specs["authkey"] diff --git a/libensemble/manager.py b/libensemble/manager.py index 6cd5d1d853..af3755f83d 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -8,10 +8,12 @@ import glob import logging import socket +import platform import traceback import numpy as np from libensemble.utils.timer import Timer +from libensemble.utils.misc import extract_H_ranges from libensemble.message_numbers import ( EVAL_SIM_TAG, @@ -251,6 +253,8 @@ def _save_every_k(self, fname, count, k): """Saves history every kth step""" count = k * (count // k) filename = fname.format(self.date_start, count) + if platform.system() == "Windows": + filename = filename.replace(":", "-") # ":" is invalid in windows filenames if not os.path.isfile(filename) and count > 0: for old_file in glob.glob(fname.format(self.date_start, "*")): os.remove(old_file) @@ -320,13 +324,11 @@ def _set_resources(self, Work, w): def _freeup_resources(self, w): """Free up resources assigned to the worker""" - if self.resources: self.resources.resource_manager.free_rsets(w) def 
_send_work_order(self, Work, w): """Sends an allocation function order to a worker""" - logger.debug("Manager sending work unit to worker {}".format(w)) if self.resources: @@ -340,9 +342,7 @@ def _send_work_order(self, Work, w): work_rows = Work["libE_info"]["H_rows"] work_name = calc_type_strings[Work["tag"]] logger.debug( - "Manager sending {} work to worker {}. Rows {}".format( - work_name, w, EnsembleDirectory.extract_H_ranges(Work) or None - ) + "Manager sending {} work to worker {}. Rows {}".format(work_name, w, extract_H_ranges(Work) or None) ) if len(work_rows): if "repack_fields" in globals(): @@ -357,7 +357,6 @@ def _send_work_order(self, Work, w): def _update_state_on_alloc(self, Work, w): """Updates a workers' active/idle status following an allocation order""" - self.W[w - 1]["active"] = Work["tag"] if "libE_info" in Work: if "persistent" in Work["libE_info"]: @@ -415,7 +414,8 @@ def _update_state_on_worker_msg(self, persis_info, D_recv, w): calc_status = D_recv["calc_status"] Manager._check_received_calc(D_recv) - if w not in self.persis_pending and not self.W[w - 1]["active_recv"]: + keep_state = D_recv["libE_info"].get("keep_state", False) + if w not in self.persis_pending and not self.W[w - 1]["active_recv"] and not keep_state: self.W[w - 1]["active"] = 0 if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]: @@ -547,7 +547,6 @@ def _sim_max_given(self): def _get_alloc_libE_info(self): """Selected statistics useful for alloc_f""" - return { "any_idle_workers": any(self.W["active"] == 0), "exit_criteria": self.exit_criteria, diff --git a/libensemble/message_numbers.py b/libensemble/message_numbers.py index 7392fd2086..adfcbc2448 100644 --- a/libensemble/message_numbers.py +++ b/libensemble/message_numbers.py @@ -41,6 +41,8 @@ # last_calc_status_rst_tag CALC_EXCEPTION = 35 # Reserved: Automatically used if user_f raised an exception +MAN_KILL_SIGNALS = [MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL] + calc_status_strings = { UNSET_TAG: 
"Not set", FINISHED_PERSISTENT_SIM_TAG: "Persis sim finished", diff --git a/libensemble/output_directory.py b/libensemble/output_directory.py index da8a46bbc2..f905e037e0 100644 --- a/libensemble/output_directory.py +++ b/libensemble/output_directory.py @@ -1,10 +1,9 @@ import os import re import shutil -from itertools import groupby -from operator import itemgetter from libensemble.utils.loc_stack import LocationStack +from libensemble.utils.misc import extract_H_ranges from libensemble.tools.fields_keys import libE_spec_sim_dir_keys, libE_spec_gen_dir_keys, libE_spec_calc_dir_misc from libensemble.message_numbers import EVAL_SIM_TAG, calc_type_strings @@ -78,32 +77,13 @@ def make_copyback_check(self): def use_calc_dirs(self, type): """Determines calc_dirs enabling for each calc type""" - if type == EVAL_SIM_TAG: return self.sim_use else: return self.gen_use - @staticmethod - def extract_H_ranges(Work): - """Convert received H_rows into ranges for labeling""" - work_H_rows = Work["libE_info"]["H_rows"] - if len(work_H_rows) == 1: - return str(work_H_rows[0]) - else: - # From https://stackoverflow.com/a/30336492 - ranges = [] - for diff, group in groupby(enumerate(work_H_rows.tolist()), lambda x: x[0] - x[1]): - group = list(map(itemgetter(1), group)) - if len(group) > 1: - ranges.append(str(group[0]) + "-" + str(group[-1])) - else: - ranges.append(str(group[0])) - return "_".join(ranges) - def _make_calc_dir(self, workerID, H_rows, calc_str, locs): """Create calc dirs and intermediate dirs, copy inputs, based on libE_specs""" - if calc_str == "sim": input_dir = self.sim_input_dir do_calc_dirs = self.sim_dirs_make @@ -177,12 +157,11 @@ def _make_calc_dir(self, workerID, H_rows, calc_str, locs): def prep_calc_dir(self, Work, calc_iter, workerID, calc_type): """Determines choice for calc_dir structure, then performs calculation.""" - if not self.loc_stack: self.loc_stack = LocationStack() if calc_type == EVAL_SIM_TAG: - H_rows = self.extract_H_ranges(Work) + H_rows = 
extract_H_ranges(Work) else: H_rows = str(calc_iter[calc_type]) diff --git a/libensemble/resources/mpi_resources.py b/libensemble/resources/mpi_resources.py index 0abd7feb1b..bbe2764d48 100644 --- a/libensemble/resources/mpi_resources.py +++ b/libensemble/resources/mpi_resources.py @@ -36,7 +36,7 @@ def get_MPI_variant(): Returns ------- mpi_variant: string: - MPI variant 'aprun' or 'jsrun' or 'mpich' or 'openmpi' or 'srun' + MPI variant 'aprun' or 'jsrun' or 'msmpi' or 'mpich' or 'openmpi' or 'srun' """ @@ -52,6 +52,14 @@ def get_MPI_variant(): except Exception: pass + try: + try_msmpi = subprocess.Popen(["mpiexec"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout, _ = try_msmpi.communicate() + if "Microsoft" in stdout.decode(): + return "msmpi" + except Exception: + pass + try: # Explore mpi4py.MPI.get_vendor() and mpi4py.MPI.Get_library_version() for mpi4py try_mpich = subprocess.Popen(["mpirun", "-npernode"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) diff --git a/libensemble/resources/node_resources.py b/libensemble/resources/node_resources.py index b76a649b66..cd6771c901 100644 --- a/libensemble/resources/node_resources.py +++ b/libensemble/resources/node_resources.py @@ -48,7 +48,6 @@ def _get_remote_cpu_resources(launcher): def _get_cpu_resources_from_env(env_resources=None): """Returns logical and physical cores per node by querying environment or None""" - if not env_resources: return None diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index e08e735a65..3f35cf509e 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -52,13 +52,11 @@ class Resources: @classmethod def init_resources(cls, libE_specs): """Initiate resource management""" - # If disable_resource_manager is True, then Resources.resources will remain None. 
disable_resource_manager = libE_specs.get("disable_resource_manager", False) if not disable_resource_manager: top_level_dir = os.getcwd() - if Resources.resources is None: - Resources.resources = Resources(libE_specs=libE_specs, top_level_dir=top_level_dir) + Resources.resources = Resources(libE_specs=libE_specs, top_level_dir=top_level_dir) def __init__(self, libE_specs, top_level_dir=None): """Initiate a new resources object""" @@ -164,9 +162,6 @@ def __init__(self, libE_specs, top_level_dir=None): self.num_resource_sets = libE_specs.get("num_resource_sets", None) self.enforce_worker_core_bounds = libE_specs.get("enforce_worker_core_bounds", False) - if self.dedicated_mode: - logger.debug("Running in dedicated mode") - resource_info = libE_specs.get("resource_info", {}) cores_on_node = resource_info.get("cores_on_node", None) node_file = resource_info.get("node_file", None) @@ -228,6 +223,7 @@ def add_comm_info(self, libE_nodes): self.global_nodelist = GlobalResources.remove_nodes(self.global_nodelist, self.libE_nodes) if not self.global_nodelist: logger.warning("Warning. Node-list for tasks is empty. Remove dedicated_mode or add nodes") + pass @staticmethod def is_nodelist_shortnames(nodelist): @@ -259,18 +255,15 @@ def get_global_nodelist(node_file=Resources.DEFAULT_NODEFILE, rundir=None, env_r node_filepath = os.path.join(top_level_dir, node_file) global_nodelist = [] if os.path.isfile(node_filepath): - logger.debug("node_file found - getting nodelist from node_file") with open(node_filepath, "r") as f: for line in f: global_nodelist.append(line.rstrip()) else: - logger.debug("No node_file found - searching for nodelist in environment") if env_resources: global_nodelist = env_resources.get_nodelist() if not global_nodelist: # Assume a standalone machine - logger.info("Can not find nodelist from environment. 
Assuming standalone") # global_nodelist.append(env_resources.shortnames([socket.gethostname()])[0]) global_nodelist.append(socket.gethostname()) diff --git a/libensemble/resources/scheduler.py b/libensemble/resources/scheduler.py index e27aff6159..89d02f6876 100644 --- a/libensemble/resources/scheduler.py +++ b/libensemble/resources/scheduler.py @@ -165,7 +165,6 @@ def assign_resources(self, rsets_req): def find_rsets_any_slots(self, rsets_by_group, max_grpsize, rsets_req, ngroups, rsets_per_group): """Find optimal non-matching slots across groups""" - tmp_rsets_by_group = copy.deepcopy(rsets_by_group) max_upper_bound = max_grpsize + 1 diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index bfc9f96403..8633f8ef6c 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -65,7 +65,6 @@ def __init__(self, num_workers, resources): def assign_rsets(self, rset_team, worker_id): """Mark the resource sets given by rset_team as assigned to worker_id""" - if rset_team: rteam = self.rsets["assigned"][rset_team] for i, wid in enumerate(rteam): @@ -307,7 +306,6 @@ def set_slot_count(self): @staticmethod def get_local_nodelist(workerID, rset_team, split_list, rsets_per_node): """Returns the list of nodes available to the given worker and the slot dictionary""" - if workerID is None: raise WorkerResourcesException("Worker has no workerID - aborting") diff --git a/libensemble/sim_funcs/borehole_kills.py b/libensemble/sim_funcs/borehole_kills.py index 65a4bbdbd3..1a6fc22c83 100644 --- a/libensemble/sim_funcs/borehole_kills.py +++ b/libensemble/sim_funcs/borehole_kills.py @@ -1,7 +1,7 @@ import numpy as np from libensemble.executors.executor import Executor from libensemble.sim_funcs.surmise_test_function import borehole_true -from libensemble.message_numbers import TASK_FAILED, MAN_SIGNAL_KILL, UNSET_TAG +from libensemble.message_numbers import UNSET_TAG, TASK_FAILED, MAN_KILL_SIGNALS 
def subproc_borehole(H, delay): @@ -23,7 +23,7 @@ def subproc_borehole(H, delay): task = exctr.submit(app_name="borehole", app_args=args, stdout="out.txt", stderr="err.txt") calc_status = exctr.polling_loop(task, delay=0.01, poll_manager=True) - if calc_status in [MAN_SIGNAL_KILL, TASK_FAILED]: + if calc_status in MAN_KILL_SIGNALS + [TASK_FAILED]: f = np.inf else: f = float(task.read_stdout()) @@ -46,12 +46,15 @@ def borehole(H, persis_info, sim_specs, libE_info): f, calc_status = subproc_borehole(H, delay) - # Failure model (excluding observations) - if sim_id > sim_specs["user"]["num_obs"]: - if (f / borehole_true(H["x"])) > 1.25: - f = np.inf - calc_status = TASK_FAILED - print("Failure of sim_id {}".format(sim_id), flush=True) + if calc_status in MAN_KILL_SIGNALS and "sim_killed" in H_o.dtype.names: + H_o["sim_killed"] = True # For calling script to print only. + else: + # Failure model (excluding observations) + if sim_id > sim_specs["user"]["num_obs"]: + if (f / borehole_true(H["x"])) > 1.25: + f = np.inf + calc_status = TASK_FAILED + print("Failure of sim_id {}".format(sim_id), flush=True) H_o["f"] = f return H_o, persis_info, calc_status diff --git a/libensemble/sim_funcs/branin/branin_obj.py b/libensemble/sim_funcs/branin/branin_obj.py index 87c4e19811..a6189353f2 100644 --- a/libensemble/sim_funcs/branin/branin_obj.py +++ b/libensemble/sim_funcs/branin/branin_obj.py @@ -9,7 +9,6 @@ def call_branin(H, persis_info, sim_specs, _): """Evaluates the Branin function""" - batch = len(H["x"]) H_o = np.zeros(batch, dtype=sim_specs["out"]) diff --git a/libensemble/sim_funcs/executor_hworld.py b/libensemble/sim_funcs/executor_hworld.py index afdf7703eb..f86e15b7ba 100644 --- a/libensemble/sim_funcs/executor_hworld.py +++ b/libensemble/sim_funcs/executor_hworld.py @@ -24,8 +24,7 @@ def custom_polling_loop(exctr, task, timeout_sec=5.0, delay=0.3): while task.runtime < timeout_sec: time.sleep(delay) - exctr.manager_poll() - if exctr.manager_signal == "finish": + if 
exctr.manager_kill_received(): exctr.kill(task) calc_status = MAN_SIGNAL_FINISH # Worker will pick this up and close down print("Task {} killed by manager on worker {}".format(task.id, exctr.workerID)) @@ -77,101 +76,106 @@ def executor_hworld(H, persis_info, sim_specs, libE_info): wait = False args_for_sim = "sleep 1" + calc_status = UNSET_TAG - if ELAPSED_TIMEOUT: - args_for_sim = "sleep 60" # Manager kill - if signal received else completes - timeout = 65.0 + batch = len(H["x"]) + H_o = np.zeros(batch, dtype=sim_specs["out"]) + + if "six_hump_camel" not in exctr.default_app("sim").full_path: - else: global sim_ended_count sim_ended_count += 1 - timeout = 6.0 - launch_shc = False - print(sim_ended_count) - - if sim_ended_count == 1: - args_for_sim = "sleep 1" # Should finish - elif sim_ended_count == 2: - args_for_sim = "sleep 1 Error" # Worker kill on error - elif sim_ended_count == 3: - wait = True - args_for_sim = "sleep 1" # Should finish - launch_shc = True - elif sim_ended_count == 4: - args_for_sim = "sleep 8" # Worker kill on timeout - timeout = 1.0 - elif sim_ended_count == 5: - args_for_sim = "sleep 2 Fail" # Manager kill - if signal received else completes - - if USE_BALSAM: - task = exctr.submit( - calc_type="sim", - num_procs=cores, - app_args=args_for_sim, - hyperthreads=True, - machinefile="notused", - stdout="notused", - wait_on_start=True, - ) - else: - task = exctr.submit(calc_type="sim", num_procs=cores, app_args=args_for_sim, hyperthreads=True) + print("sim_ended_count", sim_ended_count, flush=True) - if wait: - task.wait() - if not task.finished: - calc_status = UNSET_TAG - if task.state == "FINISHED": - calc_status = WORKER_DONE - elif task.state == "FAILED": - calc_status = TASK_FAILED + if ELAPSED_TIMEOUT: + args_for_sim = "sleep 60" # Manager kill - if signal received else completes + timeout = 65.0 - else: - if not ELAPSED_TIMEOUT: + else: + timeout = 6.0 + launch_shc = False + + if sim_ended_count == 1: + args_for_sim = "sleep 1" # 
Should finish + elif sim_ended_count == 2: + args_for_sim = "sleep 1 Error" # Worker kill on error + elif sim_ended_count == 3: + wait = True + args_for_sim = "sleep 1" # Should finish + launch_shc = True + elif sim_ended_count == 4: + args_for_sim = "sleep 8" # Worker kill on timeout + timeout = 1.0 + elif sim_ended_count == 5: + args_for_sim = "sleep 2 Fail" # Manager kill - if signal received else completes + + if USE_BALSAM: + task = exctr.submit( + calc_type="sim", + num_procs=cores, + app_args=args_for_sim, + hyperthreads=True, + machinefile="notused", + stdout="notused", + wait_on_start=True, + ) + else: + task = exctr.submit(calc_type="sim", num_procs=cores, app_args=args_for_sim, hyperthreads=True) + + if wait: + task.wait() + if not task.finished: + calc_status = UNSET_TAG + if task.state == "FINISHED": + calc_status = WORKER_DONE + elif task.state == "FAILED": + calc_status = TASK_FAILED + + else: if sim_ended_count >= 2 and not USE_BALSAM: calc_status = exctr.polling_loop(task, timeout=timeout, delay=0.3, poll_manager=True) if sim_ended_count == 2 and task.stdout_exists() and "Error" in task.read_stdout(): calc_status = WORKER_KILL_ON_ERR - else: task, calc_status = custom_polling_loop(exctr, task, timeout) - else: - calc_status = exctr.polling_loop(task, timeout=timeout, delay=0.3, poll_manager=True) - - if USE_BALSAM: - task.read_file_in_workdir("ensemble.log") - try: - task.read_stderr() - except ValueError: - pass - - task = exctr.submit( - app_name="sim_hump_camel_dry_run", - num_procs=cores, - app_args=args_for_sim, - hyperthreads=True, - machinefile="notused", - stdout="notused", - wait_on_start=True, - dry_run=True, - stage_inout=os.getcwd(), - ) - - task.poll() - task.wait() - - # This is temp - return something - so doing six_hump_camel_func again... - batch = len(H["x"]) - H_o = np.zeros(batch, dtype=sim_specs["out"]) - for i, x in enumerate(H["x"]): - H_o["f"][i] = six_hump_camel_func(x) - if launch_shc: - # Test launching a named app. 
- app_args = " ".join(str(val) for val in list(x[:])) - task = exctr.submit(app_name="six_hump_camel", num_procs=1, app_args=app_args) + if USE_BALSAM: + task.read_file_in_workdir("ensemble.log") + try: + task.read_stderr() + except ValueError: + pass + + task = exctr.submit( + app_name="sim_hump_camel_dry_run", + num_procs=cores, + app_args=args_for_sim, + hyperthreads=True, + machinefile="notused", + stdout="notused", + wait_on_start=True, + dry_run=True, + stage_inout=os.getcwd(), + ) + + task.poll() task.wait() - output = np.float64(task.read_stdout()) - assert np.isclose(H_o["f"][i], output) + + else: + launch_shc = True + calc_status = UNSET_TAG + + # This is temp - return something - so doing six_hump_camel_func again... + for i, x in enumerate(H["x"]): + H_o["f"][i] = six_hump_camel_func(x) + if launch_shc: + # Test launching a named app. + app_args = " ".join(str(val) for val in list(x[:])) + task = exctr.submit(app_name="six_hump_camel", num_procs=1, app_args=app_args) + task.wait() + output = np.float64(task.read_stdout()) + assert np.isclose(H_o["f"][i], output) + calc_status = WORKER_DONE # This is just for testing at calling script level - status of each task H_o["cstat"] = calc_status diff --git a/libensemble/sim_funcs/noisy_vector_mapping.py b/libensemble/sim_funcs/noisy_vector_mapping.py index f9a88553d7..26938d1e8a 100644 --- a/libensemble/sim_funcs/noisy_vector_mapping.py +++ b/libensemble/sim_funcs/noisy_vector_mapping.py @@ -26,7 +26,6 @@ def func_wrapper(H, persis_info, sim_specs, libE_info): def noisy_function(x): """ """ - x1 = x[0] x2 = x[1] term1 = (4 - 2.1 * x1**2 + (x1**4) / 3) * x1**2 diff --git a/libensemble/tests/.coveragerc b/libensemble/tests/.coveragerc index 6156169def..3e1a312766 100644 --- a/libensemble/tests/.coveragerc +++ b/libensemble/tests/.coveragerc @@ -20,7 +20,6 @@ omit = */regression_tests/* */sim_funcs/helloworld.py */sim_funcs/executor_hworld.py - */balsam_executor.py */legacy_balsam_executor.py */forkable_pdb.py 
*/parse_args.py diff --git a/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py b/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py index 062fbf031d..1ff285e0dd 100755 --- a/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py +++ b/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """ Script to set up apps and jobs for balsam tests """ - # Note: To see use of command line interface see bash_scripts/setup_balsam_tests.sh script. # Currently that script does not create deps between jobs so may run simultaneously # This script tests setup within python (could in theory be integrated with job!) diff --git a/libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_manager_poll.py b/libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_manager_poll.py index ccd50e4fe3..e74cfc8ed3 100644 --- a/libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_manager_poll.py +++ b/libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_manager_poll.py @@ -12,6 +12,7 @@ import os from libensemble.executors.executor import Executor +from libensemble.message_numbers import MAN_SIGNAL_KILL def build_simfunc(): @@ -68,14 +69,14 @@ def polling_loop(exctr, task, timeout_sec=20.0, delay=2.0): exctr.manager_poll(task) - if task.manager_signal == "kill": + if task.manager_signal == MAN_SIGNAL_KILL: print("Manager has sent kill signal - killing task") exctr.kill(task) # In future might support other manager signals eg: - elif task.manager_signal == "pause": - # checkpoint_task() - pass + # elif task.manager_signal == "pause": + # checkpoint_task() + # pass time.sleep(delay) print("Polling at time", time.time() - start) diff --git a/libensemble/tests/regression_tests/check_libE_stats.py b/libensemble/tests/functionality_tests/check_libE_stats.py similarity index 100% rename from 
libensemble/tests/regression_tests/check_libE_stats.py rename to libensemble/tests/functionality_tests/check_libE_stats.py diff --git a/libensemble/tests/functionality_tests/test_1d_uniform_sampling_with_comm_dup.py b/libensemble/tests/functionality_tests/test_1d_uniform_sampling_with_comm_dup.py index 987b9eea99..ab81c9ce89 100644 --- a/libensemble/tests/functionality_tests/test_1d_uniform_sampling_with_comm_dup.py +++ b/libensemble/tests/functionality_tests/test_1d_uniform_sampling_with_comm_dup.py @@ -13,6 +13,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi # TESTSUITE_NPROCS: 2 4 +# TESTSUITE_OS_SKIP: WIN import sys import numpy as np diff --git a/libensemble/tests/functionality_tests/test_executor_hworld_pass_fail.py b/libensemble/tests/functionality_tests/test_executor_hworld_pass_fail.py index ccfa4d1e45..90cb7dfd9b 100644 --- a/libensemble/tests/functionality_tests/test_executor_hworld_pass_fail.py +++ b/libensemble/tests/functionality_tests/test_executor_hworld_pass_fail.py @@ -26,9 +26,10 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local tcp -# TESTSUITE_OS_SKIP: OSX +# TESTSUITE_OS_SKIP: OSX WIN # TESTSUITE_NPROCS: 2 3 4 # TESTSUITE_OMPI_SKIP: true +# TESTSUITE_EXTRA: true # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). 
if __name__ == "__main__": diff --git a/libensemble/tests/functionality_tests/test_executor_hworld_timeout.py b/libensemble/tests/functionality_tests/test_executor_hworld_timeout.py index e953d23739..fa8bb45d08 100644 --- a/libensemble/tests/functionality_tests/test_executor_hworld_timeout.py +++ b/libensemble/tests/functionality_tests/test_executor_hworld_timeout.py @@ -26,7 +26,8 @@ # TESTSUITE_COMMS: mpi local tcp # TESTSUITE_NPROCS: 2 3 4 # TESTSUITE_OMPI_SKIP: true -# TESTSUITE_OS_SKIP: OSX +# TESTSUITE_OS_SKIP: OSX WIN +# TESTSUITE_EXTRA: true # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": @@ -85,23 +86,31 @@ persis_info = add_unique_random_streams({}, nworkers + 1) - exit_criteria = {"wallclock_max": 30} + exit_criteria = {"wallclock_max": 10} - # Perform the run - H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) + # TCP does not support multiple libE calls + if libE_specs["comms"] == "tcp": + iterations = 1 + else: + iterations = 2 - if is_manager: - print("\nChecking expected task status against Workers ...\n") + for i in range(iterations): + + # Perform the run + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) + + if is_manager: + print("\nChecking expected task status against Workers ...\n") - calc_status_list_in = np.asarray([0]) - calc_status_list = np.repeat(calc_status_list_in, nworkers) + calc_status_list_in = np.asarray([0]) + calc_status_list = np.repeat(calc_status_list_in, nworkers) - # For debug - print("Expecting: {}".format(calc_status_list)) - print("Received: {}\n".format(H["cstat"])) + # For debug + print("Expecting: {}".format(calc_status_list)) + print("Received: {}\n".format(H["cstat"])) - assert np.array_equal(H["cstat"], calc_status_list), "Error - unexpected calc status. 
Received: " + str( - H["cstat"] - ) + assert np.array_equal(H["cstat"], calc_status_list), "Error - unexpected calc status. Received: " + str( + H["cstat"] + ) - print("\n\n\nRun completed.") + print("\n\n\nRun completed.") diff --git a/libensemble/tests/functionality_tests/test_executor_simple.py b/libensemble/tests/functionality_tests/test_executor_simple.py new file mode 100644 index 0000000000..c09e5037bc --- /dev/null +++ b/libensemble/tests/functionality_tests/test_executor_simple.py @@ -0,0 +1,82 @@ +""" +Runs libEnsemble testing the executor functionality. + +Execute via one of the following commands (e.g. 3 workers): + mpiexec -np 4 python test_executor_hworld.py + python test_executor_hworld.py --nworkers 3 --comms local + python test_executor_hworld.py --nworkers 3 --comms tcp + +The number of concurrent evaluations of the objective function will be 4-1=3. +""" + +import numpy as np + +# Import libEnsemble items for this test +from libensemble.message_numbers import WORKER_DONE +from libensemble.libE import libE +from libensemble.sim_funcs.executor_hworld import executor_hworld as sim_f +import libensemble.sim_funcs.six_hump_camel as six_hump_camel +from libensemble.gen_funcs.sampling import uniform_random_sample as gen_f +from libensemble.tools import parse_args, add_unique_random_streams +from libensemble.executors.mpi_executor import MPIExecutor + + +# Do not change these lines - they are parsed by run-tests.sh +# TESTSUITE_COMMS: mpi local +# TESTSUITE_NPROCS: 4 +# TESTSUITE_OMPI_SKIP: true + +# Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). 
+if __name__ == "__main__": + + nworkers, is_manager, libE_specs, _ = parse_args() + + cores_per_task = 1 + cores_all_tasks = nworkers * cores_per_task + + sim_app2 = six_hump_camel.__file__ + + exctr = MPIExecutor() + exctr.register_app(full_path=sim_app2, app_name="six_hump_camel", calc_type="sim") # Named app + + sim_specs = { + "sim_f": sim_f, + "in": ["x"], + "out": [("f", float), ("cstat", int)], + "user": {"cores": cores_per_task}, + } + + gen_specs = { + "gen_f": gen_f, + "in": ["sim_id"], + "out": [("x", float, (2,))], + "user": { + "lb": np.array([-3, -2]), + "ub": np.array([3, 2]), + "gen_batch_size": nworkers, + }, + } + + persis_info = add_unique_random_streams({}, nworkers + 1) + + # num sim_ended_count conditions in executor_hworld + exit_criteria = {"sim_max": nworkers * 5} + + # Perform the run + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) + + if is_manager: + print("\nChecking expected task status against Workers ...\n") + + calc_status_list_in = np.asarray([WORKER_DONE] * 5) + calc_status_list = np.repeat(calc_status_list_in, nworkers) + + # For debug + print("Expecting: {}".format(calc_status_list)) + print("Received: {}\n".format(H["cstat"])) + + assert np.array_equal(H["cstat"], calc_status_list), "Error - unexpected calc status. 
Received: " + str( + H["cstat"] + ) + + print("\n\n\nRun completed.") diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py index 933d7c0544..8bbe372884 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py @@ -7,6 +7,13 @@ mpiexec -np 7 python test_mpi_runners_zrw_subnode_uneven.py python test_mpi_runners_zrw_subnode_uneven.py --nworkers 6 --comms local python test_mpi_runners_zrw_subnode_uneven.py --nworkers 6 --comms tcp + +The resource sets are split unevenly between the two nodes (e.g. 3 and 2). + +Two tests are run. In the first, num_resource_sets is used, and thus the dynamic scheduler. +This will fill node two slots first as there are fewer resource sets on node two, and the +scheduler will preference a smaller space for assigning the task. On the second test, +zero_resource_workers are used, and the static scheduler will fill node one first. 
""" import sys @@ -36,7 +43,6 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True @@ -45,8 +51,7 @@ logger.set_filename(log_file) # For varying size test - relate node count to nworkers - in_place = libE_specs["zero_resource_workers"] - n_gens = len(in_place) + n_gens = 1 nsim_workers = nworkers - n_gens if nsim_workers % 2 == 0: @@ -96,7 +101,6 @@ } alloc_specs = {"alloc_f": alloc_f, "out": []} - persis_info = add_unique_random_streams({}, nworkers + 1) exit_criteria = {"sim_max": (nsim_workers) * rounds} test_list_base = [ @@ -141,7 +145,22 @@ "persis_gens": n_gens, } - # Perform the run - H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) + iterations = 2 + for prob_id in range(iterations): - # All asserts are in sim func + if prob_id == 0: + # Uses dynamic scheduler - will find node 2 slots first (as fewer) + libE_specs["num_resource_sets"] = nworkers - 1 # Any worker can be the gen + sim_specs["user"]["offset_for_scheduler"] = True # Changes expected values + persis_info = add_unique_random_streams({}, nworkers + 1) + + else: + # Uses static scheduler - will find node 1 slots first + del libE_specs["num_resource_sets"] + libE_specs["zero_resource_workers"] = [1] # Gen must be worker 1 + sim_specs["user"]["offset_for_scheduler"] = False + persis_info = add_unique_random_streams({}, nworkers + 1) + + # Perform the run + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) + # Run-line asserts are in sim func diff --git a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent.py b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent.py index 38f27fc1f1..9b1c49a170 100644 --- a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent.py +++ 
b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent.py @@ -36,8 +36,9 @@ nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["zero_resource_workers"] = [1] - num_gens = len(libE_specs["zero_resource_workers"]) + num_gens = 1 + libE_specs["num_resource_sets"] = nworkers - num_gens # Any worker can be the gen + total_nodes = (nworkers - num_gens) // 4 # 4 resourced workers per node. if total_nodes == 1: diff --git a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py index eeb507b2be..a8ec5c8596 100644 --- a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py +++ b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py @@ -13,6 +13,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local tcp # TESTSUITE_NPROCS: 2 4 +# TESTSUITE_EXTRA: true import numpy as np import os diff --git a/libensemble/tests/regression_tests/test_stats_output.py b/libensemble/tests/functionality_tests/test_stats_output.py similarity index 100% rename from libensemble/tests/regression_tests/test_stats_output.py rename to libensemble/tests/functionality_tests/test_stats_output.py diff --git a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py b/libensemble/tests/regression_tests/dontrun_test_persistent_gp_multitask_ax.py similarity index 80% rename from libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py rename to libensemble/tests/regression_tests/dontrun_test_persistent_gp_multitask_ax.py index ce8176d258..1d0e2fabd4 100644 --- a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py +++ b/libensemble/tests/regression_tests/dontrun_test_persistent_gp_multitask_ax.py @@ -26,17 +26,40 @@ from libensemble.tools import save_libE_output, add_unique_random_streams from libensemble.tools import parse_args from libensemble.message_numbers import WORKER_DONE -from 
libensemble.gen_funcs.persistent_ax_multitask import persistent_gp_mt_ax_gen_f import warnings +# Ax uses a deprecated warn command. +warnings.filterwarnings("ignore", category=UserWarning) +warnings.filterwarnings("ignore", category=DeprecationWarning) + +from libensemble.gen_funcs.persistent_ax_multitask import persistent_gp_mt_ax_gen_f + + +def run_simulation(H, persis_info, sim_specs, libE_info): + # Extract input parameters + values = list(H["x"][0]) + x0 = values[0] + x1 = values[1] + # Extract fidelity parameter + task = H["task"][0] + if task == "expensive_model": + z = 8 + elif task == "cheap_model": + z = 1 + + libE_output = np.zeros(1, dtype=sim_specs["out"]) + calc_status = WORKER_DONE + + # Function that depends on the resolution parameter + libE_output["f"] = -(x0 + 10 * np.cos(x0 + 0.1 * z)) * (x1 + 5 * np.cos(x1 - 0.2 * z)) + + return libE_output, persis_info, calc_status + + # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": - # Ax uses a deprecated warn command. 
- warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - nworkers, is_manager, libE_specs, _ = parse_args() mt_params = { @@ -48,26 +71,6 @@ "n_opt_lofi": 4, } - def run_simulation(H, persis_info, sim_specs, libE_info): - # Extract input parameters - values = list(H["x"][0]) - x0 = values[0] - x1 = values[1] - # Extract fidelity parameter - task = H["task"][0] - if task == "expensive_model": - z = 8 - elif task == "cheap_model": - z = 1 - - libE_output = np.zeros(1, dtype=sim_specs["out"]) - calc_status = WORKER_DONE - - # Function that depends on the resolution parameter - libE_output["f"] = -(x0 + 10 * np.cos(x0 + 0.1 * z)) * (x1 + 5 * np.cos(x1 - 0.2 * z)) - - return libE_output, persis_info, calc_status - sim_specs = { "sim_f": run_simulation, "in": ["x", "task"], diff --git a/libensemble/tests/regression_tests/dont_run_test_persistent_gp.py b/libensemble/tests/regression_tests/run_test_persistent_gp.py similarity index 100% rename from libensemble/tests/regression_tests/dont_run_test_persistent_gp.py rename to libensemble/tests/regression_tests/run_test_persistent_gp.py diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py b/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py index 7d24043446..96b37b58b5 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py @@ -19,7 +19,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: local mpi tcp # TESTSUITE_NPROCS: 4 -# TESTSUITE_OS_SKIP: OSX +# TESTSUITE_OS_SKIP: OSX WIN # TESTSUITE_EXTRA: true import sys diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py b/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py index 6b65de580b..cb58850a0e 100644 --- 
a/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py @@ -14,10 +14,8 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: local mpi tcp # TESTSUITE_NPROCS: 3 -# TESTSUITE_EXTRA: true import sys -import multiprocessing import numpy as np # Import libEnsemble items for this test @@ -38,9 +36,6 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": - # Temporary solution while we investigate/resolve slowdowns with "spawn" start method. - multiprocessing.set_start_method("fork", force=True) - nworkers, is_manager, libE_specs, _ = parse_args() if is_manager: diff --git a/libensemble/tests/regression_tests/dont_run_test_persistent_aposmm_pounders.py b/libensemble/tests/regression_tests/test_persistent_aposmm_pounders.py similarity index 100% rename from libensemble/tests/regression_tests/dont_run_test_persistent_aposmm_pounders.py rename to libensemble/tests/regression_tests/test_persistent_aposmm_pounders.py diff --git a/libensemble/tests/regression_tests/dont_run_test_persistent_aposmm_tao_blmvm.py b/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py similarity index 100% rename from libensemble/tests/regression_tests/dont_run_test_persistent_aposmm_tao_blmvm.py rename to libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py diff --git a/libensemble/tests/regression_tests/dont_run_test_persistent_aposmm_tao_nm.py b/libensemble/tests/regression_tests/test_persistent_aposmm_tao_nm.py similarity index 100% rename from libensemble/tests/regression_tests/dont_run_test_persistent_aposmm_tao_nm.py rename to libensemble/tests/regression_tests/test_persistent_aposmm_tao_nm.py diff --git a/libensemble/tests/regression_tests/test_persistent_sampling_CUDA_variable_resources.py 
b/libensemble/tests/regression_tests/test_persistent_sampling_CUDA_variable_resources.py index 96ec1279fe..0240460607 100644 --- a/libensemble/tests/regression_tests/test_persistent_sampling_CUDA_variable_resources.py +++ b/libensemble/tests/regression_tests/test_persistent_sampling_CUDA_variable_resources.py @@ -30,7 +30,12 @@ nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["zero_resource_workers"] = [1] + # The persistent gen does not need resources + + libE_specs["num_resource_sets"] = nworkers - 1 # Any worker can be the gen + + # libE_specs["zero_resource_workers"] = [1] # If first worker must be gen, use this instead + libE_specs["sim_dirs_make"] = True libE_specs["ensemble_dir_path"] = "./ensemble_CUDA_variable_w" + str(nworkers) diff --git a/libensemble/tests/regression_tests/test_persistent_sim_uniform_sampling.py b/libensemble/tests/regression_tests/test_persistent_sim_uniform_sampling.py index 8ecaa06247..5cb692a42e 100644 --- a/libensemble/tests/regression_tests/test_persistent_sim_uniform_sampling.py +++ b/libensemble/tests/regression_tests/test_persistent_sim_uniform_sampling.py @@ -15,6 +15,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local tcp # TESTSUITE_NPROCS: 3 4 +# TESTSUITE_OS_SKIP: WIN import sys import numpy as np diff --git a/libensemble/tests/regression_tests/test_persistent_surmise_killsims.py b/libensemble/tests/regression_tests/test_persistent_surmise_killsims.py index 62874e2a7c..510b78ab7a 100644 --- a/libensemble/tests/regression_tests/test_persistent_surmise_killsims.py +++ b/libensemble/tests/regression_tests/test_persistent_surmise_killsims.py @@ -80,7 +80,10 @@ sim_specs = { "sim_f": sim_f, "in": ["x", "thetas"], - "out": [("f", float)], + "out": [ + ("f", float), + ("sim_killed", bool), # "sim_killed" is used only for display at the end of this test + ], "user": { "num_obs": n_x, "init_sample_size": init_sample_size, @@ -130,7 +133,8 @@ if is_manager: print("Cancelled 
sims", H["sim_id"][H["cancel_requested"]]) - print("Killed sims", H["sim_id"][H["kill_sent"]]) + print("Kills sent by manager to running simulations", H["sim_id"][H["kill_sent"]]) + print("Killed sims", H["sim_id"][H["sim_killed"]]) sims_done = np.count_nonzero(H["sim_ended"]) save_libE_output(H, persis_info, __file__, nworkers) assert sims_done == max_evals, "Num of completed simulations should be {}. Is {}".format(max_evals, sims_done) diff --git a/libensemble/tests/regression_tests/test_persistent_uniform_gen_decides_stop.py b/libensemble/tests/regression_tests/test_persistent_uniform_gen_decides_stop.py index 164aaa62e2..96b9af52fa 100644 --- a/libensemble/tests/regression_tests/test_persistent_uniform_gen_decides_stop.py +++ b/libensemble/tests/regression_tests/test_persistent_uniform_gen_decides_stop.py @@ -14,6 +14,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local tcp # TESTSUITE_NPROCS: 5 +# TESTSUITE_OS_SKIP: WIN import sys import numpy as np diff --git a/libensemble/tests/regression_tests/test_persistent_uniform_sampling_cancel.py b/libensemble/tests/regression_tests/test_persistent_uniform_sampling_cancel.py new file mode 100644 index 0000000000..347308cf55 --- /dev/null +++ b/libensemble/tests/regression_tests/test_persistent_uniform_sampling_cancel.py @@ -0,0 +1,74 @@ +""" +Tests libEnsemble with a simple persistent uniform sampling generator +function. + +Execute via one of the following commands (e.g. 3 workers): + mpiexec -np 4 python test_persistent_sampling.py + python test_persistent_uniform_sampling_cancel.py --nworkers 3 --comms local + python test_persistent_uniform_sampling_cancel.py --nworkers 3 --comms tcp + +When running with the above commands, the number of concurrent evaluations of +the objective function will be 2, as one of the three workers will be the +persistent generator. 
+""" + +# Do not change these lines - they are parsed by run-tests.sh +# TESTSUITE_COMMS: mpi local +# TESTSUITE_NPROCS: 3 4 + +import sys +import numpy as np + +# Import libEnsemble items for this test +from libensemble.libE import libE +from libensemble.sim_funcs.rosenbrock import rosenbrock_eval as sim_f +from libensemble.gen_funcs.persistent_sampling import persistent_uniform_with_cancellations as gen_f +from libensemble.alloc_funcs.start_only_persistent import only_persistent_gens as alloc_f +from libensemble.tools import parse_args, save_libE_output, add_unique_random_streams + +# Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). +if __name__ == "__main__": + + nworkers, is_manager, libE_specs, _ = parse_args() + + if nworkers < 2: + sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") + + n = 2 + + sim_specs = { + "sim_f": sim_f, + "in": ["x"], + "out": [("f", float), ("grad", float, n)], + } + + gen_specs = { + "gen_f": gen_f, + "persis_in": ["x", "f", "grad", "sim_id"], + "out": [("x", float, (n,))], + "user": { + "initial_batch_size": 100, + "lb": np.array([-3, -2]), + "ub": np.array([3, 2]), + }, + } + + alloc_specs = { + "alloc_f": alloc_f, + "user": {"async_return": True}, + } + + exit_criteria = {"gen_max": 150, "wallclock_max": 300} + + persis_info = add_unique_random_streams({}, nworkers + 1) + + # Perform the run + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) + + if is_manager: + + # For reproducible test, only tests if cancel requested on points - not whether got evaluated + assert np.all(H["cancel_requested"][:49] == False), "Values cancelled which should not be" # noqa: E712 + assert np.all(H["cancel_requested"][50:100]), "Values not cancelled which should be" + + save_libE_output(H, persis_info, __file__, nworkers) diff --git a/libensemble/tests/regression_tests/test_uniform_sampling_cancel.py 
b/libensemble/tests/regression_tests/test_uniform_sampling_cancel.py index 2dd4f66877..83d25bfb24 100644 --- a/libensemble/tests/regression_tests/test_uniform_sampling_cancel.py +++ b/libensemble/tests/regression_tests/test_uniform_sampling_cancel.py @@ -34,7 +34,6 @@ def create_H0(persis_info, gen_specs, sim_max): """Create an H0 for give_pregenerated_sim_work""" - # Manually creating H0 ub = gen_specs["user"]["ub"] lb = gen_specs["user"]["lb"] diff --git a/libensemble/tests/regression_tests/test_uniform_sampling_with_variable_resources.py b/libensemble/tests/regression_tests/test_uniform_sampling_with_variable_resources.py index edc3ccbcbf..9c9804da04 100644 --- a/libensemble/tests/regression_tests/test_uniform_sampling_with_variable_resources.py +++ b/libensemble/tests/regression_tests/test_uniform_sampling_with_variable_resources.py @@ -14,6 +14,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local # TESTSUITE_NPROCS: 2 4 +# TESTSUITE_EXTRA: true import sys import numpy as np diff --git a/libensemble/tests/run-tests.sh b/libensemble/tests/run-tests.sh index 0b506f4858..12fb2b72e9 100755 --- a/libensemble/tests/run-tests.sh +++ b/libensemble/tests/run-tests.sh @@ -92,24 +92,6 @@ print_summary_line() { done } -#Get current time in seconds -#In: Nothing -#Out: Returns time in seconds (seconds since 1970-01-01 00:00:00 UTC) as a string -# Or if bc not available uses SECONDS (whole seconds that script has been running) -current_time() { - local time - #Is bc present - USE_BC=f - bc --version >> /dev/null && USE_BC=t - if [ $USE_BC = 't' ]; then - #time=$(date +%s.%N) - time=$(python -c 'import time; print(time.time())') - else - time=$SECONDS - fi; - echo "$time" -} - #Return a time difference #In: Start and End times as strings #Out: Time difference as a string @@ -339,6 +321,24 @@ fi; PYTHON_RUN="python$PYTHON_VER $PYTHON_FLAGS" echo -e "Python run: $PYTHON_RUN" +#Get current time in seconds +#In: Nothing +#Out: Returns 
time in seconds (seconds since 1970-01-01 00:00:00 UTC) as a string +# Or if bc not available uses SECONDS (whole seconds that script has been running) +current_time() { + local time + #Is bc present + USE_BC=f + bc --version >> /dev/null && USE_BC=t + if [ $USE_BC = 't' ]; then + #time=$(date +%s.%N) + time=$($PYTHON_RUN -c 'import time; print(time.time())') + else + time=$SECONDS + fi; + echo "$time" +} + textreset=$(tput sgr0) fail_color=$(tput bold; tput setaf 1) #red pass_color=$(tput bold; tput setaf 2) #green @@ -500,11 +500,16 @@ if [ "$root_found" = true ]; then if [ "$RUN_LOCAL" = true ] && [ "$LAUNCHER" = local ]; then RUN_TEST=true; fi if [ "$RUN_TCP" = true ] && [ "$LAUNCHER" = tcp ]; then RUN_TEST=true; fi - if [[ "$OSTYPE" = *"darwin"* ]] && [[ "$OS_SKIP_LIST" = "OSX" ]]; then + if [[ "$OSTYPE" = *"darwin"* ]] && [[ "$OS_SKIP_LIST" = *"OSX"* ]]; then echo "Skipping test number for OSX: " $test_num continue fi + if [[ "$OSTYPE" = *"msys"* ]] && [[ "$OS_SKIP_LIST" = *"WIN"* ]]; then + echo "Skipping test number for Windows: " $test_num + continue + fi + if [[ "$OMPI_SKIP" = "true" ]] && [[ "$MPIEXEC_FLAGS" = "--oversubscribe" ]] && [[ "$RUN_MPI" = true ]]; then echo "Skipping test number for Open MPI: " $test_num continue diff --git a/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh b/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh index 3615a076ec..cd9f5914f2 100755 --- a/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh +++ b/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh @@ -36,7 +36,7 @@ mpicc -O3 -o forces.x forces.c -lm # xlc_r -O3 -qsmp=omp -qoffload -o forces.x forces.c # Nvidia (nvc) compiler with mpicc and on Cray system with target (Perlmutter) -# mpicc -O3 -fopenmp -mp=gpu -o forces_gpu.x forces_gpu.c +# mpicc -O3 -fopenmp -mp=gpu -o forces.x forces.c # cc -O3 -fopenmp -mp=gpu -target-accel=nvidia80 -o forces.x forces.c # Spock/Crusher (AMD ROCm compiler) diff --git 
a/libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces.py b/libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces.py index 3c4a296706..0d4bc23bad 100644 --- a/libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces.py +++ b/libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces.py @@ -9,47 +9,49 @@ from libensemble.tools import parse_args, add_unique_random_streams from libensemble.executors import MPIExecutor -# Parse number of workers, comms type, etc. from arguments -nworkers, is_manager, libE_specs, _ = parse_args() - -# Initialize MPI Executor instance -exctr = MPIExecutor() - -# Register simulation executable with executor -sim_app = os.path.join(os.getcwd(), "../forces_app/forces.x") - -if not os.path.isfile(sim_app): - sys.exit("forces.x not found - please build first in ../forces_app dir") - -exctr.register_app(full_path=sim_app, app_name="forces") - -# State the sim_f, inputs, outputs -sim_specs = { - "sim_f": run_forces, # sim_f, imported above - "in": ["x"], # Name of input for sim_f - "out": [("energy", float)], # Name, type of output from sim_f -} - -# State the gen_f, inputs, outputs, additional parameters -gen_specs = { - "gen_f": uniform_random_sample, # Generator function - "in": [], # Generator input - "out": [("x", float, (1,))], # Name, type and size of data from gen_f - "user": { - "lb": np.array([1000]), # User parameters for the gen_f - "ub": np.array([3000]), - "gen_batch_size": 8, - }, -} - -# Create and work inside separate per-simulation directories -libE_specs["sim_dirs_make"] = True - -# Instruct libEnsemble to exit after this many simulations -exit_criteria = {"sim_max": 8} - -# Seed random streams for each worker, particularly for gen_f -persis_info = add_unique_random_streams({}, nworkers + 1) - -# Launch libEnsemble -H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info=persis_info, libE_specs=libE_specs) +if __name__ == "__main__": + + # Parse number 
of workers, comms type, etc. from arguments + nworkers, is_manager, libE_specs, _ = parse_args() + + # Initialize MPI Executor instance + exctr = MPIExecutor() + + # Register simulation executable with executor + sim_app = os.path.join(os.getcwd(), "../forces_app/forces.x") + + if not os.path.isfile(sim_app): + sys.exit("forces.x not found - please build first in ../forces_app dir") + + exctr.register_app(full_path=sim_app, app_name="forces") + + # State the sim_f, inputs, outputs + sim_specs = { + "sim_f": run_forces, # sim_f, imported above + "in": ["x"], # Name of input for sim_f + "out": [("energy", float)], # Name, type of output from sim_f + } + + # State the gen_f, inputs, outputs, additional parameters + gen_specs = { + "gen_f": uniform_random_sample, # Generator function + "in": [], # Generator input + "out": [("x", float, (1,))], # Name, type and size of data from gen_f + "user": { + "lb": np.array([1000]), # User parameters for the gen_f + "ub": np.array([3000]), + "gen_batch_size": 8, + }, + } + + # Create and work inside separate per-simulation directories + libE_specs["sim_dirs_make"] = True + + # Instruct libEnsemble to exit after this many simulations + exit_criteria = {"sim_max": 8} + + # Seed random streams for each worker, particularly for gen_f + persis_info = add_unique_random_streams({}, nworkers + 1) + + # Launch libEnsemble + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info=persis_info, libE_specs=libE_specs) diff --git a/libensemble/tests/unit_tests/test_executor.py b/libensemble/tests/unit_tests/test_executor.py index 5b9e52fecc..9f41be02cf 100644 --- a/libensemble/tests/unit_tests/test_executor.py +++ b/libensemble/tests/unit_tests/test_executor.py @@ -7,10 +7,13 @@ import time import pytest import socket -import mpi4py +import platform -mpi4py.rc.initialize = False -from mpi4py import MPI +if platform.system() != "Windows": + import mpi4py + + mpi4py.rc.initialize = False + from mpi4py import MPI from 
libensemble.resources.mpi_resources import MPIResourcesException from libensemble.executors.executor import Executor, ExecutorException, TimeoutExpired @@ -202,6 +205,9 @@ def polling_loop_multitask(exctr, task_list, timeout_sec=4.0, delay=0.05): # Tests ======================================================================================== + + +@pytest.mark.extra def test_launch_and_poll(): """Test of launching and polling task and exiting on task finish""" print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -216,6 +222,7 @@ def test_launch_and_poll(): assert task.run_attempts == 1, "task.run_attempts should be 1. Returned " + str(task.run_attempts) +@pytest.mark.extra def test_launch_and_wait(): """Test of launching and waiting on task""" print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -232,6 +239,7 @@ def test_launch_and_wait(): assert task.state == "FINISHED", "task.state should be FINISHED. Returned " + str(task.state) +@pytest.mark.extra def test_launch_and_wait_timeout(): """Test of launching and waiting on task timeout (and kill)""" print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -251,6 +259,7 @@ def test_launch_and_wait_timeout(): assert task.state == "USER_KILLED", "task.state should be USER_KILLED. Returned " + str(task.state) +@pytest.mark.extra def test_launch_wait_on_start(): """Test of launching task with wait_on_start""" print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -267,6 +276,7 @@ def test_launch_wait_on_start(): assert task.state == "FINISHED", "task.state should be FINISHED. Returned " + str(task.state) +@pytest.mark.extra def test_kill_on_file(): """Test of killing task based on something in output file""" print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -280,6 +290,7 @@ def test_kill_on_file(): assert task.state == "USER_KILLED", "task.state should be USER_KILLED. 
Returned " + str(task.state) +@pytest.mark.extra def test_kill_on_timeout(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -292,6 +303,7 @@ def test_kill_on_timeout(): assert task.state == "USER_KILLED", "task.state should be USER_KILLED. Returned " + str(task.state) +@pytest.mark.extra def test_kill_on_timeout_polling_loop_method(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -304,6 +316,7 @@ def test_kill_on_timeout_polling_loop_method(): assert task.state == "USER_KILLED", "task.state should be USER_KILLED. Returned " + str(task.state) +@pytest.mark.extra def test_launch_and_poll_multitasks(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -326,6 +339,7 @@ def test_launch_and_poll_multitasks(): assert task.state == "FINISHED", "task.state should be FINISHED. Returned " + str(task.state) +@pytest.mark.extra def test_get_task(): """Return task from given task id""" print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -351,6 +365,7 @@ def test_get_task(): assert A is None, "Task found when supplied taskid should not exist" +@pytest.mark.extra @pytest.mark.timeout(30) def test_procs_and_machinefile_logic(): """Test of supplying various input configurations.""" @@ -437,6 +452,7 @@ def test_procs_and_machinefile_logic(): assert task.state == "FINISHED", "task.state should be FINISHED. Returned " + str(task.state) +@pytest.mark.extra @pytest.mark.timeout(20) def test_doublekill(): """Test attempt to kill already killed task @@ -460,6 +476,7 @@ def test_doublekill(): assert task.state == "USER_KILLED", "task.state should be USER_KILLED. Returned " + str(task.state) +@pytest.mark.extra @pytest.mark.timeout(20) def test_finish_and_kill(): """Test attempt to kill already finished task @@ -486,6 +503,7 @@ def test_finish_and_kill(): assert task.state == "FINISHED", "task.state should be FINISHED. 
Returned " + str(task.state) +@pytest.mark.extra @pytest.mark.timeout(20) def test_launch_and_kill(): """Test launching and immediately killing tasks with no poll""" @@ -506,6 +524,7 @@ def test_launch_and_kill(): assert task.state == "USER_KILLED", "task.state should be USER_KILLED. Returned " + str(task.state) +@pytest.mark.extra def test_launch_as_gen(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -536,6 +555,7 @@ def test_launch_as_gen(): assert 0 +@pytest.mark.extra def test_launch_no_app(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor_noapp() @@ -556,6 +576,7 @@ def test_launch_no_app(): assert 0 +@pytest.mark.extra def test_kill_task_with_no_submit(): from libensemble.executors.executor import Task @@ -586,6 +607,7 @@ def test_kill_task_with_no_submit(): assert 0 +@pytest.mark.extra def test_poll_task_with_no_submit(): from libensemble.executors.executor import Task @@ -606,6 +628,7 @@ def test_poll_task_with_no_submit(): assert 0 +@pytest.mark.extra def test_task_failure(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -618,6 +641,7 @@ def test_task_failure(): assert task.state == "FAILED", "task.state should be FAILED. Returned " + str(task.state) +@pytest.mark.extra def test_retries_launch_fail(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor_fakerunner() @@ -631,6 +655,7 @@ def test_retries_launch_fail(): assert task.run_attempts == 5, "task.run_attempts should be 5. Returned " + str(task.run_attempts) +@pytest.mark.extra def test_retries_run_fail(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -644,6 +669,7 @@ def test_retries_run_fail(): assert task.run_attempts == 5, "task.run_attempts should be 5. 
Returned " + str(task.run_attempts) +@pytest.mark.extra def test_register_apps(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() # This registers an app my_simtask.x (default sim) @@ -689,6 +715,7 @@ def test_register_apps(): # assert e.args[1] == "Registered applications: ['my_simtask.x', 'fake_app1', 'fake_app2']" +@pytest.mark.extra def test_serial_exes(): setup_serial_executor() exctr = Executor.executor @@ -699,6 +726,7 @@ def test_serial_exes(): assert task.state == "FINISHED", "task.state should be FINISHED. Returned " + str(task.state) +@pytest.mark.extra def test_serial_startup_times(): setup_executor_startups() exctr = Executor.executor @@ -724,6 +752,7 @@ def test_serial_startup_times(): assert 0 < startup_time < 1, "Start up time for python program took " + str(startup_time) +@pytest.mark.extra def test_futures_interface(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -737,6 +766,7 @@ def test_futures_interface(): assert task.done(), "task.done() should return True after task finishes." +@pytest.mark.extra def test_futures_interface_cancel(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_executor() @@ -749,6 +779,7 @@ def test_futures_interface_cancel(): assert task.cancelled() and task.done(), "Task should be both cancelled() and done() after cancellation." 
+@pytest.mark.extra def test_dry_run(): """Test of dry_run in poll""" print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -761,9 +792,9 @@ def test_dry_run(): task.kill() +@pytest.mark.extra def test_non_existent_app(): """Tests exception on non-existent app""" - from libensemble.executors.executor import Executor exctr = Executor() @@ -781,9 +812,9 @@ def test_non_existent_app(): assert 0 +@pytest.mark.extra def test_non_existent_app_mpi(): """Tests exception on non-existent app""" - from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() diff --git a/libensemble/tests/unit_tests/test_executor_balsam.py b/libensemble/tests/unit_tests/test_executor_balsam.py new file mode 100644 index 0000000000..8c64f57d23 --- /dev/null +++ b/libensemble/tests/unit_tests/test_executor_balsam.py @@ -0,0 +1,251 @@ +# !/usr/bin/env python +# Integration Test of executor module for libensemble +# Test does not require running full libensemble +import os +import sys +import mock +import pytest +import datetime +from dataclasses import dataclass + +from libensemble.executors.executor import ( + Executor, + Application, + ExecutorException, + TimeoutExpired, +) + + +# fake Balsam app +class TestLibeApp: + site = "libe-unit-test" + command_template = "python simdir/py_startup.py" + + def sync(): + pass + + +# fake EventLog object +@dataclass +class LogEventTest: + timestamp: datetime.datetime = None + + +def setup_module(module): + try: + print("setup_module module:%s" % module.__name__) + except AttributeError: + print("setup_module (direct run) module:%s" % module) + if Executor.executor is not None: + del Executor.executor + Executor.executor = None + + +def teardown_module(module): + try: + print("teardown_module module:%s" % module.__name__) + except AttributeError: + print("teardown_module (direct run) module:%s" % module) + if Executor.executor is not None: + del Executor.executor + Executor.executor = None + + +# This would typically be in the 
user calling script +def setup_executor(): + """Set up a Balsam Executor with sim app""" + from libensemble.executors.balsam_executors import BalsamExecutor + + exctr = BalsamExecutor() # noqa F841 + + +# Tests ======================================================================================== + + +@pytest.mark.extra +def test_register_app(): + """Test of registering an App""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + setup_executor() + exctr = Executor.executor + + exctr.serial_setup() # does nothing, compatibility with legacy-balsam-exctr + exctr.add_app("hello", "world") # does nothing, compatibility with legacy-balsam-exctr + exctr.set_resources("hello") # does nothing, compatibility with other executors + + exctr.register_app(TestLibeApp, calc_type="sim", precedent="fake/dir") + assert isinstance( + exctr.apps["python"], Application + ), "Application object not created based on registered Balsam AppDef" + + exctr.register_app(TestLibeApp, app_name="test") + assert isinstance( + exctr.apps["test"], Application + ), "Application object not created based on registered Balsam AppDef" + + +@pytest.mark.extra +def test_submit_app_defaults(): + """Test of submitting an App""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.Job"): + task = exctr.submit(calc_type="sim") + task = exctr.submit(app_name="test") + + assert task in exctr.list_of_tasks, "new task not added to executor's list of tasks" + + assert task == exctr.get_task(task.id), "task retrieved via task ID doesn't match new task" + + with pytest.raises(ExecutorException): + task = exctr.submit() + pytest.fail("Expected exception") + + +@pytest.mark.extra +def test_submit_app_workdir(): + """Test of submitting an App with a workdir""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + with 
mock.patch("libensemble.executors.balsam_executors.balsam_executor.Job"): + task = exctr.submit(calc_type="sim", workdir="output", machinefile="nope") + + assert task.workdir == os.path.join(exctr.workflow_name, "output"), "workdir not properly defined for new task" + + +@pytest.mark.extra +def test_submit_app_dry(): + """Test of dry-run submitting an App""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + task = exctr.submit(calc_type="sim", dry_run=True) + task.poll() + + assert all([task.dry_run, task.done()]), "new task from dry_run wasn't marked as such, or set as done" + + +@pytest.mark.extra +def test_submit_app_wait(): + """Test of exctr.submit blocking until app is running""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.Job") as job: + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.EventLog") as log: + job.return_value.state = "RUNNING" + log.objects.filter.return_value = [ + LogEventTest(timestamp=datetime.datetime(2022, 4, 21, 20, 29, 33, 455144)) + ] + task = exctr.submit(calc_type="sim", wait_on_start=True) + assert task.running(), "new task is not marked as running after wait_on_start" + + log.objects.filter.return_value = [LogEventTest(timestamp=None)] + task = exctr.submit(calc_type="sim", wait_on_start=True) + assert task.runtime == 0, "runtime should be 0 without Balsam timestamp evaluated" + + +@pytest.mark.extra +def test_submit_revoke_alloc(): + """Test creating and revoking BatchJob objects through the executor""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.BatchJob"): + alloc = exctr.submit_allocation(site_id="libe-unit-test", num_nodes=1, wall_time_min=30) + + assert alloc in exctr.allocations, "batchjob object not appended to executor's 
list of allocations" + + alloc.scheduler_id = None + assert not exctr.revoke_allocation( + alloc, timeout=3 + ), "unable to revoke allocation if Balsam never returns scheduler ID" + + alloc.scheduler_id = 1 + assert exctr.revoke_allocation( + alloc, timeout=3 + ), "should've been able to revoke allocation if scheduler ID available" + + +@pytest.mark.extra +def test_task_poll(): + """Test of polling a balsam app""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.Job") as job: + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.EventLog"): + task = exctr.submit(calc_type="sim") + + job.return_value.state = "PREPROCESSED" + task.poll() + assert task.state == "WAITING", "task should've been considered waiting based on balsam state" + + job.return_value.state = "FAILED" + task.poll() + assert task.state == "FAILED", "task should've been considered failed based on balsam state" + + task = exctr.submit(calc_type="sim") + + job.return_value.state = "JOB_FINISHED" + task.poll() + assert task.state == "FINISHED", "task was not finished after wait method" + + assert not task.running(), "task shouldn't be running after wait method returns" + + assert task.done(), "task should be 'done' after wait method" + + +@pytest.mark.extra +def test_task_wait(): + """Test of waiting on a balsam app""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.Job") as job: + with mock.patch( + "libensemble.executors.balsam_executors.balsam_executor.EventLog" + ): # need to patch since wait polls + task = exctr.submit(calc_type="sim") + + job.return_value.state = "RUNNING" + with pytest.raises(TimeoutExpired): + task.wait(timeout=3) + pytest.fail("Expected exception") + + job.return_value.state = "JOB_FINISHED" +
task.wait(timeout=3) + task.wait(timeout=3) # should return immediately since self._check_poll() should return False + assert task.state == "FINISHED", "task was not finished after wait method" + assert not task.running(), "task shouldn't be running after wait method returns" + assert task.done(), "task should be 'done' after wait method" + + task = exctr.submit(calc_type="sim", dry_run=True) + task.wait() # should also return immediately since dry_run + + task = exctr.submit(calc_type="sim") + job.return_value.state = "FAILED" + task.wait(timeout=3) + assert task.state == "FAILED", "Matching Balsam state should've been assigned to task" + + +@pytest.mark.extra +def test_task_kill(): + """Test of killing (cancelling) a balsam app""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + exctr = Executor.executor + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.Job"): + task = exctr.submit(calc_type="sim") + + with mock.patch("libensemble.executors.balsam_executors.balsam_executor.EventLog"): + task.kill() + assert task.finished and task.state == "USER_KILLED", "task not set as killed after kill method" + + +if __name__ == "__main__": + setup_module(__file__) + test_register_app() + test_submit_app_defaults() + test_submit_app_workdir() + test_submit_app_dry() + test_submit_app_wait() + test_submit_revoke_alloc() + test_task_poll() + test_task_wait() + test_task_kill() + teardown_module(__file__) diff --git a/libensemble/tests/unit_tests/test_launcher.py b/libensemble/tests/unit_tests/test_launcher.py index d19dd5a309..951eb287dc 100644 --- a/libensemble/tests/unit_tests/test_launcher.py +++ b/libensemble/tests/unit_tests/test_launcher.py @@ -5,6 +5,7 @@ """ import sys +import pytest import libensemble.utils.launcher as launcher @@ -59,6 +60,7 @@ def xtest_submit(): launcher.cancel(process, 0) +@pytest.mark.extra def test_launch32(): "If we are in Python > 3.2, still check that 3.2 wait func works" saved_wait = launcher.wait @@ 
-67,6 +69,7 @@ def test_launch32(): launcher.wait = saved_wait +@pytest.mark.extra def test_launch33(): "If we are in Python > 3.2, also check the new-style wait func" if launcher.wait == launcher.wait_py33: diff --git a/libensemble/tests/unit_tests/test_libE_main.py b/libensemble/tests/unit_tests/test_libE_main.py index 0e183e4258..b8cbd70c0e 100644 --- a/libensemble/tests/unit_tests/test_libE_main.py +++ b/libensemble/tests/unit_tests/test_libE_main.py @@ -253,6 +253,7 @@ def test_checking_inputs_single(): check_inputs(libE_specs=libE_specs) +@pytest.mark.extra def test_logging_disabling(): remove_file_if_exists("ensemble.log") remove_file_if_exists("libE_stats.txt") diff --git a/libensemble/tests/unit_tests/test_make_runners.py b/libensemble/tests/unit_tests/test_make_runners.py new file mode 100644 index 0000000000..0628ca88af --- /dev/null +++ b/libensemble/tests/unit_tests/test_make_runners.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest +import mock + +import libensemble.tests.unit_tests.setup as setup +from libensemble.tools.fields_keys import libE_fields +from libensemble.message_numbers import EVAL_SIM_TAG, EVAL_GEN_TAG +from libensemble.utils.runners import Runners + + +def get_ufunc_args(): + sim_specs, gen_specs, exit_criteria = setup.make_criteria_and_specs_0() + + L = exit_criteria["sim_max"] + H = np.zeros(L, dtype=list(set(libE_fields + sim_specs["out"] + gen_specs["out"]))) + + H["sim_id"][-L:] = -1 + H["sim_started_time"][-L:] = np.inf + + sim_ids = np.zeros(1, dtype=int) + Work = { + "tag": EVAL_SIM_TAG, + "persis_info": {}, + "libE_info": {"H_rows": sim_ids}, + "H_fields": sim_specs["in"], + } + calc_in = H[Work["H_fields"]][Work["libE_info"]["H_rows"]] + return calc_in, sim_specs, gen_specs + + +@pytest.mark.extra +def test_normal_runners(): + calc_in, sim_specs, gen_specs = get_ufunc_args() + + runners = Runners(sim_specs, gen_specs) + assert ( + not runners.has_funcx_sim and not runners.has_funcx_gen + ), "funcX use should not be 
detected without setting endpoint fields" + + ro = runners.make_runners() + assert all( + [i in ro for i in [EVAL_SIM_TAG, EVAL_GEN_TAG]] + ), "Both user function tags should be included in runners dictionary" + + +@pytest.mark.extra +def test_normal_no_gen(): + calc_in, sim_specs, gen_specs = get_ufunc_args() + + runners = Runners(sim_specs, {}) + ro = runners.make_runners() + + assert not ro[2], "generator function shouldn't be provided if not using gen_specs" + + +@pytest.mark.extra +def test_funcx_runner_init(): + calc_in, sim_specs, gen_specs = get_ufunc_args() + + sim_specs["funcx_endpoint"] = "1234" + + with mock.patch("funcx.FuncXClient"): + + runners = Runners(sim_specs, gen_specs) + + assert ( + runners.funcx_exctr is not None + ), "FuncXExecutor should have been instantiated when funcx_endpoint found in specs" + + +@pytest.mark.extra +def test_funcx_runner_pass(): + calc_in, sim_specs, gen_specs = get_ufunc_args() + + sim_specs["funcx_endpoint"] = "1234" + + with mock.patch("funcx.FuncXClient"): + + runners = Runners(sim_specs, gen_specs) + + # Creating Mock funcXExecutor and funcX future object - no exception + funcx_mock = mock.Mock() + funcx_future = mock.Mock() + funcx_mock.submit.return_value = funcx_future + funcx_future.exception.return_value = None + funcx_future.result.return_value = (True, True) + + runners.funcx_exctr = funcx_mock + ro = runners.make_runners() + + libE_info = {"H_rows": np.array([2, 3, 4]), "workerID": 1, "comm": "fakecomm"} + out, persis_info = ro[1](calc_in, {}, libE_info) + + assert all([out, persis_info]), "funcX runner correctly returned results" + + +@pytest.mark.extra +def test_funcx_runner_fail(): + calc_in, sim_specs, gen_specs = get_ufunc_args() + + gen_specs["funcx_endpoint"] = "4321" + + with mock.patch("funcx.FuncXClient"): + + runners = Runners(sim_specs, gen_specs) + + # Creating Mock funcXExecutor and funcX future object - yes exception + funcx_mock = mock.Mock() + funcx_future = mock.Mock() + 
funcx_mock.submit.return_value = funcx_future + funcx_future.exception.return_value = Exception + + runners.funcx_exctr = funcx_mock + ro = runners.make_runners() + + libE_info = {"H_rows": np.array([2, 3, 4]), "workerID": 1, "comm": "fakecomm"} + + with pytest.raises(Exception): + out, persis_info = ro[2](calc_in, {}, libE_info) + pytest.fail("Expected exception") + + +if __name__ == "__main__": + test_normal_runners() + test_normal_no_gen() + test_funcx_runner_init() + test_funcx_runner_pass() + test_funcx_runner_fail() diff --git a/libensemble/tests/unit_tests/test_manager_main.py b/libensemble/tests/unit_tests/test_manager_main.py index 0d9404f91e..a75d320fad 100644 --- a/libensemble/tests/unit_tests/test_manager_main.py +++ b/libensemble/tests/unit_tests/test_manager_main.py @@ -1,14 +1,19 @@ import time +import pytest +import platform import numpy as np import numpy.lib.recfunctions -from mpi4py import MPI import libensemble.manager as man import libensemble.tests.unit_tests.setup as setup -libE_specs = {"mpi_comm": MPI.COMM_WORLD} +if platform.system() != "Windows": + from mpi4py import MPI + libE_specs = {"mpi_comm": MPI.COMM_WORLD} + +@pytest.mark.extra def test_term_test_1(): # termination_test should be True when we want to stop @@ -19,6 +24,7 @@ def test_term_test_1(): assert not mgr.term_test() +@pytest.mark.extra def test_term_test_2(): # Test 2 - these could also be sep - with a setup or fixture.... # Shouldn't terminate @@ -39,6 +45,7 @@ def test_term_test_2(): assert mgr.term_test() +@pytest.mark.extra def test_term_test_3(): # Test 3. 
# Terminate because enough time has passed diff --git a/libensemble/tests/unit_tests/test_mpi4py.py b/libensemble/tests/unit_tests/test_mpi4py.py index f4a23980ee..606c38f79a 100644 --- a/libensemble/tests/unit_tests/test_mpi4py.py +++ b/libensemble/tests/unit_tests/test_mpi4py.py @@ -1,3 +1,7 @@ +import pytest + + +@pytest.mark.extra def test_mpi4py(): from mpi4py import MPI diff --git a/libensemble/tests/unit_tests/test_persistent_aposmm.py b/libensemble/tests/unit_tests/test_persistent_aposmm.py index ea68193f0b..7b5fa7749d 100644 --- a/libensemble/tests/unit_tests/test_persistent_aposmm.py +++ b/libensemble/tests/unit_tests/test_persistent_aposmm.py @@ -1,12 +1,14 @@ import pytest +import platform import multiprocessing -multiprocessing.set_start_method("fork", force=True) - import libensemble.gen_funcs libensemble.gen_funcs.rc.aposmm_optimizers = "nlopt" -from libensemble.gen_funcs.persistent_aposmm import aposmm, update_history_optimal + +if platform.system() in ["Linux", "Darwin"]: + multiprocessing.set_start_method("fork", force=True) + from libensemble.gen_funcs.persistent_aposmm import aposmm, update_history_optimal import numpy as np import libensemble.tests.unit_tests.setup as setup diff --git a/libensemble/tests/unit_tests/test_scipy.py b/libensemble/tests/unit_tests/test_scipy.py index cb8347bd72..e50a525229 100644 --- a/libensemble/tests/unit_tests/test_scipy.py +++ b/libensemble/tests/unit_tests/test_scipy.py @@ -10,7 +10,6 @@ def test_cdist_issue(): pytest.skip("scipy or its dependencies not importable. 
Skipping.") """There is an issue (at least in scipy 1.1.0) with cdist segfaulting.""" - H = np.zeros( 20, dtype=[ @@ -40,7 +39,6 @@ def test_cdist_issue(): @pytest.mark.extra def test_save(): """Seeing if I can save parts of the H array.""" - from libensemble.tests.regression_tests.support import uniform_or_localopt_gen_out as gen_out n = 2 diff --git a/libensemble/tests/unit_tests/test_sim_dir_properties.py b/libensemble/tests/unit_tests/test_sim_dir_properties.py index 71fc5759b7..29e79370d0 100644 --- a/libensemble/tests/unit_tests/test_sim_dir_properties.py +++ b/libensemble/tests/unit_tests/test_sim_dir_properties.py @@ -1,44 +1,40 @@ import os +import pytest import shutil import numpy as np from libensemble.output_directory import EnsembleDirectory from libensemble.utils.loc_stack import LocationStack +from libensemble.utils.misc import extract_H_ranges def test_range_single_element(): """Single H_row labeling""" - work = {"H_fields": ["x", "num_nodes", "procs_per_node"], "libE_info": {"H_rows": np.array([5]), "workerID": 1}} - assert EnsembleDirectory.extract_H_ranges(work) == "5", "Failed to correctly parse single H row" + assert extract_H_ranges(work) == "5", "Failed to correctly parse single H row" def test_range_two_separate_elements(): """Multiple H_rows, non-sequential""" - work = {"H_fields": ["x", "num_nodes", "procs_per_node"], "libE_info": {"H_rows": np.array([2, 8]), "workerID": 1}} - assert EnsembleDirectory.extract_H_ranges(work) == "2_8", "Failed to correctly parse nonsequential H rows" + assert extract_H_ranges(work) == "2_8", "Failed to correctly parse nonsequential H rows" def test_range_two_ranges(): """Multiple sequences of H_rows""" - work = { "H_fields": ["x", "num_nodes", "procs_per_node"], "libE_info": {"H_rows": np.array([0, 1, 2, 3, 7, 8]), "workerID": 1}, } - assert EnsembleDirectory.extract_H_ranges(work) == "0-3_7-8", "Failed to correctly parse multiple H ranges" + assert extract_H_ranges(work) == "0-3_7-8", "Failed to correctly 
parse multiple H ranges" def test_range_mixes(): """Mix of single rows and sequences of H_rows""" - work = { "H_fields": ["x", "num_nodes", "procs_per_node"], "libE_info": {"H_rows": np.array([2, 3, 4, 6, 8, 9, 11, 14]), "workerID": 1}, } - assert ( - EnsembleDirectory.extract_H_ranges(work) == "2-4_6_8-9_11_14" - ), "Failed to correctly parse H row single elements and ranges." + assert extract_H_ranges(work) == "2-4_6_8-9_11_14", "Failed to correctly parse H row single elements and ranges." def test_copy_back(): @@ -88,6 +84,7 @@ def test_worker_dirs_but_no_sim_dirs(): shutil.rmtree(dir) +@pytest.mark.extra def test_loc_stack_FileExists_exceptions(): inputdir = "./calc" copyfile = "./calc/copy" diff --git a/libensemble/tests/unit_tests_logger/test_logger.py b/libensemble/tests/unit_tests_logger/test_logger.py index eaaf811e6f..18555dd517 100644 --- a/libensemble/tests/unit_tests_logger/test_logger.py +++ b/libensemble/tests/unit_tests_logger/test_logger.py @@ -72,7 +72,10 @@ def test_set_filename(): with open(alt_name, "r") as f: line = f.readline() assert "Cannot set filename after loggers initialized" in line - os.remove(alt_name) + try: + os.remove(alt_name) + except PermissionError: # windows only + pass logs = LogConfig.config logs.logger_set = True diff --git a/libensemble/tests/unit_tests_nompi/conftest.py b/libensemble/tests/unit_tests_nompi/conftest.py new file mode 100644 index 0000000000..3909b231f0 --- /dev/null +++ b/libensemble/tests/unit_tests_nompi/conftest.py @@ -0,0 +1,21 @@ +# https://stackoverflow.com/questions/47559524/pytest-how-to-skip-tests-unless-you-declare-an-option-flag/61193490#61193490 + +import pytest + + +def pytest_addoption(parser): + parser.addoption("--runextra", action="store_true", default=False, help="run extra tests") + + +def pytest_configure(config): + config.addinivalue_line("markers", "extra: mark test as extra to run") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runextra"): + # 
--runextra given in cli: do not skip extra tests + return + skip_extra = pytest.mark.skip(reason="need --runextra option to run") + for item in items: + if "extra" in item.keywords: + item.add_marker(skip_extra) diff --git a/libensemble/tests/unit_tests_nompi/test_aaa_comms.py b/libensemble/tests/unit_tests_nompi/test_aaa_comms.py index 180bba29f6..6dde4ff492 100644 --- a/libensemble/tests/unit_tests_nompi/test_aaa_comms.py +++ b/libensemble/tests/unit_tests_nompi/test_aaa_comms.py @@ -9,6 +9,7 @@ """ import time +import pytest import signal import libensemble.comms.comms as comms @@ -55,6 +56,7 @@ def ignore_handler(signum, frame): print("Ignoring SIGTERM") +@pytest.mark.extra def test_qcomm_proc_terminate3(): "Test that a QCommProcess ignoring SIGTERM manages." diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index e6d3edfb18..79ae6ed31e 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -3,7 +3,7 @@ from libensemble.message_numbers import EVAL_SIM_TAG, EVAL_GEN_TAG from libensemble.resources.resources import Resources from libensemble.resources.scheduler import ResourceScheduler, InsufficientFreeResources # noqa: F401 -from libensemble.output_directory import EnsembleDirectory +from libensemble.utils.misc import extract_H_ranges logger = logging.getLogger(__name__) # For debug messages - uncomment @@ -179,9 +179,7 @@ def sim_work(self, wid, H, H_fields, H_rows, persis_info, **libE_info): } logger.debug( - "Alloc func packing SIM work for worker {}. Packing sim_ids: {}".format( - wid, EnsembleDirectory.extract_H_ranges(work) or None - ) + "Alloc func packing SIM work for worker {}. Packing sim_ids: {}".format(wid, extract_H_ranges(work) or None) ) return work @@ -223,9 +221,7 @@ def gen_work(self, wid, H_fields, H_rows, persis_info, **libE_info): } logger.debug( - "Alloc func packing GEN work for worker {}. 
Packing sim_ids: {}".format( - wid, EnsembleDirectory.extract_H_ranges(work) or None - ) + "Alloc func packing GEN work for worker {}. Packing sim_ids: {}".format(wid, extract_H_ranges(work) or None) ) return work diff --git a/libensemble/tools/parse_args.py b/libensemble/tools/parse_args.py index d365b33713..469ad443e1 100644 --- a/libensemble/tools/parse_args.py +++ b/libensemble/tools/parse_args.py @@ -18,7 +18,10 @@ ) parser.add_argument("--nworkers", type=int, nargs="?", help="Number of local forked processes") parser.add_argument( - "--nsim_workers", type=int, nargs="?", help="Number of workers for sims. 1+ zero-resource gen worker will be added" + "--nsim_workers", + type=int, + nargs="?", + help="Number of workers for sims. 1+ unresourced workers for a persistent generator will be added", ) parser.add_argument("--nresource_sets", type=int, nargs="?", help="Number of resource sets") parser.add_argument("--workers", type=str, nargs="+", help="List of worker nodes") @@ -56,14 +59,14 @@ def _mpi_parse_args(args): # Convenience option which sets other libE_specs options. 
nsim_workers = args.nsim_workers if nsim_workers is not None: - libE_specs["zero_resource_workers"] = _get_zrw(nworkers, nsim_workers) + # libE_specs["zero_resource_workers"] = _get_zrw(nworkers, nsim_workers) + libE_specs["num_resource_sets"] = libE_specs.get("num_resource_sets", nsim_workers) return nworkers, is_manager, libE_specs, args.tester_args def _local_parse_args(args): """Parses arguments for forked processes using multiprocessing.""" - libE_specs = {"comms": "local"} nworkers = args.nworkers @@ -74,7 +77,8 @@ def _local_parse_args(args): nsim_workers = args.nsim_workers if nsim_workers is not None: nworkers = nworkers or nsim_workers + 1 - libE_specs["zero_resource_workers"] = _get_zrw(nworkers, nsim_workers) + # libE_specs["zero_resource_workers"] = _get_zrw(nworkers, nsim_workers) + libE_specs["num_resource_sets"] = libE_specs.get("num_resource_sets", nsim_workers) nworkers = nworkers or 4 libE_specs["nworkers"] = nworkers @@ -178,21 +182,23 @@ def parse_args(): --comms, Communications medium for manager and workers. Default is 'mpi'. --nworkers, (For 'local' or 'tcp' comms) Set number of workers. - --nsim_workers, (For 'local' or 'mpi' comms) A convenience option for common cases. - If used with no other criteria, will generate one additional - zero-resource worker for use as a generator. If the number of workers - has also been specified, will generate enough zero-resource workers to - match the other criteria. --nresource_sets, Explicitly set the number of resource sets. This sets libE_specs['num_resource_sets']. By default, resources will be divided by workers (excluding zero_resource_workers). + --nsim_workers, (For 'local' or 'mpi' comms) A convenience option for cases with + persistent generators - sets the number of simulation workers. + If used with no other criteria, one additional worker for running a + generator will be added, and the number of resource sets will be assigned + the given value. 
If '--nworkers' has also been specified, will generate + enough additional workers to match the other criteria. If '--nresource_sets' + is also specified, will not override resource sets. Example command lines: Run with 'local' comms and 4 workers $ python calling_script --comms local --nworkers 4 - Run with 'local' comms and 5 workers - one gen (no resources), and 4 sims. + Run with 'local' comms and 5 workers - one gen worker (no resources), and 4 sim workers. $ python calling_script --comms local --nsim_workers 4 Run with 'local' comms with 4 workers and 8 resource sets. The extra resource sets will diff --git a/libensemble/tools/persistent_support.py b/libensemble/tools/persistent_support.py index 0f302c077e..fbe5ed5ee5 100644 --- a/libensemble/tools/persistent_support.py +++ b/libensemble/tools/persistent_support.py @@ -1,5 +1,6 @@ from libensemble.message_numbers import STOP_TAG, PERSIS_STOP, UNSET_TAG, EVAL_GEN_TAG, EVAL_SIM_TAG, calc_type_strings import logging +import numpy as np logger = logging.getLogger(__name__) @@ -24,12 +25,16 @@ def __init__(self, libE_info, calc_type): ], "The calc_type: {} specifies neither a simulator nor generator.".format(self.calc_type) self.calc_str = calc_type_strings[self.calc_type] - def send(self, output, calc_status=UNSET_TAG): + def send(self, output, calc_status=UNSET_TAG, keep_state=False): """ Send message from worker to manager. :param output: Output array to be sent to manager :param calc_status: Optional, Provides a task status + :param keep_state: Optional, If True the manager will not modify its + record of the workers state (usually the manager changes the + worker's state to inactive, indicating the worker is ready to receive + more work, unless using active receive mode). 
:returns: None @@ -41,6 +46,8 @@ def send(self, output, calc_status=UNSET_TAG): else: libE_info = self.libE_info + libE_info["keep_state"] = keep_state + D = { "calc_out": output, "libE_info": libE_info, @@ -54,6 +61,8 @@ def recv(self, blocking=True): """ Receive message to worker from manager. + :param blocking: Optional, If True (default), will block until a message is received. + :returns: message tag, Work dictionary, calc_in array """ @@ -102,3 +111,16 @@ def send_recv(self, output, calc_status=UNSET_TAG): """ self.send(output, calc_status) return self.recv() + + def request_cancel_sim_ids(self, sim_ids): + """Request cancellation of sim_ids + + :param sim_ids: A list of sim_ids to cancel + + A message is sent to the manager to mark requested sim_ids as cancel_requested + """ + H_o = np.zeros(len(sim_ids), dtype=[("sim_id", int), ("cancel_requested", bool)]) + H_o["sim_id"] = sim_ids + H_o["cancel_requested"] = True + print(H_o) + self.send(H_o, keep_state=True) diff --git a/libensemble/utils/misc.py b/libensemble/utils/misc.py new file mode 100644 index 0000000000..583e7edfdf --- /dev/null +++ b/libensemble/utils/misc.py @@ -0,0 +1,23 @@ +""" +Misc internal functions +""" + +from itertools import groupby +from operator import itemgetter + + +def extract_H_ranges(Work): + """Convert received H_rows into ranges for labeling""" + work_H_rows = Work["libE_info"]["H_rows"] + if len(work_H_rows) == 1: + return str(work_H_rows[0]) + else: + # From https://stackoverflow.com/a/30336492 + ranges = [] + for diff, group in groupby(enumerate(work_H_rows.tolist()), lambda x: x[0] - x[1]): + group = list(map(itemgetter(1), group)) + if len(group) > 1: + ranges.append(str(group[0]) + "-" + str(group[-1])) + else: + ranges.append(str(group[0])) + return "_".join(ranges) diff --git a/libensemble/utils/runners.py b/libensemble/utils/runners.py new file mode 100644 index 0000000000..806b68258b --- /dev/null +++ b/libensemble/utils/runners.py @@ -0,0 +1,86 @@ +import logging 
+import logging.handlers + +from libensemble.message_numbers import EVAL_SIM_TAG, EVAL_GEN_TAG + +logger = logging.getLogger(__name__) + + +class Runners: + """Determines and returns methods for workers to run user functions. + + Currently supported: direct-call and funcX + """ + + def __init__(self, sim_specs, gen_specs): + self.sim_specs = sim_specs + self.gen_specs = gen_specs + self.sim_f = sim_specs["sim_f"] + self.gen_f = gen_specs.get("gen_f") + self.has_funcx_sim = len(sim_specs.get("funcx_endpoint", "")) > 0 + self.has_funcx_gen = len(gen_specs.get("funcx_endpoint", "")) > 0 + self.funcx_exctr = None + + if any([self.has_funcx_sim, self.has_funcx_gen]): + try: + from funcx import FuncXClient + from funcx.sdk.executor import FuncXExecutor + + self.funcx_exctr = FuncXExecutor(FuncXClient()) + + except ModuleNotFoundError: + logger.warning("funcX use detected but funcX not importable. Is it installed?") + + def make_runners(self): + """Creates functions to run a sim or gen. These functions are either + called directly by the worker or submitted to a funcX endpoint.""" + + def run_sim(calc_in, persis_info, libE_info): + """Determines how to run sim.""" + if self.has_funcx_sim and self.funcx_exctr: + result = self._funcx_result + else: + result = self._normal_result + + return result(calc_in, persis_info, self.sim_specs, libE_info, self.sim_f) + + if self.gen_specs: + + def run_gen(calc_in, persis_info, libE_info): + """Determines how to run gen.""" + if self.has_funcx_gen and self.funcx_exctr: + result = self._funcx_result + else: + result = self._normal_result + + return result(calc_in, persis_info, self.gen_specs, libE_info, self.gen_f) + + else: + run_gen = [] + + return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} + + def _normal_result(self, calc_in, persis_info, specs, libE_info, user_f): + """User function called in-place""" + return user_f(calc_in, persis_info, specs, libE_info) + + def _funcx_result(self, calc_in, persis_info, specs, libE_info, 
user_f): + """User function submitted to funcX""" + from libensemble.worker import Worker + + libE_info["comm"] = None # 'comm' object not pickle-able + Worker._set_executor(0, None) # ditto for executor + + future = self.funcx_exctr.submit( + user_f, + calc_in, + persis_info, + specs, + libE_info, + endpoint_id=specs["funcx_endpoint"], + ) + remote_exc = future.exception() # blocks until exception or None + if remote_exc is None: + return future.result() + else: + raise remote_exc diff --git a/libensemble/version.py b/libensemble/version.py index a2fecb4576..ddb97c066f 100644 --- a/libensemble/version.py +++ b/libensemble/version.py @@ -1 +1 @@ -__version__ = "0.9.2" +__version__ = "0.9.2+dev" diff --git a/libensemble/worker.py b/libensemble/worker.py index ba1cdb1145..ae8e8196e7 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -17,7 +17,9 @@ from libensemble.message_numbers import calc_type_strings, calc_status_strings from libensemble.output_directory import EnsembleDirectory +from libensemble.utils.misc import extract_H_ranges from libensemble.utils.timer import Timer +from libensemble.utils.runners import Runners from libensemble.executors.executor import Executor from libensemble.resources.resources import Resources from libensemble.comms.logs import worker_logging_config @@ -134,72 +136,11 @@ def __init__(self, comm, dtypes, workerID, sim_specs, gen_specs, libE_specs): self.stats_fmt = libE_specs.get("stats_fmt", {}) self.calc_iter = {EVAL_SIM_TAG: 0, EVAL_GEN_TAG: 0} - self._run_calc = Worker._make_runners(sim_specs, gen_specs) + self._run_calc = Runners(sim_specs, gen_specs).make_runners() Worker._set_executor(self.workerID, self.comm) Worker._set_resources(self.workerID, self.comm) self.EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) - @staticmethod - def _funcx_result(funcx_exctr, user_f, calc_in, persis_info, specs, libE_info): - libE_info["comm"] = None # 'comm' object not pickle-able - Worker._set_executor(0, None) # 
ditto for executor - - future = funcx_exctr.submit(user_f, calc_in, persis_info, specs, libE_info, endpoint_id=specs["funcx_endpoint"]) - remote_exc = future.exception() # blocks until exception or None - if remote_exc is None: - return future.result() - else: - raise remote_exc - - @staticmethod - def _get_funcx_exctr(sim_specs, gen_specs): - funcx_sim = len(sim_specs.get("funcx_endpoint", "")) > 0 - funcx_gen = len(gen_specs.get("funcx_endpoint", "")) > 0 - - if any([funcx_sim, funcx_gen]): - try: - from funcx import FuncXClient - from funcx.sdk.executor import FuncXExecutor - - return FuncXExecutor(FuncXClient()), funcx_sim, funcx_gen - except ModuleNotFoundError: - logger.warning("funcX use detected but funcX not importable. Is it installed?") - return None, False, False - except Exception: - return None, False, False - else: - return None, False, False - - @staticmethod - def _make_runners(sim_specs, gen_specs): - """Creates functions to run a sim or gen. These functions are either - called directly by the worker or submitted to a funcX endpoint.""" - - funcx_exctr, funcx_sim, funcx_gen = Worker._get_funcx_exctr(sim_specs, gen_specs) - sim_f = sim_specs["sim_f"] - - def run_sim(calc_in, persis_info, libE_info): - """Calls or submits the sim func.""" - if funcx_sim and funcx_exctr: - return Worker._funcx_result(funcx_exctr, sim_f, calc_in, persis_info, sim_specs, libE_info) - else: - return sim_f(calc_in, persis_info, sim_specs, libE_info) - - if gen_specs: - gen_f = gen_specs["gen_f"] - - def run_gen(calc_in, persis_info, libE_info): - """Calls or submits the gen func.""" - if funcx_gen and funcx_exctr: - return Worker._funcx_result(funcx_exctr, gen_f, calc_in, persis_info, gen_specs, libE_info) - else: - return gen_f(calc_in, persis_info, gen_specs, libE_info) - - else: - run_gen = [] - - return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} - @staticmethod def _set_rset_team(rset_team): """Pass new rset_team to worker resources""" @@ -257,7 +198,7 @@ def 
_handle_calc(self, Work, calc_in): # from output_directory.py if calc_type == EVAL_SIM_TAG: enum_desc = "sim_id" - calc_id = EnsembleDirectory.extract_H_ranges(Work) + calc_id = extract_H_ranges(Work) else: enum_desc = "Gen no" # Use global gen count if available @@ -318,7 +259,6 @@ def _handle_calc(self, Work, calc_in): def _get_calc_msg(self, enum_desc, calc_id, calc_type, timer, status): """Construct line for libE_stats.txt file""" - calc_msg = "{} {}: {} {}".format(enum_desc, calc_id, calc_type, timer) if self.stats_fmt.get("task_timing", False) or self.stats_fmt.get("task_datetime", False): @@ -336,7 +276,6 @@ def _get_calc_msg(self, enum_desc, calc_id, calc_type, timer, status): def _recv_H_rows(self, Work): """Unpacks Work request and receives any history rows""" - libE_info = Work["libE_info"] calc_type = Work["tag"] if len(libE_info["H_rows"]) > 0: @@ -351,7 +290,6 @@ def _recv_H_rows(self, Work): def _handle(self, Work): """Handles a work request from the manager""" - # Check work request and receive second message (if needed) libE_info, calc_type, calc_in = self._recv_H_rows(Work) @@ -385,7 +323,6 @@ def _handle(self, Work): def run(self): """Runs the main worker loop.""" - try: logger.info("Worker {} initiated on node {}".format(self.workerID, socket.gethostname())) diff --git a/scripts/liberegister b/scripts/liberegister new file mode 100644 index 0000000000..c4a42671d6 --- /dev/null +++ b/scripts/liberegister @@ -0,0 +1,164 @@ +#! /usr/bin/env python + +import os +import sys +import shutil +import argparse +from pathlib import Path +from libensemble.version import __version__ +from libensemble.tools.parse_args import parser as callscript_parser + +try: + from psij import Job, JobSpec + from psij.resource_spec import ResourceSpecV1 + from psij.job_attributes import JobAttributes + from psij.serialize import Export +except ModuleNotFoundError: + print(f"*** libEnsemble {__version__} ***") + print("\nThe PSI/J Python interface is not installed. 
Please install it via the following:\n") + print(" git clone https://github.com/ExaWorks/psi-j-python.git") + print(" cd psi-j-python; pip install -e .\n") + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + prog="liberegister", + description="Produce a PSI/J representation for a libEnsemble execution.", + epilog="Output representations can be passed to `libesubmit`", + parents=[callscript_parser], + conflict_handler="resolve", + ) + + parser.add_argument("calling_script", nargs="?") + + parser.add_argument( + "-o", + "--outfile", + type=str, + nargs="?", + help="Output PSI/J representation filename.", + default="libe-job.json", + ) + + parser.add_argument( + "-n", "--nnodes", type=int, nargs="?", help="Number of nodes", default=1 + ) + + parser.add_argument( + "-p", + "--python-path", + type=Path, + nargs="?", + help="Which Python to use.", + default="python", + ) + + choices = ["cobalt", "local", "flux", "lsf", "pbspro", "rp", "slurm"] + + parser.add_argument( + "-s", + "--scheduler", + choices=choices, + help="Which scheduler to use.", + default=None, + ) + + parser.add_argument( + "-j", + "--jobname", + type=str, + nargs="?", + help="Scheduler job name.", + default="libe-job", + ) + + parser.add_argument( + "-q", "--queue", type=str, nargs="?", help="Scheduler queue name.", default=None + ) + + parser.add_argument( + "-A", + "--project", + type=str, + nargs="?", + help="Project name for billing hours.", + default=None, + ) + + parser.add_argument( + "-t", + "--wallclock", + type=int, + nargs="?", + help="Total wallclock for job.", + default=30, + ) + + parser.add_argument( + "-d", + "--directory", + type=Path, + nargs="?", + help="Working directory for job.", + default=None, + ) + + jobargs, unknown = parser.parse_known_args(sys.argv[1:]) + + if not jobargs.calling_script: + parser.print_help() + sys.exit( + "\nMust supply a calling script, with the --comms and --nworkers options" + ) + + if not jobargs.calling_script.endswith(".py"): + 
parser.print_help() + sys.exit("\nFirst argument doesn't appear to be a Python script.") + + basename = jobargs.calling_script.split(".py")[0] + outfile_default = basename + ".json" + + executable = jobargs.python_path + + if jobargs.comms == "local": + arguments = [ + jobargs.calling_script, + "--comms", + jobargs.comms, + ] + + if jobargs.nworkers: + arguments.extend(["--nworkers", str(jobargs.nworkers)]) + + resources = ResourceSpecV1(node_count=jobargs.nnodes) + else: # jobargs.comms == "mpi": + arguments = [jobargs.calling_script] + resources = ResourceSpecV1( + process_count=jobargs.nworkers + 1, processes_per_node=1 + ) + + if jobargs.nsim_workers: + arguments.extend(['--nsim_workers', str(jobargs.nsim_workers)]) + + if jobargs.nresource_sets: + arguments.extend(['--nresource_sets', str(jobargs.nresource_sets)]) + + jobspec = JobSpec( + name=jobargs.jobname, + executable=str(executable), + arguments=arguments, + directory=jobargs.directory, + environment={"PYTHONNOUSERSITE": "1"}, + resources=resources, + attributes=JobAttributes( + duration=jobargs.wallclock, + queue_name=jobargs.queue, + project_name=jobargs.project, + ), + ) + + Export().export(obj=jobspec, dest=outfile_default) + print(f"*** libEnsemble {__version__} ***") + print( + f"Exported PSI/J serialization: {outfile_default}\nOptionally adjust any fields, or specify job attributes on submission to `libesubmit`." + ) diff --git a/scripts/libesubmit b/scripts/libesubmit new file mode 100644 index 0000000000..9adca85369 --- /dev/null +++ b/scripts/libesubmit @@ -0,0 +1,229 @@ +#! 
/usr/bin/env python + +import os +import sys +import time +import shutil +import argparse +from pathlib import Path + +from libensemble.version import __version__ +from libensemble.resources import node_resources + +try: + from tqdm.auto import tqdm +except ModuleNotFoundError: + print(f"*** libEnsemble {__version__} ***") + print("\ntqdm is not installed, but this only matters if libesubmit can't find your calling script.\n") + print("\ntqdm can be installed via:\n") + print(" pip install tqdm") + +try: + from psij import JobExecutor, Import, Export, JobSpec, Job + from psij.resource_spec import ResourceSpecV1 + from psij.job_attributes import JobAttributes +except ModuleNotFoundError: + print(f"*** libEnsemble {__version__} ***") + print("\nThe PSI/J Python interface is not installed. Please install it via the following:\n") + print(" git clone https://github.com/ExaWorks/psi-j-python.git") + print(" cd psi-j-python; pip install -e .\n") + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + prog="libesubmit", + description="Submit a libEnsemble PSI/J job representation for execution. Additional options may overwrite the input file.", + conflict_handler="resolve", + ) + + choices = { + "cobalt": "aprun", + "local": "mpirun", + "flux": "mpirun", + "lsf": "jsrun", + "pbspro": "mpirun", + "rp": "mpirun", + "slurm": "srun", + } + + parser.add_argument("-s", "--scheduler", choices=choices.keys(), required=True) + + parser.add_argument( + "-w", + "--wait", + action="store_true", + help="Wait for Job to complete before exiting.", + ) + + parser.add_argument( + "--dry", + action="store_true", + help="Parameterize and re-serialize a Job, without submitting.", + ) + + parser.add_argument( + "-n", "--nnodes", type=int, nargs="?", help="Number of nodes", default=1 + ) + + parser.add_argument( + "-p", + "--python-path", + type=Path, + nargs="?", + help="Which Python to use. 
Default is current Python.", + default=sys.executable, + ) + + parser.add_argument( + "-q", "--queue", type=str, nargs="?", help="Scheduler queue name.", default=None + ) + + parser.add_argument( + "-A", + "--project", + type=str, + nargs="?", + help="Scheduler project name.", + default=None, + ) + + parser.add_argument( + "-t", + "--wallclock", + type=int, + nargs="?", + help="Total wallclock for job. Default is 30 minutes.", + default=30, + ) + + parser.add_argument( + "-d", + "--directory", + type=Path, + nargs="?", + help="Working directory for job. Default is current directory.", + default=os.getcwd(), + ) + + jobargs, unknown = parser.parse_known_args(sys.argv[1:]) + + script = sys.argv[1] + if not script.endswith(".json"): + parser.print_help() + sys.exit("First argument doesn't appear to be a .json file.") + + print(f"*** libEnsemble {__version__} ***") + print(f"Imported PSI/J serialization: {script}. Preparing submission...") + + importer = Import() + jobspec = importer.load(script) + assert isinstance(jobspec, JobSpec), "Invalid input file." + + jobspec.directory = str(jobargs.directory) + jobspec.attributes.project_name = jobargs.project + jobspec.attributes.queue_name = jobargs.queue + if jobspec.executable == "python": + jobspec.executable = str(jobargs.python_path) + jobspec.attributes.duration = jobargs.wallclock + if jobspec.resources["node_count"] == 1: + jobspec.resources["node_count"] = jobargs.nnodes + + # we enforced passing a python script in liberegister + callscript = [i for i in jobspec.arguments if str(i).endswith(".py")][0] + print(f"Calling script: {callscript}") + + if callscript not in os.listdir(jobargs.directory) and not os.path.isfile( + callscript + ): + print("... not found in Job working directory!") + exit = input("Check somewhere else? 
(Y/N): ") + if exit.upper() != "Y": + print("Exiting") + sys.exit() + + home = os.path.expanduser("~") + check_dirs = [] + for i in os.listdir(home): + if os.path.isdir(os.path.join(home, i)) and "." not in i: + check_dirs.append(i) + + print(home + ":") + for i in enumerate(check_dirs): + print(f" {i[0]+1}. /{i[1]}") + + inchoice = input("Specify a starting directory: ") + choice = home + "/" + check_dirs[int(inchoice)-1] + + def walkdir(folder): + """Walk through every file in a directory""" + for dirpath, dirs, files in os.walk(folder, topdown=True): + for filename in files: + yield os.path.abspath(os.path.join(dirpath, filename)) + + print("preparing... ctrl+c to abort.") + filescount = 0 + for _ in tqdm(walkdir(choice)): + filescount += 1 + + print("detecting... ctrl+c to abort.") + print(home + ":") + candidate_script_paths = [] + try: + for filepath in tqdm(walkdir(choice), total=filescount): + if callscript in filepath.split("/"): + candidate_script_paths.append(filepath) + tqdm.write( + f" {len(candidate_script_paths)}. {filepath.split(choice)[1]}" + ) + + exit = input("Specify a detected script: ") + new_callscript = candidate_script_paths[int(exit) - 1] + + except KeyboardInterrupt: + exit = input( + "detection interrupted. ctrl+c again to exit, or specify a detected script: " + ) + new_callscript = candidate_script_paths[int(exit) - 1] + + jobspec.arguments[jobspec.arguments.index(callscript)] = new_callscript + + else: + print("...found! 
Proceeding.") + + # Little bit strange I have to re-initialize this class to re-serialize + if not jobspec.resources[ + "node_count" + ]: # running with MPI - need corresponding executor + jobspec.resources = ResourceSpecV1( + process_count=jobspec.resources["process_count"], + processes_per_node=1, + cpu_cores_per_process=64 + ) + jobspec.launcher = choices[jobargs.scheduler] + else: + jobspec.resources = ResourceSpecV1(node_count=jobspec.resources["node_count"]) + + jex = JobExecutor.get_instance(jobargs.scheduler) + job = Job() + + if job.id.split("-")[0] in script: + reserialdest = script + else: + reserialdest = job.id.split("-")[0] + "." + script + + stdout_path = job.id.split("-")[0] + "." + script.replace("json", "out") + stderr_path = job.id.split("-")[0] + "." + script.replace("json", "err") + jobspec.stdout_path = stdout_path + jobspec.stderr_path = stderr_path + + Export().export(obj=jobspec, dest=reserialdest) + + job.spec = jobspec + + if not jobargs.dry: + print("Submitting Job!:", job) + jex.submit(job) + + if jobargs.wait: + print("Waiting on Job completion...") + job.wait() diff --git a/setup.py b/setup.py index c396af7d3d..94ac971693 100644 --- a/setup.py +++ b/setup.py @@ -99,6 +99,10 @@ def run_tests(self): "sphinx_rtd_theme", ], }, + scripts=[ + "scripts/liberegister", + "scripts/libesubmit", + ], classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers",