diff --git a/.flake8 b/.flake8 index 3b0ea1dab..9a77d11b5 100644 --- a/.flake8 +++ b/.flake8 @@ -22,6 +22,9 @@ per-file-ignores = libensemble/__init__.py:F401 libensemble/libensemble/__init__.py:F401 + # worker uses regex with chars that resemble escape sequences + libensemble/libE_worker.py:W605 + # Need to turn of matching probes (before other imports) on some # systems/versions of MPI: libensemble/tests/standalone_tests/mpi_launch_test/create_mpi_jobs.py:E402 diff --git a/.travis.yml b/.travis.yml index 404cb2820..18f0bd5a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,15 +36,30 @@ jobs: env: MPI=mpich PY=3.7 COMMS_TYPE=t # tcp language: generic python: 3.7 - -# Below will update the OSX octave used on Travis. This was so slow that we -# were hitting the 50 minute limit on Travis. We no longer update octave in -# homebrew, and can no longer run regression # tests using octave on OSX. -# addons: -# homebrew: -# packages: -# - octave -# update: true + - os: osx + osx_image: xcode11.3 + env: MPI=mpich PY=3.8 COMMS_TYPE=m # mpi + language: generic + python: 3.8 + - os: osx + osx_image: xcode11.3 + env: MPI=mpich PY=3.8 COMMS_TYPE=l # local + language: generic + python: 3.8 + - os: osx + osx_image: xcode11.3 + env: MPI=mpich PY=3.8 COMMS_TYPE=t # tcp + language: generic + python: 3.8 + fast_finish: true + allow_failures: + - python: 3.8 + - os: osx + env: MPI=mpich PY=3.8 COMMS_TYPE=m # mpi + - os: osx + env: MPI=mpich PY=3.8 COMMS_TYPE=l # mpi + - os: osx + env: MPI=mpich PY=3.8 COMMS_TYPE=t # mpi services: - postgresql @@ -66,7 +81,7 @@ before_install: - conda info -a # For debugging conda issues - conda config --add channels conda-forge - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then - conda create --yes --name condaenv python=3.7; + conda create --yes --name condaenv python=$PY; else conda create --yes --name condaenv python=$TRAVIS_PYTHON_VERSION; fi @@ -90,7 +105,7 @@ install: - if [[ "$TRAVIS_PYTHON_VERSION" == "3.8" ]]; then conda install nlopt mpi4py scipy 
mpich; export PETSC_CONFIGURE_OPTIONS='--with-batch'; - pip install petsc petsc4py; + conda install petsc4py; else conda install nlopt petsc4py petsc $MUMPS mpi4py scipy $MPI; fi @@ -98,6 +113,8 @@ install: # Begin: Dependencies only for regression tests - pip install DFO-LS - pip install deap + - conda install psutil + - pip install mpmath - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then pip install scikit-build packaging Tasmanian --user; fi @@ -125,7 +142,7 @@ before_script: # Run test (-z show output) script: - - ./libensemble/tests/run-tests.sh -z -$COMMS_TYPE + - ./libensemble/tests/run-tests.sh -A "-W error" -z -$COMMS_TYPE # Track code coverage after_success: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 620a97051..a02ca6342 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,54 @@ Release Notes Below are the notes from all libEnsemble releases. +Release 0.7.1 +------------- + +:Date: Oct 15, 2020 + +Dependencies: + +* ``psutil`` is now a required dependency. (#478 #491) + +API additions: + +* Executor updates: + + * Addition of a zero-resource worker option for persistent gens (does not allocate nodes to gen). (#500) + * Multiple applications can be registered to the Executor (and submitted) by name. (#498) + * Wait function added to Tasks. (#499) + +* Gen directories can now be created with options analogous to those for sim dirs. (#349 / #489) + +Other changes: + +* Improve comms efficiency (Repack fields when NumPy version 1.15+). (#511) +* Fix multiprocessing error on macOS/Python3.8 (Use 'fork' instead of 'spawn'). (#502 / #503) + +Updates to example functions: + +* Allow APOSMM to trigger ensemble exit when condition reached. (#507) +* Improvement in how persistent APOSMM shuts down subprocesses (preventing PETSc MPI-abort). (#478) + +Documentation: + +* APOSMM Tutorial added. (#468) +* Writing guide for user functions added to docs (e.g., creating sim_f, gen_f, alloc_f). (#510) +* Addition of posters and presentations section to docs (inc.
Jupyter notebooks/binder links). (#492 #497) + +:Note: + +* Tested platforms include Linux, MacOS, Theta (Cray XC40/Cobalt), Summit (IBM Power9/LSF), Bebop (Cray CS400/Slurm), and Bridges (HPE system at PSC). +* Cori (Cray XC40/Slurm) was not tested with release code due to system issues. +* Tested Python versions: (Cpython) 3.5, 3.6, 3.7, 3.8. + +:Known issues: + +* We currently recommend running in Central mode on Bridges as distributed runs are experiencing hangs. +* OpenMPI does not work with direct MPI job launches in mpi4py comms mode, since it does not support nested MPI launches + (Either use local mode or Balsam job controller). +* See known issues section in the documentation for more issues. + Release 0.7.0 ------------- @@ -20,34 +68,34 @@ Breaking API changes: API additions: -* Added gen_funcs.rc configuration framework with option to select APOSMM Optimizers for import (#444) -* Provide ``alloc_specs`` defaults via `alloc_funcs.defaults` module (#325) -* Added ``extra_args`` option to the Executor submit function to allow addition of arbitrary MPI runner options (#445) -* Added ``custom_info`` argument to MPI Executor to allow overriding of detected settings (#448) -* Added ``libE_specs`` option to disable log files (#368) +* Added gen_funcs.rc configuration framework with option to select APOSMM Optimizers for import. (#444) +* Provide ``alloc_specs`` defaults via `alloc_funcs.defaults` module. (#325) +* Added ``extra_args`` option to the Executor submit function to allow addition of arbitrary MPI runner options. (#445) +* Added ``custom_info`` argument to MPI Executor to allow overriding of detected settings. (#448) +* Added ``libE_specs`` option to disable log files. (#368) Other changes: * Added libEnsemble Conda package, hosted on conda-forge. -* Bugfix: Intermittent failures with repeated libE calls under `mpi4py` comms +* Bugfix: Intermittent failures with repeated libE calls under `mpi4py` comms.
Every libE call now uses its own duplicate of provided communicator and closes out. (#373/#387) * More accurate timing in `libE_stats.txt`. (#318) * Addition of new post-processing scripts. Updates to example functions: -* Persistent APOSMM is now the recommended APOSMM (`aposmm.py` renamed to `old_aposmm.py`) (#435) +* Persistent APOSMM is now the recommended APOSMM (`aposmm.py` renamed to `old_aposmm.py`). (#435) * New alloc/gen func: Finite difference parameters with noise estimation. (#350) * New example gen func: Tasmanian UQ generator. (#351) * New example gen func: Deap/NSGA2 generator. (#407) * New example gen func to interface with VTMOP. -* New example sim func: Borehole (#367) -* New example use-case: WarpX/APOSMM (#425) +* New example sim func: Borehole. (#367) +* New example use-case: WarpX/APOSMM. (#425) :Note: * Tested platforms include Linux, MacOS, Theta (Cray XC40/Cobalt), Summit (IBM Power9/LSF), Bebop (Cray CS400/Slurm), Cori (Cray XC40/Slurm), and Bridges (HPE system at PSC). -* Tested Python versions: (Cpython) 3.5, 3.6, 3.7, 3.8 +* Tested Python versions: (Cpython) 3.5, 3.6, 3.7, 3.8. :Known issues: @@ -61,21 +109,21 @@ Release 0.6.0 API changes: -* sim/gen/alloc_specs options that do not directly involve these routines are moved to libE_specs (see docs) (#266, #269) -* sim/gen/alloc_specs now require user-defined attributes to be added under the ``'user'`` field (see docs and examples) (#266, #269). -* Addition of a utils module to help users create calling scripts. Includes an argument parser and utility functions (#308). -* check_inputs() function is moved to the utils module (#308). -* The libE_specs option ``nprocesses`` has been changed to ``nworkers`` (#235) +* sim/gen/alloc_specs options that do not directly involve these routines are moved to libE_specs (see docs). (#266, #269) +* sim/gen/alloc_specs now require user-defined attributes to be added under the ``'user'`` field (see docs and examples). 
(#266, #269) +* Addition of a utils module to help users create calling scripts. Includes an argument parser and utility functions. (#308) +* check_inputs() function is moved to the utils module. (#308) +* The libE_specs option ``nprocesses`` has been changed to ``nworkers``. (#235) New example functions: -* Addition of a persistent APOSMM generator function (#217). +* Addition of a persistent APOSMM generator function. (#217) Other changes: -* Overhaul of documentation, including HPC platform guides and a new pdf structure (inc. #232, #282) -* Addition of OpenMP threading and GPU support to forces test (#250). -* Balsam job_controller now tested on Travis (#47) +* Overhaul of documentation, including HPC platform guides and a new pdf structure. (inc. #232, #282) +* Addition of OpenMP threading and GPU support to forces test. (#250) +* Balsam job_controller now tested on Travis. (#47) :Note: @@ -97,7 +145,7 @@ Release 0.5.2 * All output from libEnsemble goes via logger. MANAGER_WARNING level added. This level and above are echoed to stderr by default. API option to change echo level. * Simulation directories are created only during sim_f calls are suffixed by _worker. #146 * New user function libE.check_inputs() can be used to check valid configuration of inputs. Can be called in serial or under MPI (see libE API). #65 -* Installation option has been added to install dependencies used in tests ``pip install libensemble[extras]`` +* Installation option has been added to install dependencies used in tests ``pip install libensemble[extras]``. * A profiling option has been added to sim_specs. #170 * Results comparison scripts have been included for convenience. 
@@ -116,14 +164,14 @@ Release 0.5.1 :Date: July 11, 2019 -* Fixed LSF resource detection for large jobs on LSF systems (e.g., Summit) #184 -* Added support for macOS #182 -* Improved the documentation (including addition of beginner's tutorial and FAQ) +* Fixed LSF resource detection for large jobs on LSF systems (e.g., Summit). #184 +* Added support for macOS. #182 +* Improved the documentation (including addition of beginner's tutorial and FAQ). :Note: * Tested platforms include Local Linux, Theta (Cray XC40/Cobalt), Summit (IBM Power9/LSF), and Bebop (Cray CS400/Slurm). -* Tested Python versions: (Cpython) 3.4, 3.5, 3.6, 3.7 +* Tested Python versions: (Cpython) 3.4, 3.5, 3.6, 3.7. :Known issues: @@ -134,19 +182,19 @@ Release 0.5.0 :Date: May 22, 2019 -* Added local (multiprocessing) and TCP options for manager/worker communications, in addition to mpi4py (#42). +* Added local (multiprocessing) and TCP options for manager/worker communications, in addition to mpi4py. (#42) * Example: libEnsemble can be run on MOM/launch nodes (e.g., those of ALCF/Theta & OLCF/Summit) and can remotely detect compute resources. * Example: libEnsemble can be run on a system without MPI. * Example: libEnsemble can be run with a local manager and remote TCP workers. * Added support for Summit/LSF scheduler in job controller. -* MPI job controller detects and retries launches on failure; adding resilience (#143). -* Job controller supports option to extract/print job times in libE_stats.txt (#136). -* Default logging level changed to INFO (#164). -* Logging interface added, which allows user to change logging level and file (#110). +* MPI job controller detects and retries launches on failure; adding resilience. (#143) +* Job controller supports option to extract/print job times in libE_stats.txt. (#136) +* Default logging level changed to INFO. (#164) +* Logging interface added, which allows user to change logging level and file.
(#110) * All worker logging and calculation stats are routed through manager. -* libEnsemble can be run without a gen_func, for example, when using a previously computed random sample (#122). +* libEnsemble can be run without a gen_func, for example, when using a previously computed random sample. (#122) * Aborts dump persis_info with the history. :Note: @@ -167,22 +215,22 @@ Release 0.4.1 :Date: February 20, 2019 -* Logging no longer uses root logger (also added option to change libEnsemble log level) (#105) -* Added wait_on_run option for job controller launch to block until jobs have started (#111) -* persis_info can be passed to sim as well as gen functions (#112) -* Postprocessing scripts added to create performance/utilization graphs (#102) -* New scaling test added (not part of current CI test suite) (#114) +* Logging no longer uses root logger (also added option to change libEnsemble log level). (#105) +* Added wait_on_run option for job controller launch to block until jobs have started. (#111) +* persis_info can be passed to sim as well as gen functions. (#112) +* Postprocessing scripts added to create performance/utilization graphs. (#102) +* New scaling test added (not part of current CI test suite). (#114) Release 0.4.0 ------------- :Date: November 7, 2018 -* Separated job controller classes into different modules including a base class (API change) -* Added central_mode run option to distributed type (MPI) job_controllers (API addition) (#93) -* Made poll and kill job methods (API change) -* In job_controller, set_kill_mode is removed and replaced by a wait argument for a hard kill (API change) -* Removed register module - incorporated into job_controller (API change) +* Separated job controller classes into different modules including a base class (API change). +* Added central_mode run option to distributed type (MPI) job_controllers (API addition). (#93) +* Made poll and kill job methods (API change). 
+* In job_controller, set_kill_mode is removed and replaced by a wait argument for a hard kill (API change). +* Removed register module - incorporated into job_controller (API change). * APOSMM has improved asynchronicity when batch mode is false (with new example). (#96) * Manager errors (instead of hangs) when alloc_f or gen_f don't return work when all workers are idle. (#95) @@ -195,17 +243,17 @@ Release 0.3.0 :Date: September 7, 2018 -* Issues with killing jobs have been fixed (#21) -* Fixed job_controller manager_poll to work with multiple jobs (#62) +* Issues with killing jobs have been fixed. (#21) +* Fixed job_controller manager_poll to work with multiple jobs. (#62) * API change: persis_info now included as an argument to libE and is returned from libE instead of gen_info * Gen funcs: aposmm_logic module renamed to aposmm. * New example gen and allocation functions. -* Updated Balsam launch script (with new Balsam workflow) -* History is dumped to file on manager or worker exception and MPI aborted (with exit code 1) (#46) -* Default logging level changed to DEBUG and redirected to file ensemble.log -* Added directory of standalone tests (comms, job kills, and nested MPI launches) -* Improved and speeded up unit tests (#68) -* Considerable documentation enhancements +* Updated Balsam launch script (with new Balsam workflow). +* History is dumped to file on manager or worker exception and MPI aborted (with exit code 1). (#46) +* Default logging level changed to DEBUG and redirected to file ensemble.log. +* Added directory of standalone tests (comms, job kills, and nested MPI launches). +* Improved and speeded up unit tests. (#68) +* Considerable documentation enhancements. 
:Known issues: diff --git a/MANIFEST.in b/MANIFEST.in index a163b642a..762b9bc8f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,8 +4,10 @@ include *.rst include LICENSE include CONTRIBUTING include install/* +include .flake8 recursive-include libensemble * +recursive-include postproc_scripts * recursive-include examples * recursive-exclude * __pycache__ recursive-exclude * *.py[co] @@ -14,4 +16,4 @@ recursive-exclude * *.o recursive-exclude * *.npy recursive-exclude * *.gitignore -recursive-include docs *.pdf *.rst conf.py Makefile make.bat +recursive-include docs * diff --git a/README.rst b/README.rst index 00122a14c..e5f6f8ca3 100644 --- a/README.rst +++ b/README.rst @@ -64,6 +64,7 @@ Required dependencies: * Python_ 3.5 or above * NumPy_ +* psutil_ For libEnsemble running with the mpi4py parallelism: @@ -213,7 +214,7 @@ Resources David Bindel and John-Luke Navarro}, title = {{libEnsemble} Users Manual}, institution = {Argonne National Laboratory}, - number = {Revision 0.7.0}, + number = {Revision 0.7.1}, year = {2020}, url = {https://buildmedia.readthedocs.org/media/pdf/libensemble/latest/libensemble.pdf} } @@ -221,8 +222,8 @@ Resources .. after_resources_rst_tag .. _Balsam: https://www.alcf.anl.gov/support-center/theta/balsam -.. _Coveralls: https://coveralls.io/github/Libensemble/libensemble?branch=master .. _Conda: https://docs.conda.io/en/latest/ +.. _Coveralls: https://coveralls.io/github/Libensemble/libensemble?branch=master .. _DFO-LS: https://github.com/numericalalgorithmsgroup/dfols .. _GitHub: https://github.com/Libensemble/libensemble .. _libEnsemble mailing list: https://lists.mcs.anl.gov/mailman/listinfo/libensemble @@ -235,7 +236,8 @@ Resources .. _NumPy: http://www.numpy.org .. _petsc4py: https://bitbucket.org/petsc/petsc4py .. _PETSc: http://www.mcs.anl.gov/petsc -.. _poster: https://figshare.com/articles/LibEnsemble_PETSc_TAO-_Sustaining_a_library_for_dynamic_ensemble-based_computations/7765454 +.. 
_poster: https://figshare.com/articles/libEnsemble_A_Python_Library_for_Dynamic_Ensemble-Based_Computations/12559520 +.. _psutil: https://pypi.org/project/psutil/ .. _PyPI: https://pypi.org .. _pytest-cov: https://pypi.org/project/pytest-cov/ .. _pytest-timeout: https://pypi.org/project/pytest-timeout/ diff --git a/binder/environment.yml b/binder/environment.yml new file mode 100644 index 000000000..8e531460f --- /dev/null +++ b/binder/environment.yml @@ -0,0 +1,11 @@ +name: libensemble-python + +channels: + - conda-forge + +dependencies: + - numpy + - openmpi + - openmpi-mpicc + - mpi4py + - libensemble diff --git a/docs/FAQ.rst b/docs/FAQ.rst index d7e9f5bf8..88e5f0d62 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -25,15 +25,15 @@ This may also occur with two processes if you are using a persistent generator. The generator will occupy the one worker, leaving none to run simulation functions. **I keep getting: "Not enough processors per worker to honor arguments." when -using the executor. Can I submit tasks to allocated processors anyway?** +using the Executor. Can I submit tasks to allocated processors anyway?** Automatic partitioning of resources can be disabled if you want to oversubscribe -(often if testing on a local machine) by configuring the executor with +(often if testing on a local machine) by configuring the Executor with ``auto_resources=False``. For example:: exctr = MPIExecutor(auto_resources=False) -Note that the executor ``submit()`` method has a parameter ``hyperthreads`` +Note that the Executor ``submit()`` method has a parameter ``hyperthreads`` which will attempt to use all hyperthreads/SMT threads available if set to ``True``. **FileExistsError: [Errno 17] File exists: './ensemble'** @@ -127,7 +127,7 @@ to ``pdb``. How well this works varies by system. :: **Can I use the MPI Executor when running libEnsemble with multiprocessing?** -Yes. The executor type determines only how libEnsemble workers +Yes. 
The Executor type determines only how libEnsemble workers execute and interact with user applications and is independent of ``comms`` chosen for manager/worker communications. diff --git a/docs/conf.py b/docs/conf.py index 5bb555319..8a588cbc8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,7 +30,7 @@ class Mock(MagicMock): def __getattr__(cls, name): return MagicMock() -MOCK_MODULES = ['argparse', 'numpy', 'mpi4py' , 'dfols', 'scipy', 'numpy.lib', 'numpy.lib.recfunctions', 'math', 'petsc4py', 'PETSc', 'nlopt', 'scipy.spatial', 'scipy.spatial.distance', 'scipy.io', 'deap', 'Tasmanian', 'numpy.linalg'] +MOCK_MODULES = ['argparse', 'numpy', 'mpi4py' , 'dfols', 'scipy', 'numpy.lib', 'numpy.lib.recfunctions', 'math', 'petsc4py', 'PETSc', 'nlopt', 'scipy.spatial', 'scipy.spatial.distance', 'scipy.io', 'deap', 'Tasmanian', 'numpy.linalg', 'mpmath', 'psutil'] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) #from libensemble import * @@ -66,7 +66,8 @@ def __getattr__(cls, name): 'sphinx.ext.napoleon', # 'sphinx.ext.autosectionlabel', # 'sphinx.ext.intersphinx', - 'sphinx.ext.imgconverter'] + 'sphinx.ext.imgconverter', + 'sphinx.ext.mathjax'] # autosectionlabel_prefix_document = True # extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.imgconverter'] #breathe_projects = { "libEnsemble": "../code/src/xml/" } @@ -109,9 +110,9 @@ def __getattr__(cls, name): # built documents. # # The short X.Y version. -version = '0.7.0' +version = '0.7.1' # The full version, including alpha/beta/rc tags. -release = '0.7.0' +release = '0.7.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -123,7 +124,7 @@ def __getattr__(cls, name): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' diff --git a/docs/data_structures/calc_status.rst b/docs/data_structures/calc_status.rst index 32c6d77e2..859476cb6 100644 --- a/docs/data_structures/calc_status.rst +++ b/docs/data_structures/calc_status.rst @@ -14,7 +14,7 @@ status of the calculation, since it could include multiple application runs. It can be added as a third return variable in sim_f or gen_f functions. The calc_status codes are in the ``libensemble.message_numbers`` module. -Example of ``calc_status`` used along with :ref:`executor` in sim_f: +Example of ``calc_status`` used along with :ref:`Executor` in sim_f: .. code-block:: python :linenos: diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index e451ecfe9..b8180cf6f 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -25,17 +25,26 @@ Specifications for libEnsemble:: This will create a directory for each simulation, even if no sim_input_dir is specified. If False, all workers operate within the ensemble directory described below. Default: True + 'gen_dirs_make' [boolean] : + Whether to make generator-instance specific calculation directories for each gen call. + This will create a directory for each generator call, even if no gen_input_dir is specified. + If False, all workers operate within the ensemble directory. + Default: True 'ensemble_dir_path' [string] : - Path to main ensemble directory containing calculation (sim) directories. + Path to main ensemble directory containing calculation directories. Can serve as single working directory for all workers, or contain calculation directories. 
Default: './ensemble' 'use_worker_dirs' [boolean] : - Whether to organize calculation (sim) directories under worker-specific directories. + Whether to organize calculation directories under worker-specific directories. Default: False 'sim_dir_copy_files' [list] : List of paths to files or directories to copy into each sim dir, or ensemble dir. 'sim_dir_symlink_files' [list] : List of paths to files or directories to symlink into each sim dir. + 'gen_dir_copy_files' [list] : + List of paths to files or directories to copy into each gen dir, or ensemble dir. + 'gen_dir_symlink_files' [list] : + List of paths to files or directories to symlink into each gen dir. 'ensemble_copy_back' [boolean] : Whether to copy back directories within ensemble_dir_path back to launch location. Useful if ensemble_dir placed on node-local storage. @@ -43,6 +52,9 @@ Specifications for libEnsemble:: 'sim_input_dir' [string] : Copy this directory and it's contents for each simulation-specific directory. If not using calculation directories, contents are copied to the ensemble directory. + 'gen_input_dir' [string] : + Copy this directory and it's contents for each generator-instance specific directory. + If not using calculation directories, contents are copied to the ensemble directory. 'profile_worker' [boolean] : Profile using cProfile. 
Default: False 'disable_log_files' [boolean] : @@ -67,7 +79,7 @@ Specifications for libEnsemble:: libE_specs = {'comm': MPI.COMM_WORLD, 'comms': 'mpi', 'save_every_k_gens': 1000, - 'make_sim_dirs: True, + 'sim_dirs_make': True, 'ensemble_dir_path': '/scratch/ensemble' 'profile_worker': False} diff --git a/docs/dev_guide/release_management/release_platforms/rel_pypi.rst b/docs/dev_guide/release_management/release_platforms/rel_pypi.rst index 8c2d7ba54..879d3914f 100644 --- a/docs/dev_guide/release_management/release_platforms/rel_pypi.rst +++ b/docs/dev_guide/release_management/release_platforms/rel_pypi.rst @@ -7,8 +7,10 @@ libEnsemble is released on the Python Package Index (commonly known as PyPI). This enables users to ``pip install`` the package. The package is stored on PyPI in the form of a source distribution (commonly -known as a tarball). The tarball could be obtained from GitHub, although -historically this has been created with a checkout of libEnsemble from git. +known as a tarball). The tarball should be created as detailed below (which +creates the distribution package using the MANIFEST.in file in the git root +directory). Do not use the tarball on GitHub, which does not follow MANIFEST.in +and does not contain the required PKG-INFO file. You will need logon credentials for the libEnsemble PyPI. You will also need twine (which can be pip or Conda installed). diff --git a/docs/dev_guide/release_management/release_process.rst b/docs/dev_guide/release_management/release_process.rst index 38e850e3b..40cb1af3c 100644 --- a/docs/dev_guide/release_management/release_process.rst +++ b/docs/dev_guide/release_management/release_process.rst @@ -16,18 +16,23 @@ Before release - Release notes for this version are added to the documentation with release date, including a list of supported (tested) platforms.
-- Version number is updated wherever it appears +- Version number is updated wherever it appears (and ``+dev`` suffix is removed) (in ``setup.py``, ``libensemble/__init__.py``, ``README.rst`` and twice in ``docs/conf.py``) - Year in ``README.rst`` under *Citing libEnsemble* and in ``docs/conf.py`` is checked for correctness. - ``setup.py`` and ``libensemble/__init__.py`` are checked to ensure all information is up to date. +- ``MANIFEST.in`` is checked. Locally, try out ``python setup.py sdist`` and check that the created tarball + contains the correct files and directories for the PyPI package. + - Tests are run with source to be released (this may iterate): - On-line CI (currently Travis) tests must pass. - Scaling tests must be run on HPC platforms listed as supported in release notes. + Test variants by platform, launch mechanism, scale, and other factors can + be configured and exported by the libE-Templater_. - Coverage must not have decreased unless there is a justifiable reason. @@ -57,6 +62,8 @@ An administrator will take the following steps. - If the merge was made from a release branch (instead of develop), merge this branch into develop. +- Create a new commit on develop that appends ``+dev`` to the version number (wherever it appears). + After release ------------- @@ -64,3 +71,5 @@ After release on the kanban project board (inc. the release checklist). - Email libEnsemble mailing list, and notify the `everyone` channel in the libEnsemble Slack workspace. + +.. _libE-Templater: https://github.com/Libensemble/libE-templater diff --git a/docs/examples/alloc_funcs.rst b/docs/examples/alloc_funcs.rst index ec80d0b32..2ea4cca6d 100644 --- a/docs/examples/alloc_funcs.rst +++ b/docs/examples/alloc_funcs.rst @@ -8,6 +8,8 @@ Below are example allocation functions available in libEnsemble. .. note:: The default alloc_func is give_sim_work_first. +.. _gswf_label: + give_sim_work_first ------------------- ..
automodule:: give_sim_work_first diff --git a/docs/examples/aposmm.rst b/docs/examples/aposmm.rst index 094efbe22..e676cfa23 100644 --- a/docs/examples/aposmm.rst +++ b/docs/examples/aposmm.rst @@ -2,7 +2,7 @@ APOSMM ----------------- Asynchronously Parallel Optimization Solver for finding Multiple Minima -(APOSMM) coordinates concurrent local optimization runs in order to identifying +(APOSMM) coordinates concurrent local optimization runs in order to identify many local minima. Configuring APOSMM @@ -21,6 +21,10 @@ Where ``optimizers`` is a string (or list of strings) from the available options To see the optimization algorithms supported, see `LocalOptInterfacer`_. +.. seealso:: + + :doc:`Persistent APOSMM Tutorial<../tutorials/aposmm_tutorial>` + Persistent APOSMM ^^^^^^^^^^^^^^^^^ diff --git a/docs/examples/calling_scripts.rst b/docs/examples/calling_scripts.rst index 891c37642..d7d5a9f1c 100644 --- a/docs/examples/calling_scripts.rst +++ b/docs/examples/calling_scripts.rst @@ -27,7 +27,7 @@ Electrostatic Forces with Executor This example is from a test for evaluating the scaling capabilities of libEnsemble by calculating particle electrostatic forces through a user application. This -application is registered with either the MPI or Balsam executor, then submitted +application is registered with either the MPI or Balsam Executor, then submitted for execution in the ``sim_f``. Note the use of the ``parse_args()`` and ``save_libE_output()`` convenience functions from the :doc:`tools<../utilities>` module. diff --git a/docs/executor/executor.rst b/docs/executor/executor.rst index 4965cc246..ae606ae7c 100644 --- a/docs/executor/executor.rst +++ b/docs/executor/executor.rst @@ -6,7 +6,7 @@ Executor Module See this :doc:`example` for usage. -See the executor APIs for optional arguments. +See the Executor APIs for optional arguments. .. toctree:: :maxdepth: 1 @@ -20,11 +20,11 @@ See the executor APIs for optional arguments. 
Task Class ---------- -Tasks are created and returned through the executor ``submit()`` function. Tasks +Tasks are created and returned through the Executor ``submit()`` function. Tasks can be polled and killed with the respective poll and kill functions. Task information can be queried through the task attributes below and the query functions. Note that the task attributes are updated only when they are -polled/killed (or through other task or executor functions). +polled/killed (or through other task or Executor functions). .. autoclass:: Task :members: @@ -39,7 +39,7 @@ Task Attributes Following is a list of task status and configuration attributes that can be retrieved from a task. -.. note:: These should not be set directly. Tasks are launched by the executor, +.. note:: These should not be set directly. Tasks are launched by the Executor, and task information can be queried through the task attributes below and the query functions. @@ -65,4 +65,4 @@ Run configuration attributes - some will be autogenerated: :task.stdout: (string) Name of file where the standard output of the task is written (in task.workdir) :task.stderr: (string) Name of file where the standard error of the task is written (in task.workdir) -A list of executor and task functions can be found under the ``executor`` module. +A list of Executor and task functions can be found under the ``executor`` module. diff --git a/docs/executor/mpi_executor.rst b/docs/executor/mpi_executor.rst index 44d5e3d43..019de4ec3 100644 --- a/docs/executor/mpi_executor.rst +++ b/docs/executor/mpi_executor.rst @@ -17,7 +17,7 @@ Class-specific Attributes ------------------------- Class-specific attributes can be set directly to alter the behavior of the MPI -executor. However, they should be used with caution, because they may not +Executor. However, they should be used with caution, because they may not be implemented in other executors. 
:max_submit_attempts: (int) Maximum number of launch attempts for a given diff --git a/docs/executor/overview.rst b/docs/executor/overview.rst index 871ac3c36..a33c1b3f9 100644 --- a/docs/executor/overview.rst +++ b/docs/executor/overview.rst @@ -4,7 +4,7 @@ Executor Overview A typical libEnsemble workflow will include launching tasks from a :ref:`sim_f` (or :ref:`gen_f`) running on a worker. We use "task" to represent an application submission by libEnsemble to the system, -may be a supercomputer, cluster, or other compute resource. +which may be the compute nodes of a supercomputer, cluster, or other compute resource. The task could be launched via a subprocess call to ``mpirun`` or an alternative launcher such as ``aprun`` or ``jsrun``. The ``sim_f`` may then monitor this task, @@ -13,9 +13,9 @@ check output, and possibly kill the task. An **Executor** interface is provided by libEnsemble to remove the burden of system interaction from the user and ease the writing of portable user scripts that launch applications. The Executor provides the key functions: ``submit()``, -``poll()``, and ``kill()``. Task attributes can be queried to determine the status -following each of these commands. Functions are also provided to access and -interrogate files in the task's working directory. +``poll()``, ``wait()``, and ``kill()``. Task attributes can be queried to determine +the status following each of these commands. Functions are also provided to access +and interrogate files in the task's working directory. The main Executor class is an abstract class and is inherited by the MPIExecutor, for direct running of MPI applications. Another Executor is the BalsamMPIExecutor, @@ -45,11 +45,17 @@ In calling function:: exctr.register_calc(full_path=sim_app, calc_type='sim') +.. note:: + The *Executor* set up in the calling script is stored as a class attribute and + does **not** have to be passed to *libE*.
It is extracted via *Executor.executor* + in the sim function (regardless of type). + In user sim func:: import time + from libensemble.executors.executor import Executor - # Will return executor (whether MPI or inherited such as Balsam). + # Will return Executor (whether MPI or inherited such as Balsam). exctr = Executor.executor task = exctr.submit(calc_type='sim', num_procs=8, app_args='input.txt', @@ -83,7 +89,7 @@ In user sim func:: See the :doc:`executor` interface for API. For a more realistic example see -the :doc:`Electrostatic Forces example <../examples/calling_scripts>`, +the :doc:`Electrostatic Forces example <../tutorials/executor_forces_tutorial>`, which launches the ``forces.x`` application as an MPI task. .. note:: @@ -91,9 +97,26 @@ which launches the ``forces.x`` application as an MPI task. **"jobs"** within Balsam, including within Balsam's database and when describing the state of a completed submission. +Note that applications can also be registered to the Executor using a name. The +equivalent lines in the above example would be: + +Calling script:: + + exctr.register_calc(full_path='/path/to/my/exe', app_name='forces_app') + +User sim func:: + + task = exctr.submit(app_name='forces_app', num_procs=8, app_args='input.txt', + stdout='out.txt', stderr='err.txt') + +The ``app_name`` can be any identifier, while ``full_path`` is the application to +be run. This approach allows multiple applications to be registered. + The MPIExecutor autodetects system criteria such as the appropriate MPI launcher and mechanisms to poll and kill tasks. It will also partition resources amongst -workers, ensuring that runs utilise different resources (e.g. nodes). +workers, ensuring that runs utilise different resources (e.g. nodes). The +``zero_resource_workers`` list option specifies workers that will not need +resources (e.g. a persistent generator might run on worker 1). 
Furthermore, the MPIExecutor offers resilience via the feature of re-launching tasks that fail because of system factors. @@ -103,4 +126,8 @@ Balsam_. Currently, these Executors launch at the application level within an existing resource pool. However, submissions to a batch scheduler may be supported in future Executors. +See :doc:`Running on HPC Systems<../platforms/platforms_index>` for +diagrams showing how common Executor options such as ``central_mode`` affect the +run configuration on clusters and supercomputers. + .. _Balsam: https://balsam.readthedocs.io/en/latest/ diff --git a/docs/function_guides/allocator.rst b/docs/function_guides/allocator.rst new file mode 100644 index 000000000..ca29c1b88 --- /dev/null +++ b/docs/function_guides/allocator.rst @@ -0,0 +1,126 @@ +Allocation Functions +==================== + +Although the included allocation functions, or ``alloc_f``'s, are sufficient for +most users, those who want to fine-tune how data is passed to their ``gen_f`` +and ``sim_f`` can write their own. The ``alloc_f`` is unique since it is called +by libEnsemble's manager instead of a worker. + +Most ``alloc_f`` function definitions written by users resemble:: + + def my_allocator(W, H, sim_specs, gen_specs, alloc_specs, persis_info): + +Where :doc:`W<../data_structures/worker_array>` is an array containing information +about each worker's state, and ``H`` is the *trimmed* History array, +containing rows initialized by the generator. + +Inside an ``alloc_f``, a :doc:`Work dictionary<../data_structures/work_dict>` is +instantiated:: + + Work = {} + +then populated with integer keys ``i`` for each worker and dictionary values to +give to those workers. An example Work dictionary from a run of +the ``test_1d_sampling.py`` regression test resembles:: + + { + 1: { + 'H_fields': ['x'], + 'persis_info': {'rand_stream': RandomState(...) 
at ..., 'worker_num': 1}, + 'tag': 1, + 'libE_info': {'H_rows': array([368])} + }, + + 2: { + 'H_fields': ['x'], + 'persis_info': {'rand_stream': RandomState(...) at ..., 'worker_num': 2}, + 'tag': 1, + 'libE_info': {'H_rows': array([369])} + }, + + 3: { + 'H_fields': ['x'], + 'persis_info': {'rand_stream': RandomState(...) at ..., 'worker_num': 3}, + 'tag': 1, + 'libE_info': {'H_rows': array([370])} + }, + + 4: { + 'H_fields': ['x'], + 'persis_info': {'rand_stream': RandomState(...) at ..., 'worker_num': 4}, + 'tag': 1, + 'libE_info': {'H_rows': array([371])} + } + } + +Based on information from the API reference above, this Work dictionary +describes instructions for each of the four workers to call the ``sim_f`` +with data from the ``'x'`` field and a given ``'H_row'`` from the +History array, and also pass ``persis_info``. + +Constructing these arrays and determining which workers are available +for receiving data is simplified by several functions available within the +``libensemble.tools.alloc_support`` module: + +.. currentmodule:: libensemble.tools.alloc_support +.. autofunction:: avail_worker_ids + +Many ``alloc_f`` routines loop over the available workers returned by the above +function to construct their Work dictionaries with the help of the following two +functions. + +.. currentmodule:: libensemble.tools.alloc_support +.. autofunction:: sim_work + +.. currentmodule:: libensemble.tools.alloc_support +.. autofunction:: gen_work + +Note that these two functions *append* an entry in-place to the Work dictionary +and additional parameters are appended to ``libE_info``. + +In practice, the structure of many allocation functions resembles:: + + Work = {} + ... + for ID in avail_worker_ids(W): + ... + if some_condition: + sim_work(Work, ID, chosen_H_fields, chosen_H_rows, persis_info) + ... + + if another_condition: + gen_work(Work, ID, chosen_H_fields, chosen_H_rows, persis_info) + ... 
+ + return Work, persis_info + +The Work dictionary is returned to the manager with ``persis_info``. If ``1`` +is returned as a third value, this instructs the run to stop. + +.. note:: An error occurs when the ``alloc_f`` returns nothing while + all workers are idle. + +The final three functions available in the ``alloc_support`` module +are primarily for evaluating running generators: + +.. currentmodule:: libensemble.tools.alloc_support +.. autofunction:: test_any_gen + +.. currentmodule:: libensemble.tools.alloc_support +.. autofunction:: count_gens + +.. currentmodule:: libensemble.tools.alloc_support +.. autofunction:: count_persis_gens + +Descriptions of included allocation functions can be found :doc:`here<../examples/alloc_funcs>`. +The default allocation function used by libEnsemble if one isn't specified is +``give_sim_work_first``. During its worker ID loop, it checks if there's unallocated +work and assigns simulations for that work if so. Otherwise, it initializes +generators for up to ``'num_active_gens'`` instances. Other settings like +``batch_mode`` and blocking of non-active workers are also supported. See +:ref:`here` for more information about ``give_sim_work_first``. + +For a shorter, simpler example, here is the ``fast_alloc`` allocation function: + +.. literalinclude:: ../../libensemble/alloc_funcs/fast_alloc.py + :caption: /libensemble/alloc_funcs/fast_alloc.py diff --git a/docs/function_guides/function_guide_index.rst b/docs/function_guides/function_guide_index.rst new file mode 100644 index 000000000..31248910a --- /dev/null +++ b/docs/function_guides/function_guide_index.rst @@ -0,0 +1,20 @@ +====================== +Writing User Functions +====================== + +libEnsemble coordinates ensembles of calculations performed by three main +functions: a :ref:`Generator Function`, a :ref:`Simulator Function`, +and an :ref:`Allocation Function`, or ``gen_f``, ``sim_f``, and +``alloc_f`` respectively. These are all referred to as User Functions. 
Although +libEnsemble includes several ready-to-use User Functions like +:doc:`APOSMM<../examples/aposmm>`, it's expected many users will write their own or +adjust included functions for their own use-cases. +These guides describe common development patterns and optional components for +each kind of User Function. + +.. toctree:: + :maxdepth: 2 + + generator + simulator + allocator diff --git a/docs/function_guides/generator.rst b/docs/function_guides/generator.rst new file mode 100644 index 000000000..0ca4d69d3 --- /dev/null +++ b/docs/function_guides/generator.rst @@ -0,0 +1,112 @@ +Generator Functions +=================== + +As described in the :ref:`API`, the ``gen_f`` is called by a +libEnsemble worker via the following:: + + out = gen_f(H[gen_specs['in']][sim_ids_from_allocf], persis_info, gen_specs, libE_info) + +In practice, most ``gen_f`` function definitions written by users resemble:: + + def my_generator(H, persis_info, gen_specs, libE_info): + +Where :doc:`H<../data_structures/history_array>` is a selection of the +:doc:`History array<../history_output>`, determined by sim IDs from the +``alloc_f``, :doc:`persis_info<../data_structures/persis_info>` is a dictionary +containing state information, :doc:`gen_specs<../data_structures/gen_specs>` is a +dictionary containing pre-defined parameters for the ``gen_f``, and ``libE_info`` +is a dictionary containing libEnsemble-specific entries. See the API above for +more detailed descriptions of the parameters. + +.. note:: + + If the ``gen_f`` is a persistent generator, then ``gen_specs['in']`` will often be + empty if the ``alloc_f`` determines what fields to send to the generator. + +Typically users start by extracting their custom parameters initially defined +within ``gen_specs['user']`` in the calling script and defining a *local* History +array based on the datatype in ``gen_specs['out']``, to be returned. 
For example:: + + batch_size = gen_specs['user']['batch_size'] + local_H_out = np.zeros(batch_size, dtype=gen_specs['out']) + +This array should be populated by whatever values are generated within +the function. Finally, this array should be returned to libEnsemble +alongside ``persis_info``:: + + return local_H_out, persis_info + +.. note:: + + ``gen_f`` state information, such as checkpointing data, should be + appended to ``persis_info``. + +Persistent Generators +--------------------- + +While normal generators return after completing their calculation, persistent +generators receive Work units, perform computations, and communicate results +directly to the manager in a loop, not returning until explicitly instructed by +the manager. The calling worker becomes a dedicated :ref:`persistent worker`. +A ``gen_f`` is initiated as persistent by the ``alloc_f``, which also determines +which structures are sent to the ``gen_f``. In such cases, ``gen_specs['in']`` is often +empty. + +Many users prefer persistent generators since they do not need to be +re-initialized every time their past work is completed and evaluated by a +simulation, and can evaluate returned simulation results over the course of +an entire libEnsemble routine as a single function instance. + +Functions for a persistent generator to communicate directly with the manager +are available in the :ref:`libensemble.tools.gen_support` module. +Additional necessary resources are the status tags ``STOP_TAG``, ``PERSIS_STOP``, and +``FINISHED_PERSISTENT_GEN_TAG`` from ``libensemble.message_numbers``, with return +values from the ``gen_support`` functions compared to these tags to determine when +the generator should break its loop and return. + +Implementing the above functions is relatively simple: + +.. currentmodule:: libensemble.tools.gen_support +.. 
autofunction:: send_mgr_worker_msg + +This function call typically resembles:: + + send_mgr_worker_msg(libE_info['comm'], local_H_out[selected_IDs]) + +Note that ``send_mgr_worker_msg()`` has no return. + +.. currentmodule:: libensemble.tools.gen_support +.. autofunction:: get_mgr_worker_msg + +This function call typically resembles:: + + tag, Work, calc_in = get_mgr_worker_msg(libE_info['comm']) + + if tag in [STOP_TAG, PERSIS_STOP]: + cleanup() + break + +The logic following the function call is typically used to break the persistent +generator's main loop and return. + +.. currentmodule:: libensemble.tools.gen_support +.. autofunction:: sendrecv_mgr_worker_msg + +This function performs both of the previous functions in a single statement. Its +usage typically resembles:: + + tag, Work, calc_in = sendrecv_mgr_worker_msg(libE_info['comm'], local_H_out[selected_IDs]) + if tag in [STOP_TAG, PERSIS_STOP]: + cleanup() + break + +Once the persistent generator's loop has been broken because of +the tag from the manager, it should return with an additional tag:: + + return local_H_out, persis_info, FINISHED_PERSISTENT_GEN_TAG + +See :doc:`calc_status<../data_structures/calc_status>` for more information about +the message tags. + +Examples of normal and persistent generator functions +can be found :doc:`here<../examples/gen_funcs>`. 
diff --git a/docs/function_guides/simulator.rst b/docs/function_guides/simulator.rst new file mode 100644 index 000000000..4b602b61d --- /dev/null +++ b/docs/function_guides/simulator.rst @@ -0,0 +1,45 @@ +Simulator Functions +=================== + +As described in the :ref:`API`, the ``sim_f`` is called by a +libEnsemble worker via a similar interface to the ``gen_f``:: + + out = sim_f(H[sim_specs['in']][sim_ids_from_allocf], persis_info, sim_specs, libE_info) + +In practice, most ``sim_f`` function definitions written by users resemble:: + + def my_simulator(H, persis_info, sim_specs, libE_info): + +Where :doc:`sim_specs<../data_structures/sim_specs>` is a +dictionary containing pre-defined parameters for the ``sim_f``, and the other +parameters serve similar purposes to those in the ``gen_f``. + +The pattern of setting up a local ``H``, parsing out parameters from +``sim_specs``, performing calculations, and returning the local ``H`` +with ``persis_info`` should be familiar:: + + batch_size = sim_specs['user']['batch_size'] + local_H_out = np.zeros(batch_size, dtype=sim_specs['out']) + + ... # Perform simulation calculations + + return local_H_out, persis_info + +Simulator functions can also return a :doc:`calc_status<../data_structures/calc_status>` +integer attribute from the ``libensemble.message_numbers`` module to be logged. + +Descriptions of included simulator functions can be found :doc:`here<../examples/sim_funcs>`. + +The :doc:`Simple Sine tutorial<../tutorials/local_sine_tutorial>` is an +excellent introduction for writing simple user functions and using them +with libEnsemble. + +Executor +-------- + +libEnsemble's Executor is commonly used within simulator functions to launch +and monitor applications. An excellent overview is already available +:doc:`here<../executor/overview>`. + +See the :doc:`Executor with Electrostatic Forces tutorial<../tutorials/executor_forces_tutorial>` +for an additional example to try out. 
diff --git a/docs/history_output.rst b/docs/history_output.rst index 52e32e66a..766f83e18 100644 --- a/docs/history_output.rst +++ b/docs/history_output.rst @@ -61,34 +61,44 @@ Output Working Directory Structure libEnsemble features configurable output and working directory structuring for storing results at every step of a calculation, or directing workers to perform calculations on separate filesystems or in other directories. This is helpful -for users performing simulations who want to take advantage of high-speed -scratch spaces or disks, or organize their I/O by application run. - -With these features enabled, each time a worker initiates a simulation routine -it automatically enters a configurable directory, either a new directory specific -to that worker and simulation instance or a shared directory for all workers. -Where these directories are created or what files they contain is configurable -through settings in :ref:`libE_specs`. Defining any -compatible settings initiates this system with default settings for unspecified -options. Each setting will be described in detail here: +for users performing simulations or using high-resource generator functions who +want to take advantage of high-speed scratch spaces or disks, or organize their +I/O by application run. + +With these features enabled, each time a worker initiates a user function routine +(``gen_f`` or ``sim_f``) it automatically enters a configurable directory, +either a new directory specific to that worker and function instance or a shared +directory for all workers. Where these directories are created or what files +they contain is configurable through settings in :ref:`libE_specs`. +Defining any compatible settings initiates this system with default settings for +unspecified options. Each setting will be described in detail here: * ``'sim_dirs_make'``: Boolean. Enables per-simulation directories with default - settings. 
Directories are labeled in the form ``'sim0-worker1'`` and without - further configuration, placed in the ensemble directory ``./ensemble``, - relative to where libEnsemble was launched. Default: ``True``. If ``False``, - all workers will operate within the ensemble directory without producing - per-simulation directories. + settings. Directories are labeled in the form ``'sim0-worker1'``, by sim ID + and initiating worker. Without further configuration, directories are placed + in the ensemble directory ``./ensemble``, relative to where libEnsemble was + launched. Default: ``True`` with other sim_dir options enabled. If + ``False``, all workers will operate within the ensemble directory without + producing per-simulation directories. + +* ``'gen_dirs_make'``: Boolean. Enables per-generator instance directories with + default settings. Directories are labeled in the form ``'gen1-worker1'``, by + initiating worker and how many times that worker has initiated the generator. + These behave similarly to simulation directories. Default: ``True`` with + other gen_dir options enabled. * ``'ensemble_dir_path'``: This location, typically referred to as the ensemble - directory, is where each worker places its simulation directories. If not - specified, simulation directories are placed in ``./ensemble``, relative to - where libEnsemble was launched. If ``'sim_dirs_make'`` is ``False``, all workers - will run within this directory. On supported systems, writing to local-node - storage is possible and recommended for increased performance.:: + directory, is where each worker places its calculation directories. If not + specified, calculation directories are placed in ``./ensemble``, relative to + where libEnsemble was launched. If ``'sim_dirs_make'`` is ``False``, workers + initiating simulation instances will run within this directory. This behavior + is similar when ``'gen_dirs_make'`` is ``False``. 
On supported systems, + writing to local-node storage is possible and recommended for increased + performance.:: libE_specs['ensemble_dir_path'] = "/scratch/my_ensemble" -* ``'use_worker_dirs'``: Boolean. Sorts simulation directories into +* ``'use_worker_dirs'``: Boolean. Sorts calculation directories into per-worker directories at runtime. Particularly useful for organization when running with multiple workers on global scratch spaces or the same node, and may produce performance benefits. Default: ``False``. @@ -97,6 +107,7 @@ options. Each setting will be described in detail here: - /ensemble_dir - /sim0-worker1 + - /gen1-worker1 - /sim1-worker2 ... @@ -105,6 +116,7 @@ options. Each setting will be described in detail here: - /ensemble_dir - /worker1 - /sim0 + - /gen1 - /sim4 ... - /worker2 @@ -116,12 +128,19 @@ options. Each setting will be described in detail here: application, this may be helpful for copying over configuration files for each launch. +* ``'gen_dir_copy_files'``: A list of paths for files to copy into generator + directories. If ``'gen_dirs_make'`` is False, these files are copied to the + ensemble directory. + * ``'sim_dir_symlink_files'``: A list of paths for files to symlink into simulation directories. +* ``'gen_dir_symlink_files'``: A list of paths for files to symlink into + generator directories. + * ``'ensemble_copy_back'``: Boolean. Instructs the manager to create an empty - directory where libEnsemble was launched where workers copy back their simulation - directories on a run's conclusion or an exception. Especially useful when + directory where libEnsemble was launched where workers copy back their calculation + directories when a run concludes or an exception occurs. Especially useful when ``'ensemble_dir_path'`` has been set to some scratch space or another temporary location. Default: ``False``. @@ -130,6 +149,11 @@ options. Each setting will be described in detail here: of new simulation directories. 
If ``'sim_dirs_make'`` is False, this directory's contents are copied into the ensemble directory. +* ``'sim_input_dir'``: A path to a directory to copy for generator + directories. This directory and it's contents are copied to form the base + of new generator directories. If ``'gen_dirs_make'`` is False, this directory's + contents are copied into the ensemble directory. + See the regression tests ``test_sim_dirs_per_calc.py`` and ``test_use_worker_dirs.py`` for examples of many of these settings. See ``test_sim_input_dir_option.py`` for examples of using these settings diff --git a/docs/images/ANL_CMYK.png b/docs/images/ANL_CMYK.png new file mode 100644 index 000000000..a3142fdef Binary files /dev/null and b/docs/images/ANL_CMYK.png differ diff --git a/docs/images/ECP_logo.png b/docs/images/ECP_logo.png new file mode 100755 index 000000000..9d9462464 Binary files /dev/null and b/docs/images/ECP_logo.png differ diff --git a/docs/images/basic_6hc.png b/docs/images/basic_6hc.png new file mode 100644 index 000000000..2506a6133 Binary files /dev/null and b/docs/images/basic_6hc.png differ diff --git a/docs/images/central_balsam.png b/docs/images/central_balsam.png new file mode 100644 index 000000000..d18508297 Binary files /dev/null and b/docs/images/central_balsam.png differ diff --git a/docs/images/centralized_Balsam_ThS.png b/docs/images/centralized_Balsam_ThS.png deleted file mode 100644 index 8ab646c8a..000000000 Binary files a/docs/images/centralized_Balsam_ThS.png and /dev/null differ diff --git a/docs/images/centralized_Balsam_ThS.xml b/docs/images/centralized_Balsam_ThS.xml deleted file mode 100644 index c5256c2f8..000000000 --- a/docs/images/centralized_Balsam_ThS.xml +++ /dev/null @@ -1,136 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - diff --git a/docs/images/centralized_Bb.png b/docs/images/centralized_Bb.png deleted file mode 100644 index fcb0d1ff9..000000000 Binary files a/docs/images/centralized_Bb.png and /dev/null differ diff --git a/docs/images/centralized_Bb.xml b/docs/images/centralized_Bb.xml deleted file mode 100644 index b4dbc89e1..000000000 --- a/docs/images/centralized_Bb.xml +++ /dev/null @@ -1,112 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/images/centralized_MOM_ThS.png b/docs/images/centralized_MOM_ThS.png deleted file mode 100644 index a3489071e..000000000 Binary files a/docs/images/centralized_MOM_ThS.png and /dev/null differ diff --git a/docs/images/centralized_MOM_ThS.xml b/docs/images/centralized_MOM_ThS.xml deleted file mode 100644 index 297c1b0f8..000000000 --- a/docs/images/centralized_MOM_ThS.xml +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/images/centralized_new.png b/docs/images/centralized_new.png new file mode 100644 index 000000000..66dce35b2 Binary files /dev/null and b/docs/images/centralized_new.png differ diff --git a/docs/images/combined_ThS.png b/docs/images/combined_ThS.png deleted file mode 100644 index 651fc4d7e..000000000 Binary files a/docs/images/combined_ThS.png and /dev/null differ diff --git a/docs/images/combined_ThS.xml b/docs/images/combined_ThS.xml deleted file mode 100644 index edf1aec9a..000000000 --- a/docs/images/combined_ThS.xml +++ /dev/null @@ -1,240 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/images/diagram_xml/centralized_new.xml b/docs/images/diagram_xml/centralized_new.xml new file mode 100644 index 000000000..e117931c0 --- /dev/null +++ b/docs/images/diagram_xml/centralized_new.xml @@ -0,0 +1,2 @@ +  \ No newline at end of file diff --git a/docs/images/diagram_xml/distributed_new.xml b/docs/images/diagram_xml/distributed_new.xml new file mode 100644 index 000000000..39509b0eb --- /dev/null +++ b/docs/images/diagram_xml/distributed_new.xml @@ -0,0 +1,2 @@ +  \ No newline at end of file diff --git a/docs/images/diagram_xml/persis_add_worker.xml b/docs/images/diagram_xml/persis_add_worker.xml new file mode 100644 index 000000000..d056d38d0 --- /dev/null +++ b/docs/images/diagram_xml/persis_add_worker.xml @@ -0,0 +1,2 @@ +  \ No newline at end of file diff --git a/docs/images/diagram_xml/persis_wasted_node.xml b/docs/images/diagram_xml/persis_wasted_node.xml new file mode 100644 index 000000000..74a9c154c --- /dev/null +++ b/docs/images/diagram_xml/persis_wasted_node.xml @@ -0,0 +1,2 @@ +  \ No newline at end of file diff --git a/docs/images/distributed_Bb.png b/docs/images/distributed_Bb.png deleted file mode 100644 index 1674eb671..000000000 Binary files a/docs/images/distributed_Bb.png and /dev/null differ diff --git a/docs/images/distributed_Bb.xml b/docs/images/distributed_Bb.xml deleted file mode 100644 index f7a074344..000000000 --- a/docs/images/distributed_Bb.xml +++ /dev/null @@ -1,2 +0,0 @@ - 
-7VpLd6IwFP41Lp3DwwAuW9tOF23PnOOi7WpOhAiZAmFC8DG/fhIJQng4fWhxrLqQ3JubhPt9l3yAA3MSrb5TmAT3xEPhwNC81cC8GhiGbpsW/xGWdW4Bxig3+BR7slNpmOI/SBo1ac2wh1KlIyMkZDhRjS6JY+QyxQYpJUu125yE6qwJ9FHDMHVh2LQ+Yo8FudUBWmm/RdgPipl1TXoiWHSWhjSAHllWTOb1wJxQQlh+FK0mKBTJK/KSx910eLcLoyhmrwlIHpKltXIe7sHN08/HbHEb3JGhDfJhFjDM5BkPDCvkA14mYs1sLRNh/c7EQi9DHKNhIM/4gnfRbT4tKDvwI1/+boaZ0bqFLzEpbYYyjUFJFntILFnn7mWAGZom0BXeJWcYtwUsCqWbEgYZJjFvarw5x2E4ISGhm6HMORBfbk8ZJS+o4rE2HxFBYlax5x8RUWAlZvFDmKZyinyoggnG9gwWiDK06kRG3+LNCwWRCDG65l1kwNB0JEdkkRTNZck4vaBRUGFb0Q9KkvvboUse8ANJhbfQwuqkhUiZKKQckxL3u8zFHuSeCYlTwuFsMOKBXxy439jJjHz4DnLwFDOVASq0MYlRjQfSBEPsC5q4HBLE7ZcCMMzL/EI6Iux5YppWypWk1PaF+AgoiI+biIMWwI2DAW53Aj4r4Hgk9IXn7l8IzvZT2/ViniPLdduK2bPHM02r1alLIuwWNVst5gMVsK0W8LZaK3haLXiah8KzuDJU8ExxtFdEPIiceSsiluug2XxfqdVqqdVBI7XmZ5aKo59sas2+U9u97ZzVyKepEaAfmRpxujenD6sR/axGjOHIPi414jhvUCO7EfyKagTUr+u9q5FxA08fxftFxHFROyIzB4zA3irFPDI1Mj4ZoddIbd9qZGye1Uj/asQaqbQAr1Qjun4wXowavJBiQmtAdjoawdJUjWD1rBHGzXuF/3RPAdax7SlNvX0qqe17T9E1s5FH5PloKpuEsoD4JIbhdWmtVXPZ546QROb3F2JsLV+fwIwRNftohdmTCP9mANl8rriuVnLoTWNdNGJ+xtUo0X6uOsu4TasI3O4Qucu7EO9hyisbt9xgkbaWXQO8S2vnKRV53E0UnnaSUXmr2MF9eevBIPUR29URtFOPopDvuwt1JQfg0ahfHr2LRgqL9DOLijvd/lgE+mWRfabRXmiUv775AI02oTw1cF3pkBAcs7Qy8g9hqG6vjqoJdaf2CroeYNbEfC2AH+RrKCm9PZkPyJlXPE26hzH0xeOkL/3gyKrdgjtNtfSpz43G3f9OaD4H1M7gqbV1uKd+vFn+byUv0/LfP+b1Xw== \ No newline at end of file diff --git a/docs/images/distributed_Bb_sepM.png b/docs/images/distributed_Bb_sepM.png deleted file mode 100644 index 086c8b1ae..000000000 Binary files a/docs/images/distributed_Bb_sepM.png and /dev/null differ diff --git a/docs/images/distributed_Bb_sepM.xml b/docs/images/distributed_Bb_sepM.xml deleted file mode 100644 index 57c73a0b5..000000000 --- a/docs/images/distributed_Bb_sepM.xml +++ /dev/null @@ -1,95 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git 
a/docs/images/distributed_new.png b/docs/images/distributed_new.png new file mode 100644 index 000000000..367bf1539 Binary files /dev/null and b/docs/images/distributed_new.png differ diff --git a/docs/images/libE_logo_only.png b/docs/images/libE_logo_only.png deleted file mode 100644 index 174935a1d..000000000 Binary files a/docs/images/libE_logo_only.png and /dev/null differ diff --git a/docs/images/libE_logo_smaller.png b/docs/images/libE_logo_smaller.png new file mode 100644 index 000000000..7a5df0460 Binary files /dev/null and b/docs/images/libE_logo_smaller.png differ diff --git a/docs/images/libe_opal_complete_v_killed_511w_2044sims_1030nodes.png b/docs/images/libe_opal_complete_v_killed_511w_2044sims_1030nodes.png new file mode 100644 index 000000000..9d89b6ddf Binary files /dev/null and b/docs/images/libe_opal_complete_v_killed_511w_2044sims_1030nodes.png differ diff --git a/docs/images/libe_opal_util_v_time_511w_2044sims_1030nodes.png b/docs/images/libe_opal_util_v_time_511w_2044sims_1030nodes.png new file mode 100644 index 000000000..4847395c9 Binary files /dev/null and b/docs/images/libe_opal_util_v_time_511w_2044sims_1030nodes.png differ diff --git a/docs/images/localopt_6hc.png b/docs/images/localopt_6hc.png new file mode 100644 index 000000000..fd8e318e3 Binary files /dev/null and b/docs/images/localopt_6hc.png differ diff --git a/docs/images/logo_manager_worker.png b/docs/images/logo_manager_worker.png new file mode 100644 index 000000000..823ea9353 Binary files /dev/null and b/docs/images/logo_manager_worker.png differ diff --git a/docs/images/persis_add_worker.png b/docs/images/persis_add_worker.png new file mode 100644 index 000000000..a689da4ac Binary files /dev/null and b/docs/images/persis_add_worker.png differ diff --git a/docs/images/persis_wasted_node.png b/docs/images/persis_wasted_node.png new file mode 100644 index 000000000..fecec3876 Binary files /dev/null and b/docs/images/persis_wasted_node.png differ diff --git 
a/docs/images/sampling_6hc.png b/docs/images/sampling_6hc.png new file mode 100644 index 000000000..d6adbcaed Binary files /dev/null and b/docs/images/sampling_6hc.png differ diff --git a/docs/images/using_new.png b/docs/images/using_new.png new file mode 100644 index 000000000..05807aa0d Binary files /dev/null and b/docs/images/using_new.png differ diff --git a/docs/images/white.png b/docs/images/white.png new file mode 100644 index 000000000..d04873f1a Binary files /dev/null and b/docs/images/white.png differ diff --git a/docs/index.rst b/docs/index.rst index 28e1dd7d7..50e63bcaf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ tutorials/local_sine_tutorial tutorials/executor_forces_tutorial + tutorials/aposmm_tutorial examples/examples_index .. toctree:: @@ -32,6 +33,7 @@ known_issues release_notes contributing + posters .. toctree:: :maxdepth: 2 diff --git a/docs/introduction_latex.rst b/docs/introduction_latex.rst index 05036c610..00f132ddd 100644 --- a/docs/introduction_latex.rst +++ b/docs/introduction_latex.rst @@ -37,7 +37,8 @@ We now present further information on running and testing libEnsemble. .. _NumPy: http://www.numpy.org .. _petsc4py: https://bitbucket.org/petsc/petsc4py .. _PETSc: http://www.mcs.anl.gov/petsc -.. _poster: https://figshare.com/articles/LibEnsemble_PETSc_TAO-_Sustaining_a_library_for_dynamic_ensemble-based_computations/7765454 +.. _poster: https://figshare.com/articles/libEnsemble_A_Python_Library_for_Dynamic_Ensemble-Based_Computations/12559520 +.. _psutil: https://pypi.org/project/psutil/ .. _PyPI: https://pypi.org .. _pytest-cov: https://pypi.org/project/pytest-cov/ .. 
_pytest-timeout: https://pypi.org/project/pytest-timeout/ diff --git a/docs/known_issues.rst b/docs/known_issues.rst index f63422caa..f321fde99 100644 --- a/docs/known_issues.rst +++ b/docs/known_issues.rst @@ -4,9 +4,9 @@ Known Issues The following selection describes known bugs, errors, or other difficulties that may occur when using libEnsemble. -* When using the executor: OpenMPI does not work with direct MPI task +* When using the Executor: OpenMPI does not work with direct MPI task submissions in mpi4py comms mode, since OpenMPI does not support nested MPI - executions. Use either local mode or the Balsam executor instead. + executions. Use either local mode or the Balsam Executor instead. * Local comms mode (multiprocessing) may fail if MPI is initialized before forking processors. This is thought to be responsible for issues combining multiprocessing with PETSc on some platforms. @@ -14,7 +14,7 @@ may occur when using libEnsemble. number of physical cores as SMT info not available. * TCP mode does not support (1) more than one libEnsemble call in a given script or - (2) the auto-resources option to the executor. + (2) the auto-resources option to the Executor. * libEnsemble may hang on systems with matching probes not enabled on the native fabric, like on Intel's Truescale (TMI) fabric for instance. See the :doc:`FAQ` for more information. diff --git a/docs/overview_usecases.rst b/docs/overview_usecases.rst index aed057b9c..4306fbca4 100644 --- a/docs/overview_usecases.rst +++ b/docs/overview_usecases.rst @@ -87,13 +87,17 @@ it's capabilities. generation and simulation routines, submit additional tasks for execution, and return results to the manager. +* **Calling Script**: libEnsemble is typically imported, parameterized, and + initiated in a single Python file referred to as a *calling script*. ``sim_f`` + and ``gen_f`` functions are also commonly configured and parameterized here. 
+ * **Submit**: Enqueue or indicate that one or more jobs or tasks needs to be - launched. When using the libEnsemble executor, a *submitted* task is executed + launched. When using the libEnsemble Executor, a *submitted* task is executed immediately or queued for execution. * **Tasks**: Sub-processes or independent units of work. Workers perform *tasks* as directed by the manager; tasks may include submitting external - programs for execution using the executor. + programs for execution using the Executor. * **Persistent**: Typically, a worker communicates with the manager before and after initiating a user ``gen_f`` or ``sim_f`` calculation. However, user diff --git a/docs/platforms/bebop.rst b/docs/platforms/bebop.rst index 707d0702b..eed570ed5 100644 --- a/docs/platforms/bebop.rst +++ b/docs/platforms/bebop.rst @@ -73,9 +73,9 @@ With your nodes allocated, queue your job to start with four MPI ranks:: ``mpirun`` should also work. This line launches libEnsemble with a manager and **three** workers to one allocated compute node, with three nodes available for -the workers to launch calculations with the executor or a launch command. +the workers to launch calculations with the Executor or a launch command. This is an example of running in :doc:`centralized` mode, and, -if using the :doc:`executor<../executor/mpi_executor>`, it should +if using the :doc:`Executor<../executor/mpi_executor>`, it should be initiated with ``central_mode=True`` .. note:: diff --git a/docs/platforms/cori.rst b/docs/platforms/cori.rst index 336ec5c1b..d4c06609f 100644 --- a/docs/platforms/cori.rst +++ b/docs/platforms/cori.rst @@ -120,10 +120,10 @@ With your nodes allocated, queue your job to start with four MPI ranks:: This line launches libEnsemble with a manager and **three** workers to one allocated compute node, with three nodes available for the workers to launch -user applications (via the executor or a direct run command such as ``mpiexec``). 
+user applications (via the Executor or a direct run command such as ``mpiexec``). This is an example of running in :doc:`centralized` mode; -if using the :doc:`executor<../executor/ex_index>`, it should +if using the :doc:`Executor<../executor/ex_index>`, it should be initiated with ``central_mode=True``. libEnsemble must be run in central mode on Cori because jobs cannot share nodes. diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index b6ceafd3b..b16c82d47 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -13,18 +13,18 @@ The first mode we refer to as **central** mode, where the libEnsemble manager an are grouped on to one or more dedicated nodes. Workers' launch applications on to the remaining allocated nodes: -.. image:: ../images/centralized_Bb.png +.. image:: ../images/centralized_new.png :alt: centralized - :scale: 40 + :scale: 30 :align: center Alternatively, in **distributed** mode, libEnsemble is launched with the processes spread across nodes. The worker processes will share nodes with the applications they launch. There may be multiple nodes per worker, or multiple workers per node: -.. image:: ../images/distributed_Bb.png +.. image:: ../images/distributed_new.png :alt: distributed - :scale: 40 + :scale: 30 :align: center The distributed approach allows the libEnsemble worker to read files produced by the @@ -90,14 +90,14 @@ gen scripts are not doing too much work (other than launching applications). Thi is inherently centralized. The entire node allocation is available for the worker-launched tasks. -To run libEnsemble on the compute nodes of these systems requires an altervative Executor, +To run libEnsemble on the compute nodes of these systems requires an alternative Executor, such as :doc:`Balsam<../executor/balsam_executor>`, which runs on the -launch nodes and launches tasks submitted by workers. 
Running on compute nodes is potentially -more scalable and will better manage sim and gen functions that contain considerable -computational work or I/O. +launch nodes and launches tasks submitted by workers. Running libEnsemble on the compute +nodes is potentially more scalable and will better manage ``sim_f`` and ``gen_f`` functions +that contain considerable computational work or I/O. - .. image:: ../images/combined_ThS.png - :alt: central_MOM + .. image:: ../images/central_balsam.png + :alt: central_balsam :scale: 40 :align: center @@ -128,12 +128,42 @@ as follows:: Resource detection can be disabled by initializing the Executor with the argument ``auto_resources=False``, and users' can simply supply run -configuration on the executor submit line. This will usually work sufficiently on systems that +configuration on the Executor submit line. This will usually work sufficiently on systems that have application level scheduling (e.g: ``aprun``, ``jsrun``) as these will slot each run into available nodes where possible. ``jsrun`` can also queue runs. However, on other cluster and multi-node systems, if auto-resources is disabled, then runs without a hostlist or machinefile supplied may be undesirably scheduled to the same nodes. +Zero-resource workers +~~~~~~~~~~~~~~~~~~~~~ + +Users with persistent ``gen_f`` functions may notice that the persistent workers +are still automatically assigned system resources. This can be wasteful since those +workers only run ``gen_f`` routines in-place and don't use the Executor to submit +applications to allocated nodes: + +.. image:: ../images/persis_wasted_node.png + :alt: persis_wasted_node + :scale: 40 + :align: center + +This can be resolved within the Executor definition in the calling script. Set the +parameter ``zero_resource_workers`` to a list of worker IDs that shouldn't have +system resources assigned. 
For example, when using a single instance of Persistent +:doc:`APOSMM<../examples/aposmm>` as your ``gen_f``, the Executor definition +may resemble:: + + exctr = MPIExecutor(central_mode=True, zero_resource_workers=[1]) + +Worker 1 will now not be allocated resources. Note that additional worker +processes can be added to take advantage of the free resources (if using the +same resource set) for simulation instances: + +.. image:: ../images/persis_add_worker.png + :alt: persis_add_worker + :scale: 40 + :align: center + Overriding Auto-detection ------------------------- diff --git a/docs/platforms/summit.rst b/docs/platforms/summit.rst index 438083c27..85c348aa2 100644 --- a/docs/platforms/summit.rst +++ b/docs/platforms/summit.rst @@ -58,6 +58,60 @@ Or, you can install via ``conda``: See :doc:`here<../advanced_installation>` for more information on advanced options for installing libEnsemble. +Special note on resource sets and Executor submit options +--------------------------------------------------------- + +When using the portable MPI run configuration options (e.g., num_nodes) to the +:doc:`MPIExecutor<../executor/mpi_executor>` ``submit`` function, it is important +to note that, due to the `resource sets`_ used on Summit, the options refer to +resource sets as follows: + +- num_procs (int, optional) – The total number of resource sets for this run. + +- num_nodes (int, optional) – The number of nodes on which to submit the run. + +- ranks_per_node (int, optional) – The number of resource sets per node. + +It is recommended that the user defines a resource set as the minimal configuration +of CPU cores/processes and GPUs. These can be added to the ``extra_args`` option +of the *submit* function. Alternatively, the portable options can be ignored and +everything expressed in ``extra_args``.
+ +For example, the following *jsrun* line would run three resource sets, +each having one core (with one process), and one GPU, along with some extra options:: + + jsrun -n 3 -a 1 -g 1 -c 1 --bind=packed:1 --smpiargs="-gpu" + +Expressing this line in the ``submit`` function may look +something like the following:: + + exctr = Executor.executor + task = exctr.submit(app_name='mycode', + num_procs=3, + extra_args='-a 1 -g 1 -c 1 --bind=packed:1 --smpiargs="-gpu"', + app_args="-i input") + +This would be equivalent to:: + + exctr = Executor.executor + task = exctr.submit(app_name='mycode', + extra_args='-n 3 -a 1 -g 1 -c 1 --bind=packed:1 --smpiargs="-gpu"', + app_args="-i input") + +The auto-resources in the Executor works out the resources available to each worker, +but unlike some other systems, ``jsrun`` on Summit dynamically schedules runs to +available slots across and within nodes. It can also queue tasks. This allows variable +size runs to easily be handled on Summit. If these runs over-use the auto-resource +allocations, auto_resources can be turned off in the Executor setup. E.g: In the +calling script:: + + from libensemble.executors.mpi_executor import MPIExecutor + exctr = MPIExecutor(central_mode=True, auto_resources=False) + +In the above example, the task being submitted used three GPUs, which is half those +available on a Summit node, and thus two such tasks may be allocated to each node +(from different workers), if they were running at the same time. + Job Submission -------------- @@ -153,3 +207,4 @@ See the OLCF guides_ for more information about Summit. .. _guides: https://www.olcf.ornl.gov/for-users/system-user-guides/summit/ .. _conda: https://conda.io/en/latest/ .. _mpi4py: https://mpi4py.readthedocs.io/en/stable/ +..
_resource sets: https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun \ No newline at end of file diff --git a/docs/platforms/theta.rst b/docs/platforms/theta.rst index 0d14ff08e..b5bd81ddd 100644 --- a/docs/platforms/theta.rst +++ b/docs/platforms/theta.rst @@ -9,7 +9,7 @@ Theta features three tiers of nodes: login, MOM, and compute nodes. Users on login nodes submit batch jobs to the MOM nodes. MOM nodes execute user batch scripts to run on the compute nodes via ``aprun``. -Theta does not allow more than one MPI application per compute node. +Theta will not schedule more than one MPI application per compute node. Configuring Python ------------------ @@ -18,10 +18,11 @@ Begin by loading the Python 3 Miniconda_ module:: $ module load miniconda-3/latest -Create a conda_ virtual environment, cloning the base environment. This -environment will contain mpi4py_ and many other packages you may find useful:: +Create a conda_ virtual environment. We recommend cloning the base +environment. This environment will contain mpi4py_ and many other packages that +are configured correctly for Theta:: - $ conda create --name my_env --clone $MINICONDA_INSTALL_PATH + $ conda create --name my_env --clone $CONDA_PREFIX .. note:: The "executing transaction" step of creating your new environment may take a while! @@ -32,8 +33,20 @@ instructions to configure your shell with conda. Activate your virtual environment with :: + $ export PYTHONNOUSERSITE=1 $ conda activate my_env +Alternative +^^^^^^^^^^^ + +If you do not wish to clone the miniconda environment and instead create your own, and +you are using ``mpi4py`` make sure the install picks up Cray's compiler drivers. E.g:: + + $ conda create --name my_env python=3.7 + $ export PYTHONNOUSERSITE=1 + $ conda activate my_env + $ CC=cc MPICC=cc pip install mpi4py --no-binary mpi4py + More information_ on using conda on Theta is also available. 
Installing libEnsemble and Balsam @@ -53,7 +66,7 @@ Your prompt should be similar to the following line: .. note:: If you encounter pip errors, run ``python -m pip install --upgrade pip`` first. -Or, you can install via ``conda``: +Or, you can install via ``conda`` (which comes with some common dependencies): .. code-block:: console @@ -66,68 +79,66 @@ for installing libEnsemble. Balsam (Optional) ^^^^^^^^^^^^^^^^^ -Balsam_ is an ALCF Python utility for coordinating and executing workflows of -computations on systems such as Theta. Balsam can stage in tasks to a database hosted -on a MOM node and submit these tasks dynamically to the compute nodes. libEnsemble -can also be submitted to Balsam for centralized execution on a compute-node. -libEnsemble can then submit tasks to Balsam through libEnsemble's Balsam -executor for execution on additional allocated nodes. +Balsam_ allows libEnsemble to be run on compute nodes, and still submit tasks +from workers (see Job Submission below). The Balsam Executor will stage in tasks +to a database hosted on a MOM node, which will submit these tasks dynamically to +the compute nodes. -Load the Balsam module with :: +Balsam can be installed with:: - $ module load balsam/0.3.5.1 + pip install balsam-flow -Initialize a new database similarly to the following (from the Balsam docs): +Initialize a Balsam database at a location of your choice. E.g:: -.. code-block:: bash + balsam init ~/myWorkflow - $ balsam init ~/libe-workflow - $ source balsamactivate libe-workflow - $ balsam app --name libe-app --executable "calling.py" - $ balsam job --name libe-job --workflow test --application libe-app --args "hello!" - $ balsam submit-launch -A [project] -q default -t 5 -n 1 --job-mode=mpi - $ watch balsam ls # follow status in realtime from command-line +Further notes on using Balsam: + +* Call ``balsamactivate`` in the batch script (see below). 
Make sure no active postgres databases are running on either login or MOM nodes before calling ``qsub``. You can check with the script ps_nodes_. + +* Balsam requires PostgreSQL version 9.6.4 or later, but problems may be encountered when using the default ``pg_ctl`` and PostgreSQL 10.12 installation in ``/usr/bin``. This may be resolved by loading the postgresql/9.6.12 modules within submission scripts that use Balsam. + +* By default there are a maximum of 128 concurrent database connections. Each worker will use a connection and a few extra are needed. Increase the number of connections by appending a new ``max_connections=`` line to ``balsamdb/postgresql.conf`` in the database directory. E.g., ``max_connections=1024`` + +* There is a Balsam module available (balsam/0.3.8), but the module's Python installation supersedes others when loaded. In practice, libEnsemble or other Python packages installed into another environment become inaccessible. Installing Balsam into a separate Python virtual environment is recommended instead. Read Balsam's documentation here_. .. note:: - Balsam will create the run directories inside the data subdirectory within the database - directory. From here, files can be staged out to the user directory (see the example - batch script below). + Balsam creates run-specific directories inside ``data/my_workflow`` in the database + directory. For example: ``$HOME/my_balsam_db/data/libe_workflow/job_run_libe_forces_b7073fa9/``. + From here, files can be staged out (see the example batch script below). Job Submission -------------- -Theta uses Cobalt_ for job management and submission. For libEnsemble, the most -important command is ``qsub``, for submitting batch scripts from the login nodes -to execute on the MOM nodes. - On Theta, libEnsemble can be launched to two locations: 1. **A MOM Node**: All of libEnsemble's manager and worker processes - run on a front-end MOM node.
libEnsemble's MPI executor takes + run centrally on a front-end MOM node. libEnsemble's MPI Executor takes responsibility for direct user-application submission to allocated compute nodes. libEnsemble must be configured to run with *multiprocessing* communications, since mpi4py isn't configured for use on the MOM nodes. 2. **The Compute Nodes**: libEnsemble is submitted to Balsam, and all manager - and worker processes are tasked to a back-end compute node. libEnsemble's - Balsam executor interfaces with Balsam running on a MOM node for dynamic + and worker processes are tasked to a back-end compute node and run centrally. libEnsemble's + Balsam Executor interfaces with Balsam running on a MOM node for dynamic user-application submission to the compute nodes. - .. image:: ../images/combined_ThS.png - :alt: central_MOM + .. image:: ../images/central_balsam.png + :alt: central_Balsam :scale: 40 :align: center -When considering on which nodes to run libEnsemble, consider whether your user -functions execute computationally expensive code or code built for specific -architectures. Recall also that only the MOM nodes can launch MPI applications. +When considering on which nodes to run libEnsemble, consider whether your ``sim_f`` +or ``gen_f`` user functions (not applications) execute computationally expensive +code, or code built specifically for the compute node architecture. Recall also +that only the MOM nodes can launch MPI applications. Although libEnsemble workers on the MOM nodes can technically submit user applications to the compute nodes directly via ``aprun`` within user functions, it is highly recommended that the aforementioned :doc:`executor<../executor/overview>` -interface be used instead. The libEnsemble executor features advantages such as +interface be used instead. The libEnsemble Executor features advantages such as automatic resource detection, portability, launch failure resilience, and ease of use. 
Theta features one default production queue, ``default``, and two debug queues, @@ -185,15 +196,15 @@ convenience function from libEnsemble's :doc:`tools module<../utilities>`. # --- Prepare Python --- - # Load conda module - module load miniconda-3/latest + # Obtain Conda PATH from miniconda-3/latest module + CONDA_DIR=/soft/datascience/conda/miniconda3/latest/bin # Name of conda environment export CONDA_ENV_NAME=my_env # Activate conda environment export PYTHONNOUSERSITE=1 - source activate $CONDA_ENV_NAME + source $CONDA_DIR/activate $CONDA_ENV_NAME # --- Prepare libEnsemble --- @@ -206,12 +217,7 @@ convenience function from libEnsemble's :doc:`tools module<../utilities>`. # Number of workers. export NWORKERS='--nworkers 128' - # Conda location - theta specific - export PATH=/home/user/path/to/packages/:$PATH - export LD_LIBRARY_PATH=/home/user/path/to/packages/:$LD_LIBRARY_PATH - export PYTHONPATH=/home/user/path/to/env/packages:$PYTHONPATH - - # Required for python kills on Theta + # Required for killing tasks from workers on Theta export PMI_NO_FORK=1 # Unload Theta modules that may interfere with task monitoring/kills @@ -229,14 +235,17 @@ libEnsemble on Theta is achieved by running :: Balsam Runs ^^^^^^^^^^^ -Here is an example Balsam submission script: +Here is an example Balsam submission script. It requires a pre-initialized (but not activated) +postgresql_ database. Note that the example runs libEnsemble over two dedicated nodes, reserving the +other 127 nodes for launched applications. libEnsemble is run with MPI on 128 processors +(one manager and 127 workers): .. code-block:: bash #!/bin/bash -x #COBALT -t 60 #COBALT -O libE_test - #COBALT -n 128 + #COBALT -n 129 #COBALT -q default #COBALT -A [project] @@ -244,7 +253,10 @@ Here is an example Balsam submission script: export EXE=calling_script.py # Number of workers.
- export NUM_WORKERS=128 + export NUM_WORKERS=127 + + # Number of nodes to run libE + export LIBE_NODES=2 # Wall-clock for entire libE run (supplied to Balsam) export LIBE_WALLCLOCK=45 @@ -252,20 +264,15 @@ Here is an example Balsam submission script: # Name of working directory where Balsam places running jobs/output export WORKFLOW_NAME=libe_workflow - #Tell libE manager to stop workers, dump timing.dat and exit after time. - export SCRIPT_ARGS=$(($LIBE_WALLCLOCK-3)) + # If user script takes ``elapsed_wallclock_time`` argument. + # export SCRIPT_ARGS=$(($LIBE_WALLCLOCK-3)) + export SCRIPT_ARGS="" # Name of conda environment export CONDA_ENV_NAME=my_env + export BALSAM_DB_NAME=myWorkflow - # Conda location - theta specific - export PATH=/path/to/python/bin:$PATH - export LD_LIBRARY_PATH=~/path/to/conda/env/lib:$LD_LIBRARY_PATH - - #Ensure environment isolated - export PYTHONNOUSERSITE=1 - - # Required for python kills on Theta + # Required for killing tasks from workers on Theta export PMI_NO_FORK=1 # Unload Theta modules that may interfere with task monitoring/kills @@ -273,17 +280,25 @@ Here is an example Balsam submission script: module unload darshan module unload xalt + # Obtain Conda PATH from miniconda-3/latest module + CONDA_DIR=/soft/datascience/conda/miniconda3/latest/bin + + # Ensure environment is isolated + export PYTHONNOUSERSITE=1 + # Activate conda environment - . activate $CONDA_ENV_NAME + source $CONDA_DIR/activate $CONDA_ENV_NAME # Activate Balsam database - . balsamactivate default + source balsamactivate $BALSAM_DB_NAME # Currently need at least one DB connection per worker (for postgres). 
- if [[ $NUM_WORKERS -gt 128 ]] + if [[ $NUM_WORKERS -gt 100 ]] then - #Add a margin - echo -e "max_connections=$(($NUM_WORKERS+10)) #Appended by submission script" >> $BALSAM_DB_PATH/balsamdb/postgresql.conf + # Add a margin + export BALSAM_DB_PATH=~/$BALSAM_DB_NAME # Pre-pend with PATH + echo -e "max_connections=$(($NUM_WORKERS+20)) # Appended by submission script" \ + >> $BALSAM_DB_PATH/balsamdb/postgresql.conf fi wait @@ -294,18 +309,28 @@ Here is an example Balsam submission script: sleep 3 # Add calling script to Balsam database as app and job. - THIS_DIR=$PWD - SCRIPT_BASENAME=${EXE%.*} + export THIS_DIR=$PWD + export SCRIPT_BASENAME=${EXE%.*} + + export LIBE_PROCS=$((NUM_WORKERS+1)) # Manager and workers + export PROCS_PER_NODE=$((LIBE_PROCS/LIBE_NODES)) # Must divide evenly balsam app --name $SCRIPT_BASENAME.app --exec $EXE --desc "Run $SCRIPT_BASENAME" - # Running libE on one node - one manager and upto 63 workers - balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 1 --ranks-per-node $((NUM_WORKERS+1)) --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" --url-in="local:/$THIS_DIR/*" --yes + balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME \ + --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS \ + --wall-time-minutes $LIBE_WALLCLOCK \ + --num-nodes $LIBE_NODES --ranks-per-node $PROCS_PER_NODE \ + --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" \ + --url-in="local:/$THIS_DIR/*" --yes - #Run job + # Run job balsam launcher --consume-all --job-mode=mpi --num-transition-threads=1 - . balsamdeactivate + wait + source balsamdeactivate + +Further examples of Balsam submission scripts can be found in the :doc:`examples`. Debugging Strategies -------------------- @@ -331,6 +356,9 @@ Read the documentation for Balsam here_. ..
_Cobalt: https://www.alcf.anl.gov/support-center/theta/submit-job-theta .. _`Support Center`: https://www.alcf.anl.gov/support-center/theta .. _here: https://balsam.readthedocs.io/en/latest/ +.. .. _Balsam install: https://balsam.readthedocs.io/en/latest/#quick-setup +.. _ps_nodes: https://github.com/Libensemble/libensemble/blob/develop/examples/misc/ps_nodes.sh +.. _postgresql: https://www.alcf.anl.gov/support-center/theta/postgresql-and-sqlite .. _Miniconda: https://docs.conda.io/en/latest/miniconda.html .. _conda: https://conda.io/en/latest/ .. _information: https://www.alcf.anl.gov/user-guides/conda diff --git a/docs/posters.rst b/docs/posters.rst new file mode 100644 index 000000000..cbc50d094 --- /dev/null +++ b/docs/posters.rst @@ -0,0 +1,24 @@ +Posters and Presentations +========================= + +SciPy 2020 Virtual Poster +------------------------- + +.. toctree:: + :maxdepth: 3 + + scipy2020 + +SciPy 2020 PDF Poster +--------------------- + +.. raw:: html + + + +CSE 2019 Poster +--------------- + +.. raw:: html + + diff --git a/docs/programming_libE.rst b/docs/programming_libE.rst index 121dc1540..874fced1f 100644 --- a/docs/programming_libE.rst +++ b/docs/programming_libE.rst @@ -12,6 +12,7 @@ We now give greater detail in programming with libEnsemble. :caption: libEnsemble User Functions: sim_gen_alloc_funcs + function_guides/function_guide_index .. toctree:: executor/ex_index diff --git a/docs/requirements.txt b/docs/requirements.txt index c6ce35f68..662dda543 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,3 @@ sphinx==2.4.1 sphinxcontrib-bibtex +ipykernel diff --git a/docs/running_libE.rst b/docs/running_libE.rst index ea726160a..bb16321b1 100644 --- a/docs/running_libE.rst +++ b/docs/running_libE.rst @@ -103,8 +103,14 @@ The ``libE_specs`` options for TCP are:: 'authkey' [String]: Authkey. +Limitations of TCP mode +^^^^^^^^^^^^^^^^^^^^^^^ + +- There cannot be two calls to ``libE`` in the same script. 
+ Persistent Workers ------------------ +.. _persis_worker: In a regular (non-persistent) worker, the user's generator or simulation function is called whenever the worker receives work. A persistent worker is one that continues to run the generator or simulation function between work units, diff --git a/docs/scipy2020.rst b/docs/scipy2020.rst new file mode 100644 index 000000000..8fbd0c98e --- /dev/null +++ b/docs/scipy2020.rst @@ -0,0 +1,204 @@ +.. image:: images/ECP_logo.png + :alt: ECP + :width: 23 % + :align: left + +.. image:: images/ANL_CMYK.png + :alt: ANL + :width: 33 % + :align: right + +.. image:: images/white.png + :align: center + :width: 33 % + :height: 1.2 in + +========================================================================= +**libEnsemble**: A Python Library for Dynamic Ensemble-Based Computations +========================================================================= + +*David Bindel, Stephen Hudson, Jeffrey Larson, John-Luke Navarro and Stefan Wild* + +A PDF poster version of this content is available on figshare_. + +.. _FigShare: https://figshare.com/articles/libEnsemble_A_Python_Library_for_Dynamic_Ensemble-Based_Computations/12559520 + +Overview +-------- + +**libEnsemble** is a Python library for coordinating the concurrent evaluation of +dynamic ensembles of calculations. The library is developed to use massively +parallel resources to accelerate the solution of design, decision, and +inference problems and to expand the class of problems that can benefit from +increased concurrency levels. + +libEnsemble aims for the following: + +• Extreme scaling +• Resilience/fault tolerance +• Monitoring/killing of tasks (and recovering resources) +• Portability and flexibility +• Exploitation of persistent data/control flow + +libEnsemble is most commonly used to coordinate large numbers of parallel +instances (ensembles) of simulations at huge scales. 
+ +Using libEnsemble +----------------- + +The user selects or supplies a ``gen_f`` function that generates simulation +input and a ``sim_f`` function that performs and monitors simulations. The user +parameterizes these functions and initiates libEnsemble in a *calling script*. +Examples and templates of such scripts and functions are included in the library. + +.. image:: images/using_new.png + :alt: Using libEnsemble + :scale: 30 % + :align: center + +For example, the ``gen_f`` may contain an optimization routine to generate new +simulation parameters on-the-fly based on results from previous ``sim_f`` +simulations. + +Other potential use-cases include: + +==================== ===================== +Generator Functions: Simulation Functions: +==================== ===================== +Parameter estimation Particle-in-cell +Surrogate models Subsurface flow +Sensitivity analysis PETSc simulations +Design optimization DFT simulations +Supervised learning Quantum chemistry +==================== ===================== + +Manager and Workers +------------------- + +libEnsemble employs a manager/worker scheme that can communicate through **MPI**, +Python's **multiprocessing**, or **TCP**. Each *worker* +can control and monitor any level of work, from small sub-node tasks to huge +many-node simulations. The *manager* allocates workers to asynchronously execute +``gen_f`` generation functions and ``sim_f`` simulation functions based on +produced output, directed by a provided ``alloc_f`` allocation function. + +.. image:: images/logo_manager_worker.png + :alt: Managers and Workers + :align: center + :scale: 40 % + +Flexible Run Mechanisms +----------------------- + +libEnsemble has been developed, supported, and tested on systems of highly +varying scales, from laptops to machines with thousands of compute nodes. +On multi-node systems, there are two basic modes of configuring libEnsemble to +run and launch tasks (user applications) on available nodes. 
+ +* **Distributed**: Workers are distributed across allocated nodes and launch tasks in-place. Workers share nodes with their applications. + +.. image:: images/distributed_new.png + :alt: Distributed + :align: center + :scale: 30 % + +* **Centralized**: Workers run on one or more dedicated nodes and launch tasks to the remaining allocated nodes. + +.. image:: images/centralized_new.png + :alt: Centralized + :align: center + :scale: 30 % + +.. note:: + Dividing up workers and tasks to allocated nodes is highly configurable. + Multiple workers (and thus multiple tasks or user function instances) can be + assigned to a single node. Alternatively, multiple nodes may be assigned to + a single worker and each routine it performs. + +Executor Module +--------------- + +An *Executor* interface is provided to ensure libEnsemble routines that +coordinate user applications are portable, resilient, and flexible. The Executor +automatically detects allocated nodes and available cores and can split up tasks +if resource data isn't supplied. + +The Executor is agnostic of both the job launch/management system and selected +manager/worker communication method on each machine. The main functions are +``submit()``, ``poll()``, and ``kill()``. + +On machines that do not support launches from compute nodes, libEnsemble's +Executor can interface with the **Balsam** library, which functions as a proxy +job launcher that maintains and submits jobs from a database on front end launch +nodes. + +.. 
image:: images/central_balsam.png + :alt: Central Balsam + :align: center + :scale: 40 % + +Supported Research Machines +--------------------------- + +libEnsemble is tested and supported on the following high-performance research machines: + +======== ======================================================= ======== ================================================ +Machine Location Facility Info +======== ======================================================= ======== ================================================ +Summit_ `Oak Ridge National Laboratory`_ OLCF_ IBM AC922, IBM POWER9 nodes w/ NVIDIA Volta GPUs +Theta_ `Argonne National Laboratory`_ ALCF_ Cray XC40, Intel KNL nodes +Cori_ `National Energy Research Scientific Computing Center`_ Cray XC40, Intel Haswell & KNL nodes +Bridges_ `Pittsburgh Supercomputing Center`_ HPE, Intel Haswell nodes, NVIDIA GPU nodes +======== ======================================================= ======== ================================================ + +.. _Summit: https://www.olcf.ornl.gov/olcf-resources/compute-systems/summit/ +.. _Theta: https://www.alcf.anl.gov/alcf-resources/theta +.. _Cori: https://docs.nersc.gov/systems/cori/ +.. _Bridges: https://www.psc.edu/resources/computing/bridges + +.. _`Oak Ridge National Laboratory`: https://www.ornl.gov/ +.. _`Argonne National Laboratory`: https://www.anl.gov/ +.. _`National Energy Research Scientific Computing Center`: https://www.nersc.gov/ +.. _`Pittsburgh Supercomputing Center`: https://www.psc.edu/ + +.. _OLCF: https://www.olcf.ornl.gov/ +.. _ALCF: https://www.alcf.anl.gov/ + +Running at Scale +---------------- + +**OPAL Simulations** + +* ALCF/Theta (Cray XC40) with Balsam, at Argonne National Laboratory +* 1030 node allocation, 511 workers, MPI communications. +* 2044 2-node simulations +* Object Oriented Parallel Accelerator Library (OPAL) simulation functions. + +.. list-table:: + + * - .. 
figure:: images/libe_opal_complete_v_killed_511w_2044sims_1030nodes.png + + Histogram of completed and killed simulations, binned by run time. + + * - .. figure:: images/libe_opal_util_v_time_511w_2044sims_1030nodes.png + + Total number of Balsam-launched applications running over time. + +Try libEnsemble Online +---------------------- + +Try libEnsemble online with two Jupyter notebook examples. + +The first notebook demonstrates the basics of parallel ensemble calculations +with libEnsemble through a Simple Functions Tutorial. The second notebook, an +Executor Tutorial, contains an example similar to most use-cases: simulation +functions that launch and coordinate user applications. + +.. note:: + The Executor Tutorial notebook may take a couple minutes to initiate. + +.. image:: https://img.shields.io/badge/libEnsemble-Simple%20Functions%20Tutorial-579ACA.svg?logo= + :target: https://mybinder.org/v2/gh/Libensemble/libensemble/develop?filepath=examples%2Ftutorials%2Fsine_tutorial_notebook.ipynb + +.. image:: https://img.shields.io/badge/libEnsemble-Executor%20Tutorial-E66581.svg?logo= + :target: https://mybinder.org/v2/gh/Libensemble/libensemble/develop?filepath=examples%2Ftutorials%2Fforces_tutorial_notebook.ipynb diff --git a/docs/tutorials/aposmm_tutorial.rst b/docs/tutorials/aposmm_tutorial.rst new file mode 100644 index 000000000..6614ad2b5 --- /dev/null +++ b/docs/tutorials/aposmm_tutorial.rst @@ -0,0 +1,292 @@ +================================= +Parallel Optimization with APOSMM +================================= + +This tutorial demonstrates libEnsemble's capability to identify multiple minima +of simulation output using the built-in :doc:`APOSMM<../examples/aposmm>` +(Asynchronously Parallel Optimization Solver for finding Multiple Minima) +:ref:`gen_f`. 
In this tutorial, we'll create a simple +simulation :ref:`sim_f` that defines a function with +multiple minima, then write a libEnsemble calling script that imports APOSMM and +parameterizes it to check for minima over a domain of outputs from our ``sim_f``. + +Six-Hump Camel Simulation Function +---------------------------------- + +Describing APOSMM's operations is simpler with a given function on which to +depict evaluations. We'll use the `Six-Hump Camel function`_, known to have six +global minima. A sample space of this function, containing all minima, appears +below: + +.. image:: ../images/basic_6hc.png + :alt: Six-Hump Camel + :scale: 60 + :align: center + +Create a new Python file named ``six_hump_camel.py``. This will be our +``sim_f``, incorporating the above function. Write the following: + +.. code-block:: python + :linenos: + + import numpy as np + + def six_hump_camel(H, persis_info, sim_specs, _): + """Six-Hump Camel sim_f.""" + + batch = len(H['x']) # Num evaluations each sim_f call. + H_o = np.zeros(batch, dtype=sim_specs['out']) # Define output array H + + for i, x in enumerate(H['x']): + H_o['f'][i] = three_hump_camel_func(x) # Function evaluations placed into H + + return H_o, persis_info + + def six_hump_camel_func(x): + """ Six-Hump Camel function definition """ + x1 = x[0] + x2 = x[1] + term1 = (4-2.1*x1**2+(x1**4)/3) * x1**2 + term2 = x1*x2 + term3 = (-4+4*x2**2) * x2**2 + + return term1 + term2 + term3 + +APOSMM Operations +----------------- + +APOSMM coordinates multiple local optimization runs starting from a collection +of sample points. These local optimization runs occur concurrently, +and can incorporate a variety of optimization methods, including from NLopt_, +`PETSc/TAO`_, SciPy_, or other external scripts. + +Before APOSMM can start local optimization runs, some number of uniformly +sampled points must be evaluated (if no prior simulation evaluations are +provided). 
User-requested sample points can also be provided to APOSMM: + +.. image:: ../images/sampling_6hc.png + :alt: Six-Hump Camel Sampling + :scale: 60 + :align: center + +Specifically, APOSMM will begin local optimization runs from evaluated points that +don't have points with smaller function values nearby (within a threshold +``r_k``). For the above example, after APOSMM receives the evaluations of the +uniformly sampled points, it will begin at most ``max_active_runs`` local +optimization runs. + +As function values are returned to APOSMM, APOSMM gives them to each local +optimization run in order to generate the next point(s); these are returned to +the manager to be evaluated by the simulation routine. As runs complete (a +minimum is found, or some termination criteria for the local optimization run +is satisfied), +additional local optimization runs may be started or additional uniformly +sampled points may be evaluated. This continues until a ``STOP_TAG`` is sent by +the manager, for example when the budget of simulation evaluations has been +exhausted, or when a sufficiently "good" simulation output has been observed. + +.. image:: ../images/localopt_6hc.png + :alt: Six-Hump Camel Local Optimization Points + :scale: 60 + :align: center + +Throughout, generated and evaluated points are appended to the +:ref:`History` array, with the field +``'local_pt'`` being ``True`` if the point is part of a local optimization run, +and ``'local_min'`` being ``True`` if the point has been ruled a local minimum. + +APOSMM Persistence +------------------ + +The most recent version of APOSMM included with libEnsemble is referred to as +Persistent APOSMM. Unlike most other user functions that are initiated and +completed by workers multiple times based on allocation, a single worker process +initiates APOSMM so that it "persists" and keeps running over the course of the +entire libEnsemble routine. 
APOSMM begins its own parallel evaluations and
code-block:: python + :linenos: + + nworkers, is_master, libE_specs, _ = parse_args() + + sim_specs = {'sim_f': six_hump_camel, # Simulation function + 'in': ['x'], # Accepts 'x' values + 'out': [('f', float)]} # Returns f(x) values + + gen_out = [('x', float, 2), # Produces 'x' values + ('x_on_cube', float, 2), # 'x' values scaled to unit cube + ('sim_id', int), # Produces sim_id's for History array indexing + ('local_min', bool), # Is a point a local minimum? + ('local_pt', bool)] # Is a point from a local opt run? + + gen_specs = {'gen_f': aposmm, # APOSMM generator function + 'in': [], + 'out': gen_out, # Output defined like above dict + 'user': {'initial_sample_size': 100, # Random sample 100 points to start + 'localopt_method': 'scipy_Nelder-Mead', + 'opt_return_codes': [0], # Status integers specific to localopt_method + 'max_active_runs': 6, # Occur in parallel + 'lb': np.array([-2, -1]), # Lower bound of search domain + 'ub': np.array([2, 1])} # Upper bound of search domain + } + + alloc_specs = {'alloc_f': persistent_aposmm_alloc, + 'out': [('given_back', bool)], 'user': {}} + +``gen_specs['user']`` fields above that are required for APOSMM are: + + * ``'lb'`` - Search domain lower bound + * ``'ub'`` - Search domain upper bound + * ``'localopt_method'`` - Chosen local optimization method + * ``'initial_sample_size'`` - Number of uniformly sampled points generated + before local optimization runs. + * ``'opt_return_codes'`` - A list of integers that local optimization + methods return when a minimum is detected. SciPy's Nelder-Mead returns 0, + but other methods (not used in this tutorial) return 1. + +Also note the following: + + * ``gen_specs['in']`` is empty. For other ``gen_f``'s this defines what + fields to give to the ``gen_f`` when called, but here APOSMM's + ``alloc_f`` defines those fields. + * ``'x_on_cube'`` in ``gen_specs['out']``. APOSMM works internally on + ``'x'`` values scaled to the unit cube. 
To avoid back-and-forth scaling + issues, both types of ``'x'``'s are communicated back, even though the + simulation will likely use ``'x'`` values. (APOSMM performs handshake to + ensure that the ``x_on_cube`` that was given to be evaluated is the same + the one that is given back.) + * ``'sim_id'`` in ``gen_specs['out']``. APOSMM produces points in it's + local History array that it will need to update later, and can best + reference those points (and avoid a search) if APOSMM produces the IDs + itself, instead of libEnsemble. + +Other options and configurations for APOSMM can be found in the +APOSMM :doc:`API reference<../examples/aposmm>`. + +Set :ref:`exit_criteria` so libEnsemble knows +when to complete, and :ref:`persis_info` for +random sampling seeding: + +.. code-block:: python + :linenos: + + exit_criteria = {'sim_max': 2000} + persis_info = add_unique_random_streams({}, nworkers + 1) + +Finally, add statements to :doc:`initiate libEnsemble<../libe_module>`, and quickly +check calculated minima: + +.. code-block:: python + :linenos: + + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, + alloc_specs, libE_specs) + if is_master: + print('Minima:', H[np.where(H['local_min'])]['x']) + +Final Setup, Run, and Output +---------------------------- + +If you haven't already, install SciPy so APOSMM can access the required +optimization method:: + + pip install scipy + +Finally, run this libEnsemble / APOSMM optimization routine with the following:: + + python my_first_aposmm.py --comms local --nworkers 4 + +Please note that one worker will be "persistent" for APOSMM for the duration of +the routine. + +After a couple seconds, the output should resemble the following:: + + [0] libensemble.libE (MANAGER_WARNING): + ******************************************************************************* + User generator script will be creating sim_id. + Take care to do this sequentially. 
+ Also, any information given back for existing sim_id values will be overwritten! + So everything in gen_specs['out'] should be in gen_specs['in']! + ******************************************************************************* + + Minima: [[ 0.08993295 -0.71265804] + [ 1.70360676 -0.79614982] + [-1.70368421 0.79606073] + [-0.08988064 0.71270945] + [-1.60699361 -0.56859108] + [ 1.60713962 0.56869567]] + +The first section labeled ``MANAGER_WARNING`` is a default libEnsemble warning +for generator functions that create ``sim_id``'s, like APOSMM. It does not +indicate a failure. + +The local minima for the Six-Hump Camel simulation function as evaluated by +APOSMM with libEnsemble should be listed directly below the warning. + +Please see the API reference :doc:`here<../examples/aposmm>` for +more APOSMM configuration options and other information. + +Applications +------------ + +APOSMM is not limited to evaluating minima from pure Python simulation functions. +Many common libEnsemble use-cases involve using +libEnsemble's :doc:`MPI Executor<../executor/overview>` to launch user +applications with parameters requested by APOSMM, then evaluate their output using +APOSMM, and repeat until minima are identified. A currently supported example +can be found in libEnsemble's `WarpX Scaling Test`_. + +.. _`Six-Hump Camel function`: https://www.sfu.ca/~ssurjano/camel6.html +.. _NLopt: https://nlopt.readthedocs.io/en/latest/ +.. _`PETSc/TAO`: https://www.mcs.anl.gov/petsc/ +.. _SciPy: https://www.scipy.org/scipylib/index.html +.. 
_`WarpX Scaling Test`: https://github.com/Libensemble/libensemble/tree/master/libensemble/tests/scaling_tests/warpx diff --git a/docs/tutorials/executor_forces_tutorial.rst b/docs/tutorials/executor_forces_tutorial.rst index c1d5e1184..a97924cfa 100644 --- a/docs/tutorials/executor_forces_tutorial.rst +++ b/docs/tutorials/executor_forces_tutorial.rst @@ -296,6 +296,30 @@ Load output data from our task and return to the libEnsemble manager: return output, persis_info, calc_status +This completes our ``sim_f`` and calling script. Run libEnsemble with: + +.. code-block:: bash + + $ python my_calling_script.py --comms local --nworkers 4 + +This may take about a minute to complete. Output should appear in a new +directory ``./ensemble``, with sub-directories labeled by ``sim_id`` and worker. + +The following optional lines parse and display some output: + +.. code-block:: python + :linenos: + + import os + + for dir in os.listdir('./ensemble'): + with open(os.path.join('./ensemble', dir, 'out.txt')) as f: + out = f.readlines() + print(dir + ':') + for line in out: + print(line) + print('-'*60) + Executor Variants ----------------- diff --git a/docs/tutorials/local_sine_tutorial.rst b/docs/tutorials/local_sine_tutorial.rst index 246d23dac..6c3614366 100644 --- a/docs/tutorials/local_sine_tutorial.rst +++ b/docs/tutorials/local_sine_tutorial.rst @@ -10,7 +10,7 @@ The foundation of writing libEnsemble routines is accounting for four components 1. The generator function :ref:`gen_f`, which produces values for simulations 2. The simulator function :ref:`sim_f`, which performs simulations based on values from ``gen_f`` - 3. The allocation function :ref:`alloc_f`, which decides which of the previous two functions should be called when + 3. The allocation function :ref:`alloc_f`, which decides which of the previous two functions should be called 4. 
The calling script, which defines parameters and information about these functions and the libEnsemble task, then begins execution libEnsemble initializes a *manager* process and as many *worker* processes as the diff --git a/docs/tutorials/tutorials.rst b/docs/tutorials/tutorials.rst index 2db82ad14..efa5ccf91 100644 --- a/docs/tutorials/tutorials.rst +++ b/docs/tutorials/tutorials.rst @@ -5,3 +5,4 @@ Tutorials local_sine_tutorial executor_forces_tutorial + aposmm_tutorial diff --git a/docs/utilities.rst b/docs/utilities.rst index 00dcdb210..c91311a23 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -10,6 +10,7 @@ Calling Script Function Support Generator Function Support -------------------------- +.. _p_gen_routines: These routines are commonly used within persistent generator functions like ``persistent_aposmm`` in ``libensemble/gen_funcs/`` for intermediate diff --git a/examples/libE_submission_scripts/bridges_submit_slurm_central.sh b/examples/libE_submission_scripts/bridges_submit_slurm_central.sh index ba1ee7062..d00f3d041 100644 --- a/examples/libE_submission_scripts/bridges_submit_slurm_central.sh +++ b/examples/libE_submission_scripts/bridges_submit_slurm_central.sh @@ -16,7 +16,7 @@ export EXE=libE_calling_script.py export NUM_WORKERS=4 -mpirun -np $(($NUM_WORKERS+1)) python $EXE +mpirun -np $(($NUM_WORKERS+1)) -ppn $(($NUM_WORKERS+1)) python $EXE # To use local mode instead of mpi4py (with parse_args()) # python $EXE --comms local --nworkers $NUM_WORKERS diff --git a/examples/libE_submission_scripts/theta_submit_balsam.sh b/examples/libE_submission_scripts/theta_submit_balsam.sh index 5351bcb8a..c4100a888 100644 --- a/examples/libE_submission_scripts/theta_submit_balsam.sh +++ b/examples/libE_submission_scripts/theta_submit_balsam.sh @@ -1,10 +1,9 @@ #!/bin/bash -x #COBALT -t 30 #COBALT -O libE_MPI_balsam -#COBALT -n 5 # No. 
nodes -#COBALT -q debug-flat-quad # Up to 8 nodes only -##COBALT -q default # For large jobs >=128 nodes -##COBALT -A +#COBALT -n 5 +#COBALT -q debug-flat-quad # Up to 8 nodes only # Use default for >=128 nodes +#COBALT -A # Script to launch libEnsemble using Balsam. # Assumes Conda environment is set up. @@ -14,16 +13,15 @@ # - Manager and workers run on one node (or a dedicated set of nodes). # - Workers submit tasks to the rest of the nodes in the pool. -# Constaint: - As set up - only uses one node (up to 63 workers) for libE. -# To use more, modifiy "balsam job" line to use hyper-threading -# and/or more than one node for libE. - # Name of calling script export EXE=libE_calling_script.py # Number of workers. export NUM_WORKERS=4 +# Number of nodes to run libE +export LIBE_NODES=1 + # Balsam wall-clock in minutes - make few mins smaller than batch wallclock export BALSAM_WALLCLOCK=25 @@ -37,44 +35,42 @@ export LIBE_WALLCLOCK=$(($BALSAM_WALLCLOCK-3)) # libEnsemble calling script arguments (some alternatives shown) # No args. All defined in calling script -# export SCRIPT_ARGS='' +export SCRIPT_ARGS='' -# If calling script is using parse_args() -# export SCRIPT_ARGS="--comms mpi --nworkers $NUM_WORKERS - -# If calling script takes wall-clock as positional arg and uses parse_args() -export SCRIPT_ARGS="$LIBE_WALLCLOCK --comms mpi --nworkers $NUM_WORKERS" +# If calling script takes wall-clock as positional argument. +# export SCRIPT_ARGS="$LIBE_WALLCLOCK" # Name of Conda environment export CONDA_ENV_NAME= # Name of database -export DBASE_NAME= # default - to use default database. - -# Conda location - theta specific -export PATH=/opt/intel/python/2017.0.035/intelpython35/bin:$PATH -export LD_LIBRARY_PATH=~/.conda/envs/$CONDA_ENV_NAME/lib:$LD_LIBRARY_PATH +export BALSAM_DB_NAME= # default - to use default database. -export PYTHONNOUSERSITE=1 # Ensure environment isolated - -export PMI_NO_FORK=1 # Required for python kills on Theta - -# Activate conda environment -. 
activate $CONDA_ENV_NAME +# Required for killing tasks from workers on Theta +export PMI_NO_FORK=1 # Unload Theta modules that may interfere with job monitoring/kills module unload trackdeps module unload darshan module unload xalt +# Obtain Conda PATH from miniconda-3/latest module +CONDA_DIR=/soft/datascience/conda/miniconda3/latest/bin + +# Ensure environment isolated +export PYTHONNOUSERSITE=1 + +# Activate conda environment +source $CONDA_DIR/activate $CONDA_ENV_NAME + # Activate Balsam database -. balsamactivate $DBASE_NAME +source balsamactivate $BALSAM_DB_NAME # Currently need atleast one DB connection per worker (for postgres). -if [[ $NUM_WORKERS -gt 128 ]] +if [[ $NUM_WORKERS -gt 100 ]] then #Add a margin - echo -e "max_connections=$(($NUM_WORKERS+10)) #Appended by submission script" \ + echo -e "max_connections=$(($NUM_WORKERS+20)) #Appended by submission script" \ >> $BALSAM_DB_PATH/balsamdb/postgresql.conf fi wait @@ -86,29 +82,33 @@ wait sleep 3 # Add calling script to Balsam database as app and job. 
-THIS_DIR=$PWD -SCRIPT_BASENAME=${EXE%.*} +export THIS_DIR=$PWD +export SCRIPT_BASENAME=${EXE%.*} + +# Multiple nodes +export LIBE_PROCS=$((NUM_WORKERS+1)) # Manager and workers +export PROCS_PER_NODE=$((LIBE_PROCS/LIBE_NODES)) # Must divide evenly balsam app --name $SCRIPT_BASENAME.app --exec $EXE --desc "Run $SCRIPT_BASENAME" -# Running libE on one node - one manager and upto 63 workers balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME \ - --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS \ - --wall-time-minutes $BALSAM_WALLCLOCK \ - --num-nodes 1 --ranks-per-node $((NUM_WORKERS+1)) \ - --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" \ - --url-in="local:/$THIS_DIR/*" --yes + --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS \ + --wall-time-minutes $LIBE_WALLCLOCK \ + --num-nodes $LIBE_NODES --ranks-per-node $PROCS_PER_NODE \ + --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" \ + --url-in="local:/$THIS_DIR/*" --yes # Hyper-thread libE (note this will not affect HT status of user calcs - only libE itself) -# Running 255 workers and one manager on one libE node. -# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $BALSAM_WALLCLOCK --num-nodes 1 --ranks-per-node 256 --threads-per-core 4 --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" --url-in="local:/$THIS_DIR/*" --yes - -# Multiple nodes for libE -# Running 127 workers and one manager - launch script on 129 nodes (if one node per worker) -# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $BALSAM_WALLCLOCK --num-nodes 2 --ranks-per-node 64 --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" --url-in="local:/$THIS_DIR/*" --yes +# E.g. Running 255 workers and one manager on one libE node. 
+# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME \ +# --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS \ +# --wall-time-minutes $LIBE_WALLCLOCK \ +# --num-nodes 1 --ranks-per-node 256 --threads-per-core 4 \ +# --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" \ +# --url-in="local:/$THIS_DIR/*" --yes # Run job balsam launcher --consume-all --job-mode=mpi --num-transition-threads=1 -# Deactivate Balsam database -. balsamdeactivate +wait +source balsamdeactivate diff --git a/examples/misc/ps_nodes.sh b/examples/misc/ps_nodes.sh new file mode 100755 index 000000000..108d83a7b --- /dev/null +++ b/examples/misc/ps_nodes.sh @@ -0,0 +1,29 @@ +##!/bin/bash + +# If a prosgres process is running, ssh to node and kill process +export uname=$USER +export appname='postgres \-D' + +# Check 6 login nodes +for i in {1..6} +do + hname=thetalogin$i + if [[ "$HOSTNAME" = $hname ]] + then + hostname; ps aux|grep $uname|grep "$appname" + else + ssh $hname "hostname; ps aux|grep $uname|grep '$appname'" + fi +done + +# Check 3 MOM nodes +for i in {1..3} +do + hname=thetamom$i + if [[ "$HOSTNAME" = $hname ]] + then + hostname; ps aux|grep $uname|grep '$appname' + else + ssh $hname "hostname; ps aux|grep $uname|grep '$appname'" + fi +done diff --git a/examples/tutorials/aposmm_tutorial_notebook.ipynb b/examples/tutorials/aposmm_tutorial_notebook.ipynb new file mode 100644 index 000000000..ba10f0a7d --- /dev/null +++ b/examples/tutorials/aposmm_tutorial_notebook.ipynb @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parallel Optimization with APOSMM\n", + "\n", + "This tutorial demonstrates libEnsemble’s capability to identify multiple minima of simulation output using the built-in APOSMM (Asynchronously Parallel Optimization Solver for finding Multiple Minima) ``gen_f``. 
In this tutorial, we’ll create a simple simulation ``sim_f`` that defines a function with multiple minima, then write a libEnsemble calling script that imports APOSMM and parameterizes it to check for minima over a domain of outputs from our ``sim_f``.\n", + "\n", + "Besides libEnsemble and NumPy, SciPy is also a required dependency.\n", + "\n", + "## Six-Hump Camel Simulation Function\n", + "\n", + "Describing APOSMM’s operations is simpler with a given function on which to depict evaluations. We’ll use the Six-Hump Camel function, known to have six global minima. A sample space of this function, containing all minima, appears below:\n", + "\n", + "![6humpcamel](images/basic_6hc.png)\n", + "\n", + "*Note: The following ``sim_f`` won't operate stand-alone since it has not yet been parameterized and called by libEnsemble. The full routine should work as expected.*\n", + "\n", + "Create a new Python file named ``tutorial_six_hump_camel.py``. This will be our ``sim_f``, incorporating the above function. 
Write the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def six_hump_camel(H, persis_info, sim_specs, _):\n", + " \"\"\"Six-Hump Camel sim_f.\"\"\"\n", + "\n", + " batch = len(H['x']) # Num evaluations each sim_f call.\n", + " H_o = np.zeros(batch, dtype=sim_specs['out']) # Define output array H\n", + "\n", + " for i, x in enumerate(H['x']):\n", + " H_o['f'][i] = three_hump_camel_func(x) # Function evaluations placed into H\n", + "\n", + " return H_o, persis_info\n", + "\n", + "\n", + "def six_hump_camel_func(x):\n", + " \"\"\" Six-Hump Camel function definition \"\"\"\n", + " x1 = x[0]\n", + " x2 = x[1]\n", + " term1 = (4-2.1*x1**2+(x1**4)/3) * x1**2\n", + " term2 = x1*x2\n", + " term3 = (-4+4*x2**2) * x2**2\n", + "\n", + " return term1 + term2 + term3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## APOSMM Operations\n", + "\n", + "APOSMM coordinates multiple local optimization runs starting from a collection of sample points. These local optimization runs occur in parallel, and can incorporate a variety of optimization methods, including from NLopt, PETSc/TAO, and SciPy. Some number of uniformly sampled points is returned by APOSMM for simulation evaluations before local optimization runs can occur, if no prior simulation evaluations are provided. User-requested sample points can also be provided to APOSMM:\n", + "\n", + "![6hcsampling](images/sampling_6hc.png)\n", + "\n", + "Specifically, APOSMM will begin local optimization runs from those points that don’t have better (more minimal) points nearby within a threshold ``r_k``. For the above example, after APOSMM has returned the uniformly sampled points, for simulation evaluations it will likely begin local optimization runs from the user-requested approximate minima. 
Providing these isn’t required, but can offer performance benefits.\n", + "\n", + "Each local optimization run chooses new points and determines if they’re better by passing them back to be evaluated by the simulation routine. If so, new local optimization runs are started from those points. This continues until each run converges to a minimum:\n", + "\n", + "![6hclocalopt](images/localopt_6hc.png)\n", + "\n", + "Throughout, generated and evaluated points are appended to the History array, with the field ``'local_pt'`` being ``True`` if the point is part of a local optimization run, and ``'local_min'`` being ``True`` if the point has been ruled a local minimum.\n", + "\n", + "## APOSMM Persistence\n", + "\n", + "The most recent version of APOSMM included with libEnsemble is referred to as Persistent APOSMM. Unlike most other user functions that are initiated and completed by workers multiple times based on allocation, a single worker process initiates APOSMM so that it “persists” and keeps running over the course of the entire libEnsemble routine. APOSMM begins it’s own parallel evaluations and communicates points back and forth with the manager, which are then given to workers and evaluated by simulation routines.\n", + "\n", + "In practice, since a single worker becomes “persistent” for APOSMM, users must ensure that enough workers or MPI ranks are initiated to support libEnsemble’s manager, a persistent worker to run APOSMM, and simulation routines. The following:\n", + "\n", + " mpiexec -n 3 python my_aposmm_routine.py\n", + " \n", + "results in only one worker process available to perform simulation routines.\n", + "\n", + "## Calling Script\n", + "\n", + "Create a new Python file named ``my_first_aposmm.py``. 
Start by importing NumPy, libEnsemble routines, APOSMM, our ``sim_f``, and a specialized allocation function:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from tutorial_six_hump_camel import six_hump_camel\n", + "\n", + "from libensemble.libE import libE\n", + "from libensemble.gen_funcs.persistent_aposmm import aposmm\n", + "from libensemble.alloc_funcs.persistent_aposmm_alloc import persistent_aposmm_alloc\n", + "from libensemble.tools import parse_args, add_unique_random_streams" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This allocation function starts a single Persistent APOSMM routine and provides ``sim_f`` output for points requested by APOSMM. Points can be sampled points or points from local optimization runs.\n", + "\n", + "APOSMM supports a wide variety of external optimizers. The following statements set optimizer settings to ``'scipy'`` to indicate to APOSMM which optimization method to use, and help prevent unnecessary imports or package installations:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import libensemble.gen_funcs\n", + "libensemble.gen_funcs.rc.aposmm_optimizers = 'scipy'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up ``nworkers``, ``libE_specs``, ``sim_specs``, ``gen_specs``, and ``alloc_specs``:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "nworkers = 4\n", + "libE_specs = {'nworkers': 4, 'comms': 'local'}\n", + "\n", + "sim_specs = {'sim_f': six_hump_camel, # Simulation function\n", + " 'in': ['x'], # Accepts 'x' values\n", + " 'out': [('f', float)]} # Returns f(x) values\n", + "\n", + "gen_out = [('x', float, 2), # Produces 'x' values\n", + " ('x_on_cube', float, 2), # 'x' values scaled to unit cube\n", + " ('sim_id', int), # Produces 
sim_id's for History array indexing\n", + " ('local_min', bool), # Is a point a local minimum?\n", + " ('local_pt', bool)] # Is a point from a local opt run?\n", + "\n", + "gen_specs = {'gen_f': aposmm, # APOSMM generator function\n", + " 'in': [],\n", + " 'out': gen_out, # Output defined like above dict\n", + " 'user': {'initial_sample_size': 100, # Random sample 100 points to start\n", + " 'localopt_method': 'scipy_Nelder-Mead',\n", + " 'opt_return_codes': [0], # Return code specific to localopt_method\n", + " 'max_active_runs': 6, # Occur in parallel\n", + " 'lb': np.array([-2, -1]), # Lower bound of search domain\n", + " 'ub': np.array([2, 1])} # Upper bound of search domain\n", + " }\n", + "\n", + "alloc_specs = {'alloc_f': persistent_aposmm_alloc,\n", + " 'out': [('given_back', bool)], 'user': {}}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``gen_specs['user']`` fields above that are required for APOSMM are:\n", + "\n", + "* ``'lb'`` - Search domain lower bound\n", + "* ``'ub'`` - Search domain upper bound\n", + "* ``'localopt_method'`` - Chosen local optimization method\n", + "* ``'initial_sample_size'`` - Number of uniformly sampled points generated\n", + " before local optimization runs.\n", + "* ``'opt_return_codes'`` - A list of integers that local optimization\n", + " methods return when a minimum is detected. SciPy's Nelder-Mead returns 0,\n", + " but other methods (not used in this tutorial) return 1.\n", + "\n", + "Also note the following:\n", + "\n", + "* ``gen_specs['in']`` is empty. For other ``gen_f``'s this defines what\n", + " fields to give to the ``gen_f`` when called, but here APOSMM's\n", + " ``alloc_f`` defines those fields.\n", + "* ``'x_on_cube'`` in ``gen_specs['out']``. APOSMM works internally on\n", + " ``'x'`` values scaled to the unit cube. To avoid back-and-forth scaling\n", + " issues, both types of ``'x'``'s are communicated back, even though the\n", + " simulation will likely use ``'x'`` values. 
(APOSMM performs a handshake to\n", + " ensure that the ``x_on_cube`` that was given to be evaluated is the same\n", + " as the one that is given back.)\n", + "* ``'sim_id'`` in ``gen_specs['out']``. APOSMM produces points in its\n",
0.56869567]]\n" + ] + } + ], + "source": [ + "H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info,\n", + " alloc_specs, libE_specs)\n", + "\n", + "print('Minima:', H[np.where(H['local_min'])]['x'])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output\n", + "\n", + "Please note that one worker will be “persistent” for APOSMM for the duration of the routine.\n", + "\n", + "After a couple seconds, the above output should resemble the following:\n", + "\n", + " [0] libensemble.libE (MANAGER_WARNING):\n", + " *******************************************************************************\n", + " User generator script will be creating sim_id.\n", + " Take care to do this sequentially.\n", + " Also, any information given back for existing sim_id values will be overwritten!\n", + " So everything in gen_specs['out'] should be in gen_specs['in']!\n", + " *******************************************************************************\n", + "\n", + "\n", + " Minima: [[ 0.08993295 -0.71265804]\n", + " [ 1.70360676 -0.79614982]\n", + " [-1.70368421 0.79606073]\n", + " [-0.08988064 0.71270945]\n", + " [-1.60699361 -0.56859108]\n", + " [ 1.60713962 0.56869567]]\n", + " \n", + "The first section labeled ``MANAGER_WARNING`` is a default libEnsemble warning for generator functions that create ``sim_id``’s, like APOSMM. 
It does not indicate a failure.\n", + "\n", + "The local minima for the Six-Hump Camel simulation function as evaluated by APOSMM with libEnsemble should be listed directly below the warning.\n", + "\n", + "Please see the [API reference](https://libensemble.readthedocs.io/en/master/examples/aposmm.html) for more APOSMM configuration options and other information.\n", + "\n", + "## Applications\n", + "\n", + "APOSMM is not limited to evaluating minima from pure Python simulation functions.\n", + "Many common libEnsemble use-cases involve using libEnsemble's Executor to launch user\n", + "applications with parameters requested by APOSMM, then evaluate their output using\n", + "APOSMM, and repeat until minima are identified. A currently supported example\n", + "can be found in libEnsemble's [WarpX Scaling Test](https://github.com/Libensemble/libensemble/tree/master/libensemble/tests/scaling_tests/warpx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/forces_tutorial_notebook.ipynb b/examples/tutorials/forces_tutorial_notebook.ipynb new file mode 100644 index 000000000..6e7593d96 --- /dev/null +++ b/examples/tutorials/forces_tutorial_notebook.ipynb @@ -0,0 +1,403 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Executor with Electrostatic Forces\n", + "----------------------------------------\n", + "\n", + "This tutorial highlights libEnsemble’s capability to execute and monitor external scripts or user applications within 
simulation or generator functions using the executor. In this tutorial, our calling script registers an external C executable that simulates electrostatic forces between a collection of particles. The ``sim_f`` routine then launches and polls this executable.\n", + "\n", + "It is possible to use ``subprocess`` calls from Python to issue commands such as ``jsrun`` or ``aprun`` to run applications. Unfortunately, hard-coding such commands within user scripts isn’t portable. Furthermore, many systems like Argonne’s Theta do not allow libEnsemble to submit additional tasks from the compute nodes. On these systems a proxy launch mechanism (such as Balsam) is required. libEnsemble’s Executor was developed to directly address such issues.\n", + "\n", + "Getting Started\n", + "------------------\n", + "\n", + "**An MPI distribution and ``mpi4py`` are required to use this notebook locally**.\n", + "\n", + "A simulation source file ``forces.c`` is available in the libEnsemble repository for compiling into an application ``forces.x``. This app will be registered by the Executor for launching by libEnsemble's workers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "import requests\n", + "\n", + "url = 'https://raw.githubusercontent.com/Libensemble/libensemble/master/libensemble/tests/scaling_tests/forces/forces.c'\n", + "forces = requests.get(url)\n", + "open('./forces.c', 'wb').write(forces.content)\n", + "\n", + "subprocess.run('mpicc -O3 -o forces.x forces.c -lm'.split())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Simulation Function\n", + "-----------------------\n", + "\n", + "__*Note: Several of these stand-alone Simulation Function code-cells may not execute properly since the calling script with all expected parameters isn't defined until later. 
The complete libEnsemble routine should still function as expected.*__\n", + "\n", + "Our ``sim_f`` is where we’ll use libEnsemble’s executor to configure and submit for execution our compiled simulation code. We will poll this task’s state while it runs, and once we’ve detected it has finished we will send any results or exit statuses back to the manager.\n", + "\n", + "Create a Python file named ``tutorial_forces_simf.py`` containing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "import numpy as np\n", + "\n", + "from libensemble.executors.executor import Executor\n", + "from libensemble.message_numbers import WORKER_DONE, WORKER_KILL, TASK_FAILED\n", + "\n", + "MAX_SEED = 32767\n", + "\n", + "def perturb(particles, seed, max_fraction):\n", + " \"\"\"Modify particle count\"\"\"\n", + " seed_fraction = seed/MAX_SEED\n", + " max_delta = particles * max_fraction\n", + " delta = seed_fraction * max_delta\n", + " delta = delta - max_delta/2 # translate so -/+\n", + " new_particles = particles + delta\n", + " return int(new_particles)\n", + "\n", + "def read_last_line(filepath):\n", + " \"\"\"Read last line of statfile\"\"\"\n", + " try:\n", + " with open(filepath, 'rb') as fh:\n", + " line = fh.readlines()[-1].decode().rstrip()\n", + " except Exception:\n", + " line = \"\" # In case file is empty or not yet created\n", + " return line" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use libEnsemble’s message number tags to communicate the worker’s status to the manager. For testing purposes, the ``perturb()`` function randomizes the resources used for each calculation. The second function parses forces values and statuses in the ``.stat`` file produced by our compiled code. Now we can write the actual ``sim_f``. 
We’ll first write the function definition, extract our parameters from ``sim_specs``, define a random seed, and use ``perturb()`` to randomize our particle counts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_forces(H, persis_info, sim_specs, libE_info):\n", + " calc_status = 0\n", + "\n", + " x = H['x']\n", + " sim_particles = sim_specs['user']['sim_particles']\n", + " sim_timesteps = sim_specs['user']['sim_timesteps']\n", + " time_limit = sim_specs['user']['sim_kill_minutes'] * 60.0\n", + "\n", + " cores = sim_specs['user'].get('cores', None)\n", + " kill_rate = sim_specs['user'].get('kill_rate', 0)\n", + " particle_variance = sim_specs['user'].get('particle_variance', 0)\n", + "\n", + " seed = int(np.rint(x[0][0]))\n", + "\n", + " # To give a random variance of work-load\n", + " sim_particles = perturb(sim_particles, seed, particle_variance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we will instantiate our executor and submit our registered application for execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " # Use pre-defined executor object\n", + " exctr = Executor.executor\n", + "\n", + " # Arguments for our registered simulation\n", + " args = str(int(sim_particles)) + ' ' + str(sim_timesteps) + ' ' + str(seed) + ' ' + str(kill_rate)\n", + "\n", + " # Submit our simulation for execution.\n", + " if cores:\n", + " task = exctr.submit(calc_type='sim', num_procs=cores, app_args=args,\n", + " stdout='out.txt', stderr='err.txt', wait_on_run=True)\n", + " else:\n", + " task = exctr.submit(calc_type='sim', app_args=args, stdout='out.txt',\n", + " stderr='err.txt', wait_on_run=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In each executor ``submit()`` routine, we define the type of calculation being performed, optionally the number of processors to run the task on, additional arguments for the simulation code, and files for ``stdout`` and ``stderr`` output. The ``wait_on_run`` argument pauses sim_f execution until the task is confirmed to be running. 
See the docs for more information about these and other options.\n", + "\n", + "The rest of our ``sim_f`` polls the task’s dynamically updated attributes for its status, determines if a successful run occurred after the task completes, then formats and returns the output data to the manager.\n", + "\n", + "We can poll the task and kill it in certain circumstances:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " # Stat file to check for bad runs\n", + " statfile = 'forces.stat'\n", + " filepath = os.path.join(task.workdir, statfile)\n", + " line = None\n", + "\n", + " poll_interval = 1\n", + " while not task.finished :\n", + " line = read_last_line(filepath) # Parse some output from the task\n", + " if line == \"kill\":\n", + " task.kill()\n", + " elif task.runtime > time_limit:\n", + " task.kill()\n", + " else:\n", + " time.sleep(poll_interval)\n", + " task.poll() # updates the task's attributes\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once our task finishes, adjust ``calc_status`` (our “exit code”) and report to the user based on the task’s final state:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " if task.finished:\n", + " if task.state == 'FINISHED':\n", + " print(\"Task {} completed\".format(task.name))\n", + " calc_status = WORKER_DONE\n", + " if read_last_line(filepath) == \"kill\":\n", + " print(\"Warning: Task complete but marked bad (kill flag in forces.stat)\")\n", + " elif task.state == 'FAILED':\n", + " print(\"Warning: Task {} failed: Error code {}\".format(task.name, task.errcode))\n", + " calc_status = TASK_FAILED\n", + " elif task.state == 'USER_KILLED':\n", + " print(\"Warning: Task {} has been killed\".format(task.name))\n", + " calc_status = WORKER_KILL\n", + " else:\n", + " print(\"Warning: Task {} in unknown state {}. 
Error code {}\".format(task.name, task.state, task.errcode))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load output data from our task and return to the libEnsemble manager:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " time.sleep(0.2) # Small buffer to guarantee data has been written\n", + " try:\n", + " data = np.loadtxt(filepath)\n", + " final_energy = data[-1]\n", + " except Exception:\n", + " final_energy = np.nan\n", + "\n", + " outspecs = sim_specs['out']\n", + " output = np.zeros(1, dtype=outspecs)\n", + " output['energy'][0] = final_energy\n", + "\n", + " return output, persis_info, calc_status\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calling Script\n", + "----------------\n", + "\n", + "Finally, let’s write our calling script to parameterize our simulation and generation functions and call libEnsemble. Create a Python file containing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python\n", + "import os\n", + "import numpy as np\n", + "from tutorial_forces_simf import run_forces # Sim func from current dir\n", + "\n", + "from libensemble.libE import libE\n", + "from libensemble.gen_funcs.sampling import uniform_random_sample\n", + "from libensemble.tools import parse_args, add_unique_random_streams\n", + "from libensemble.executors.mpi_executor import MPIExecutor\n", + "\n", + "nworkers = 4\n", + "libE_specs = {'nworkers': nworkers, 'comms': 'local'}\n", + "\n", + "# Create executor and register sim to it\n", + "exctr = MPIExecutor() # Use auto_resources=False to oversubscribe\n", + "\n", + "# Create empty simulation input directory\n", + "if not os.path.isdir('./sim'):\n", + " os.mkdir('./sim')\n", + "\n", + "# Register simulation executable with executor\n", + "sim_app = os.path.join(os.getcwd(), 'forces.x')\n", + 
"exctr.register_calc(full_path=sim_app, calc_type='sim')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On line 4 we import our not-yet-written ``sim_f``. We also import necessary libEnsemble components and a convenience function to initiate ``persis_info``.\n", + "\n", + "Next we define our executor class instance. This instance can be customized with many of the settings defined here. We’ll register our simulation with the executor and use the same instance within our ``sim_f``.\n", + "\n", + "libEnsemble can perform and write every simulation (within the ensemble) in a separate directory for organization and potential I/O benefits. In this example, libEnsemble copies a source directory and its contents to create these simulation directories. For our purposes, an empty directory ``./sim`` is sufficient.\n", + "\n", + "Next define the ``sim_specs`` and ``gen_specs`` data structures:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# State the sim_f, its arguments, output, and parameters (and their sizes)\n", + "sim_specs = {'sim_f': run_forces, # sim_f, imported above\n", + " 'in': ['x'], # Name of input for sim_f\n", + " 'out': [('energy', float)], # Name, type of output from sim_f\n", + " 'user': {'simdir_basename': 'forces', # User parameters for sim_f\n", + " 'cores': 1,\n", + " 'sim_particles': 1e3,\n", + " 'sim_timesteps': 5,\n", + " 'sim_kill_minutes': 10.0,\n", + " 'particle_variance': 0.2,\n", + " 'kill_rate': 0.5}\n", + " }\n", + "\n", + "# State the gen_f, its arguments, output, and necessary parameters.\n", + "gen_specs = {'gen_f': uniform_random_sample, # Generator function\n", + " 'in': ['sim_id'], # Generator input\n", + " 'out': [('x', float, (1,))], # Name, type and size of data from gen_f\n", + " 'user': {'lb': np.array([0]), # User parameters for gen_f\n", + " 'ub': np.array([32767]),\n", + " 'gen_batch_size': 1000,\n", + " 'batch_mode': True,\n", + " 
'num_active_gens': 1,\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These dictionaries configure our generation function ``gen_f`` and our simulation function ``sim_f``, respectively, as the ``uniform_random_sample`` and ``run_forces`` functions. Our ``gen_f`` will generate random seeds when initializing each ``sim_f`` call.\n", + "\n", + "After some additions to ``libE_specs`` and defining our ``exit_criteria`` and ``persis_info``, our script calls the main libE routine:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "libE_specs['save_every_k_gens'] = 1000 # Save every K steps\n", + "libE_specs['sim_input_dir'] = './sim' # Sim dir to be copied for each worker\n", + "\n", + "exit_criteria = {'sim_max': 8}\n", + "\n", + "persis_info = add_unique_random_streams({}, nworkers + 1)\n", + "\n", + "H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria,\n", + " persis_info=persis_info, libE_specs=libE_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This may take about a minute to complete. Output should appear in a new\n", + "directory ``./ensemble``, with sub-directories labeled by ``sim_id`` and worker.\n", + "\n", + "The following lines parse and display some output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "for dir in os.listdir('./ensemble'):\n", + " with open(os.path.join('./ensemble', dir, 'out.txt')) as f:\n", + " out = f.readlines()\n", + " print(dir + ':')\n", + " for line in out:\n", + " print(line)\n", + " print('-'*60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/images/basic_6hc.png b/examples/tutorials/images/basic_6hc.png new file mode 120000 index 000000000..760f245db --- /dev/null +++ b/examples/tutorials/images/basic_6hc.png @@ -0,0 +1 @@ +../../../docs/images/basic_6hc.png \ No newline at end of file diff --git a/examples/tutorials/images/localopt_6hc.png b/examples/tutorials/images/localopt_6hc.png new file mode 120000 index 000000000..ad6a8415a --- /dev/null +++ b/examples/tutorials/images/localopt_6hc.png @@ -0,0 +1 @@ +../../../docs/images/localopt_6hc.png \ No newline at end of file diff --git a/examples/tutorials/images/sampling_6hc.png b/examples/tutorials/images/sampling_6hc.png new file mode 120000 index 000000000..ceeb5c24e --- /dev/null +++ b/examples/tutorials/images/sampling_6hc.png @@ -0,0 +1 @@ +../../../docs/images/sampling_6hc.png \ No newline at end of file diff --git a/examples/tutorials/sine_tutorial_notebook.ipynb b/examples/tutorials/sine_tutorial_notebook.ipynb index 645087e88..a9ac5ac52 100644 --- a/examples/tutorials/sine_tutorial_notebook.ipynb +++ 
b/examples/tutorials/sine_tutorial_notebook.ipynb @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -105,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/tutorials/tutorial_aposmm.py b/examples/tutorials/tutorial_aposmm.py new file mode 100644 index 000000000..93c59ebbd --- /dev/null +++ b/examples/tutorials/tutorial_aposmm.py @@ -0,0 +1,45 @@ +import numpy as np + +from tutorial_six_hump_camel import six_hump_camel + +from libensemble.libE import libE +from libensemble.gen_funcs.persistent_aposmm import aposmm +from libensemble.alloc_funcs.persistent_aposmm_alloc import persistent_aposmm_alloc +from libensemble.tools import parse_args, add_unique_random_streams + +import libensemble.gen_funcs +libensemble.gen_funcs.rc.aposmm_optimizers = 'scipy' + +nworkers, is_master, libE_specs, _ = parse_args() + +sim_specs = {'sim_f': six_hump_camel, # Simulation function + 'in': ['x'], # Accepts 'x' values + 'out': [('f', float)]} # Returns f(x) values + +gen_out = [('x', float, 2), # Produces 'x' values + ('x_on_cube', float, 2), # 'x' values scaled to unit cube + ('sim_id', int), # Produces IDs for sim order + ('local_min', bool), # Is a point a local minimum? + ('local_pt', bool)] # Is a point from a local opt run? 
+ +gen_specs = {'gen_f': aposmm, # APOSMM generator function + 'in': [], + 'out': gen_out, # Output defined like above dict + 'user': {'initial_sample_size': 100, # Random sample 100 points to start + 'localopt_method': 'scipy_Nelder-Mead', + 'opt_return_codes': [0], # Return code specific to localopt_method + 'max_active_runs': 6, # Occur in parallel + 'lb': np.array([-2, -1]), # Lower bound of search domain + 'ub': np.array([2, 1])} # Upper bound of search domain + } + +alloc_specs = {'alloc_f': persistent_aposmm_alloc, + 'out': [('given_back', bool)], 'user': {}} + +exit_criteria = {'sim_max': 2000} +persis_info = add_unique_random_streams({}, nworkers + 1) + +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, + alloc_specs, libE_specs) +if is_master: + print('Minima:', H[np.where(H['local_min'])]['x']) diff --git a/examples/tutorials/tutorial_six_hump_camel.py b/examples/tutorials/tutorial_six_hump_camel.py new file mode 100644 index 000000000..96b9d8c81 --- /dev/null +++ b/examples/tutorials/tutorial_six_hump_camel.py @@ -0,0 +1,24 @@ +import numpy as np + + +def six_hump_camel(H, persis_info, sim_specs, _): + """Six-Hump Camel sim_f.""" + + batch = len(H['x']) # Num evaluations each sim_f call. 
+ H_o = np.zeros(batch, dtype=sim_specs['out']) # Define output array H + + for i, x in enumerate(H['x']): + H_o['f'][i] = six_hump_camel_func(x) # Function evaluations placed into H + + return H_o, persis_info + + +def six_hump_camel_func(x): + """ Six-Hump Camel function definition """ + x1 = x[0] + x2 = x[1] + term1 = (4-2.1*x1**2+(x1**4)/3) * x1**2 + term2 = x1*x2 + term3 = (-4+4*x2**2) * x2**2 + + return term1 + term2 + term3 diff --git a/install/run_travis_locally/build_mpich_libE.sh b/install/run_travis_locally/build_mpich_libE.sh index 27f8a6679..2550e3188 100755 --- a/install/run_travis_locally/build_mpich_libE.sh +++ b/install/run_travis_locally/build_mpich_libE.sh @@ -84,12 +84,16 @@ pip install coveralls || return git clone -b $LIBE_BRANCH https://github.com/Libensemble/libensemble.git || return cd libensemble/ || return pip install -e . || return + +wget https://github.com/balsam-alcf/balsam/archive/0.3.8.tar.gz +mkdir ../balsam; tar xf 0.3.8.tar.gz -C ../balsam python install/configure_balsam_install.py export BALSAM_DB_PATH=~/test-balsam ulimit -Sn 10000 if [ "$RUN_TESTS" = true ]; then ./libensemble/tests/run-tests.sh -z +fi echo -e "\n\nScript completed...\n\n" set +ex diff --git a/install/run_travis_locally/quick_run.md b/install/run_travis_locally/quick_run.md new file mode 100644 index 000000000..bb935b033 --- /dev/null +++ b/install/run_travis_locally/quick_run.md @@ -0,0 +1,41 @@ +# Quick run + +If you have followed the ``readme`` to obtain ``docker`` and set up a container, this gives a quick example of running it. + +Note: In this example the tests are not automatically run (-i option to build_mpich_libE.sh). + +For details see the ``readme``. + +Two windows: +Window 1: In install/run_travis_locally directory. +Window 2: Will create and run container. + +Windows 1 and 2 - name container. 
E.g: + + export CONTAINER=travis-debug-2020-07-20-py3.5 + +Window 2: + + sudo docker run --name $CONTAINER -dit travisci/ci-garnet:packer-1512502276-986baf0 /sbin/init + sudo docker exec -it $CONTAINER bash -l + +Window 1: + + docker cp build_mpich_libE.sh $CONTAINER:/home/travis + +Window 1 Optional - user scripts to help navigate: + + docker cp ~/.bashrc $CONTAINER:/home/travis + docker cp ~/.alias $CONTAINER:/home/travis + +Window 2 (Example: Do not run tests python 3.5 - git branch feature/register_apps): + + chown travis:travis /home/travis/build_mpich_libE.sh + su - travis + . ./build_mpich_libE.sh -p 3.5 -b feature/register_apps -i + +Window 2 Optional - user scripts to help navigate: + + . ~/.bashrc + +Note: libEnsemble will be git cloned and checked out at the given branch. diff --git a/install/test_balsam_hworld.py b/install/test_balsam_hworld.py index 79f1bfbac..d927f547f 100644 --- a/install/test_balsam_hworld.py +++ b/install/test_balsam_hworld.py @@ -20,46 +20,66 @@ def run_Balsam_job(): def wait_for_job_dir(basedb): sleeptime = 0 + limit = 15 - while not os.path.isdir(basedb) and sleeptime < 15: + # Stop sleeping once database directory detected + print('Waiting for Balsam Database directory.') + while sleeptime < limit: + if os.path.isdir(basedb): + break time.sleep(1) sleeptime += 1 + assert sleeptime < limit, \ + "Balsam Database directory not created within {} seconds.".format(limit) + + # Stop sleeping once job directory detected within database directory print('Waiting for Job Directory {}'.format(sleeptime)) - while len(os.listdir(basedb)) == 0 and sleeptime < 15: + while sleeptime < limit: + if len(os.listdir(basedb)) > 0: + break print(sleeptime, end=" ", flush=True) time.sleep(1) sleeptime += 1 - jobdirname = os.listdir(basedb)[0] - jobdir = os.path.join(basedb, jobdirname) + assert sleeptime < limit, \ + "Balsam Job directory not created within {} seconds.".format(limit) + + # Assumes database dir was empty, now contains single job dir + 
jobdir = os.path.join(basedb, os.listdir(basedb)[0]) return jobdir def wait_for_job_output(jobdir): sleeptime = 0 + limit = 40 output = os.path.join(jobdir, 'job_script_test_balsam_hworld.out') print('Checking for Balsam output file: {}'.format(output)) - while not os.path.isfile(output) and sleeptime < 40: + while sleeptime < limit: + if os.path.isfile(output): + break print(sleeptime, end=" ", flush=True) time.sleep(1) sleeptime += 1 + assert sleeptime < limit, \ + "Balsam output file not created within {} seconds.".format(limit) + return output def print_job_output(outscript): sleeptime = 0 + limit = 80 - print('Output file found. Waiting for complete Balsam Job Output.') - lastlines = ['Job 4 done on worker 1\n', 'Job 4 done on worker 2\n', - 'Run completed.\n'] + print('Blank output file found. Waiting for expected complete Balsam Job Output.') + succeed_line = 'Received: [34 34 31 31 34 34 32 32 33 33 0 0]\n' lastposition = 0 - while sleeptime < 60: + while sleeptime < limit: with open(outscript, 'r') as f: f.seek(lastposition) new = f.read() @@ -70,12 +90,16 @@ def print_job_output(outscript): else: print(sleeptime, end=" ", flush=True) - if any(new.endswith(line) for line in lastlines): + if succeed_line in new: + print('Success. Received task statuses match expected.') break time.sleep(1) sleeptime += 1 + assert sleeptime < limit, \ + "Expected Balsam Job output-file contents not detected after {} seconds.".format(limit) + def move_job_coverage(jobdir): # Move coverage files from Balsam DB to ./regression_tests (for concatenation) diff --git a/libensemble/__init__.py b/libensemble/__init__.py index c964ccc63..7740392a5 100644 --- a/libensemble/__init__.py +++ b/libensemble/__init__.py @@ -4,7 +4,7 @@ Library to coordinate the concurrent evaluation of dynamic ensembles of calculations. """ -__version__ = "0.7.0" +__version__ = "0.7.1" __author__ = 'Jeffrey Larson, Stephen Hudson, Stefan M. 
Wild, David Bindel and John-Luke Navarro' __credits__ = 'Argonne National Laboratory' diff --git a/libensemble/alloc_funcs/persistent_aposmm_alloc.py b/libensemble/alloc_funcs/persistent_aposmm_alloc.py index c93dde521..b985c1725 100644 --- a/libensemble/alloc_funcs/persistent_aposmm_alloc.py +++ b/libensemble/alloc_funcs/persistent_aposmm_alloc.py @@ -1,6 +1,6 @@ import numpy as np -from libensemble.tools.alloc_support import avail_worker_ids, sim_work, gen_work +from libensemble.tools.alloc_support import avail_worker_ids, sim_work, gen_work, count_persis_gens def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info): @@ -18,6 +18,8 @@ def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info """ Work = {} + gen_count = count_persis_gens(W) + if persis_info.get('first_call', True): assert np.all(H['given']), "Initial points in H have never been given." assert np.all(H['given_back']), "Initial points in H have never been given_back." @@ -33,6 +35,9 @@ def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info persis_info['samples_in_H0'] = sum(H['local_pt'] == 0) persis_info['next_to_give'] = len(H) # persis_info['first_call'] = False + elif gen_count == 0: + # The one persistent gen is done. Exiting + return Work, persis_info, 1 # If any persistent worker's calculated values have returned, give them back. for i in avail_worker_ids(W, persistent=True): diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index 4329a9cd0..5de788a8c 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -17,7 +17,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info): Work = {} gen_count = count_persis_gens(W) - if len(H) and gen_count == 0: + if persis_info.get('gen_started') and gen_count == 0: # The one persistent worker is done. 
Exiting return Work, persis_info, 1 @@ -45,7 +45,8 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info): elif gen_count == 0: # Finally, call a persistent generator as there is nothing else to do. gen_count += 1 - gen_work(Work, i, gen_specs['in'], [], persis_info[i], + gen_work(Work, i, gen_specs['in'], range(len(H)), persis_info[i], persistent=True) + persis_info['gen_started'] = True return Work, persis_info, 0 diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index 7a41b2df2..a16a765d4 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -20,7 +20,7 @@ from libensemble.resources.mpi_resources import MPIResources from libensemble.executors.executor import \ - Task, ExecutorException, jassert, STATES + Application, Task, ExecutorException, TimeoutExpired, jassert, STATES from libensemble.executors.mpi_executor import MPIExecutor import balsam.launcher.dag as dag @@ -85,23 +85,17 @@ def calc_task_timing(self): if self.total_time is None: self.total_time = time.time() - self.submit_time - def poll(self): - """Polls and updates the status attributes of the supplied task""" - if not self.check_poll(): - return - - # Get current state of tasks from Balsam database - self.process.refresh_from_db() - balsam_state = self.process.state - self.runtime = self._get_time_since_balsam_submit() - - if balsam_state in models.END_STATES: - self.finished = True - self.calc_task_timing() + def _set_complete(self, dry_run=False): + """Set task as complete""" + self.finished = True + if dry_run: + self.success = True + self.state = 'FINISHED' + else: + balsam_state = self.process.state self.workdir = self.workdir or self.process.working_directory + self.calc_task_timing() self.success = (balsam_state == 'JOB_FINISHED') - # self.errcode - requested feature from Balsam devs - if balsam_state == 'JOB_FINISHED': self.state = 'FINISHED' elif balsam_state == 
'PARENT_KILLED': # Not currently used @@ -116,6 +110,22 @@ def poll(self): logger.info("Task {} ended with state {}". format(self.name, self.state)) + def poll(self): + """Polls and updates the status attributes of the supplied task""" + if self.dry_run: + return + + if not self._check_poll(): + return + + # Get current state of tasks from Balsam database + self.process.refresh_from_db() + balsam_state = self.process.state + self.runtime = self._get_time_since_balsam_submit() + + if balsam_state in models.END_STATES: + self._set_complete() + elif balsam_state in models.ACTIVE_STATES: self.state = 'RUNNING' self.workdir = self.workdir or self.process.working_directory @@ -129,6 +139,36 @@ def poll(self): "Task state returned from Balsam is not in known list of " "Balsam states. Task state is {}".format(balsam_state)) + def wait(self, timeout=None): + """Waits on completion of the task or raises TimeoutExpired exception + + Status attributes of task are updated on completion. + + Parameters + ---------- + + timeout: + Time in seconds after which a TimeoutExpired exception is raised""" + + if self.dry_run: + return + + if not self._check_poll(): + return + + # Wait on the task + start = time.time() + self.process.refresh_from_db() + while self.process.state not in models.END_STATES: + time.sleep(0.2) + self.process.refresh_from_db() + if timeout and time.time() - start > timeout: + self.runtime = self._get_time_since_balsam_submit() + raise TimeoutExpired(self.name, timeout) + + self.runtime = self._get_time_since_balsam_submit() + self._set_complete() + def kill(self, wait_time=None): """ Kills or cancels the supplied task """ @@ -152,6 +192,7 @@ class BalsamMPIExecutor(MPIExecutor): def __init__(self, auto_resources=True, allow_oversubscribe=True, central_mode=True, + zero_resource_workers=[], nodelist_env_slurm=None, nodelist_env_cobalt=None, nodelist_env_lsf=None, @@ -173,6 +214,7 @@ def __init__(self, auto_resources=True, super().__init__(auto_resources, 
allow_oversubscribe, central_mode, + zero_resource_workers, nodelist_env_slurm, nodelist_env_cobalt, nodelist_env_lsf, @@ -185,20 +227,19 @@ def _serial_setup(self): BalsamMPIExecutor.del_apps() BalsamMPIExecutor.del_tasks() - for calc_type in self.default_apps: - if self.default_apps[calc_type] is not None: - calc_name = self.default_apps[calc_type].name - desc = self.default_apps[calc_type].desc - full_path = self.default_apps[calc_type].full_path - self.add_app(calc_name, full_path, desc) + for app in self.apps.values(): + calc_name = app.gname + desc = app.desc + full_path = app.full_path + self.add_app(calc_name, full_path, desc) @staticmethod def del_apps(): - """Deletes all Balsam apps whose names contains .simfunc or .genfunc""" + """Deletes all Balsam apps in the libe_app namespace""" AppDef = models.ApplicationDefinition # Some error handling on deletes.... is it internal - for app_type in ['.simfunc', '.genfunc']: + for app_type in [Application.prefix]: deletion_objs = AppDef.objects.filter(name__contains=app_type) if deletion_objs: for del_app in deletion_objs.iterator(): @@ -207,8 +248,8 @@ def del_apps(): @staticmethod def del_tasks(): - """Deletes all Balsam tasks whose names contains .simfunc or .genfunc""" - for app_type in ['.simfunc', '.genfunc']: + """Deletes all Balsam tasks """ + for app_type in [Task.prefix]: deletion_objs = models.BalsamJob.objects.filter( name__contains=app_type) if deletion_objs: @@ -216,12 +257,6 @@ def del_tasks(): logger.debug("Deleting task {}".format(del_task.name)) deletion_objs.delete() - # May be able to use union function - to combine - see queryset help. 
- # Eg (not tested) - # del_simfuncs = Task.objects.filter(name__contains='.simfunc') - # del_genfuncs = Task.objects.filter(name__contains='.genfunc') - # deletion_objs = deletion_objs.union() - @staticmethod def add_app(name, exepath, desc): """ Add application to Balsam database """ @@ -235,9 +270,9 @@ def add_app(name, exepath, desc): app.save() logger.debug("Added App {}".format(app.name)) - def submit(self, calc_type, num_procs=None, num_nodes=None, - ranks_per_node=None, machinefile=None, app_args=None, - stdout=None, stderr=None, stage_inout=None, + def submit(self, calc_type=None, app_name=None, num_procs=None, + num_nodes=None, ranks_per_node=None, machinefile=None, + app_args=None, stdout=None, stderr=None, stage_inout=None, hyperthreads=False, dry_run=False, wait_on_run=False, extra_args=None): """Creates a new task, and either executes or schedules to execute @@ -245,7 +280,13 @@ def submit(self, calc_type, num_procs=None, num_nodes=None, The created task object is returned. 
""" - app = self.default_app(calc_type) + + if app_name is not None: + app = self.get_app(app_name) + elif calc_type is not None: + app = self.default_app(calc_type) + else: + raise ExecutorException("Either app_name or calc_type must be set") # Specific to this class if machinefile is not None: @@ -280,7 +321,7 @@ def submit(self, calc_type, num_procs=None, num_nodes=None, add_task_args = {'name': task.name, 'workflow': self.workflow_name, 'user_workdir': default_workdir, - 'application': app.name, + 'application': app.gname, 'args': task.app_args, 'num_nodes': num_nodes, 'ranks_per_node': ranks_per_node, @@ -295,7 +336,7 @@ def submit(self, calc_type, num_procs=None, num_nodes=None, if dry_run: task.dry_run = True logger.info('Test (No submit) Runline: {}'.format(' '.join(add_task_args))) - task.set_as_complete() + task._set_complete(dry_run=True) else: task.process = dag.add_job(**add_task_args) diff --git a/libensemble/executors/executor.py b/libensemble/executors/executor.py index 02ca62ad4..3ad5adb9c 100644 --- a/libensemble/executors/executor.py +++ b/libensemble/executors/executor.py @@ -49,6 +49,16 @@ class ExecutorException(Exception): "Raised for any exception in the Executor" +class TimeoutExpired(Exception): + """Timeout exception raised when Timeout expires""" + def __init__(self, task, timeout): + self.task = task + self.timeout = timeout + + def __str__(self): + return ("Task {} timed out after {} seconds".format(self.task, self.timeout)) + + def jassert(test, *args): "Version of assert that raises a ExecutorException" if not test: @@ -59,7 +69,9 @@ class Application: """An application is an executable user-program (e.g., implementing a sim/gen function).""" - def __init__(self, full_path, calc_type='sim', desc=None): + prefix = 'libe_app' + + def __init__(self, full_path, name=None, calc_type='sim', desc=None): """Instantiates a new Application instance.""" self.full_path = full_path self.calc_type = calc_type @@ -67,10 +79,9 @@ def __init__(self, 
full_path, calc_type='sim', desc=None): if self.exe.endswith('.py'): self.full_path = ' '.join((sys.executable, full_path)) - - # Use this name to delete tasks in database - see del_apps(), del_tasks() - self.name = self.exe + '.' + self.calc_type + 'func' - self.desc = desc or (self.exe + ' ' + self.calc_type + ' function') + self.name = name or self.exe + self.desc = desc or (self.exe + ' app') + self.gname = '_'.join([Application.prefix, self.name]) class Task: @@ -79,6 +90,7 @@ class Task: """ + prefix = 'libe_task' newid = itertools.count() def __init__(self, app=None, app_args=None, workdir=None, @@ -104,7 +116,7 @@ def __init__(self, app=None, app_args=None, workdir=None, format(self.id)) worker_name = "_worker{}".format(self.workerID) if self.workerID else "" - self.name = "task_{}{}_{}".format(app.name, worker_name, self.id) + self.name = Task.prefix + "_{}{}_{}".format(app.name, worker_name, self.id) self.stdout = stdout or self.name + '.out' self.stderr = stderr or self.name + '.err' self.workdir = workdir @@ -123,11 +135,6 @@ def reset(self): self.runtime = 0 # Time since task started to latest poll (or finished). self.total_time = None # Time from task submission until polled as finished. - def set_as_complete(self): - self.finished = True - self.success = True - self.state = 'FINISHED' - def workdir_exists(self): """Returns true if the task's workdir exists""" return self.workdir and os.path.exists(self.workdir) @@ -174,24 +181,38 @@ def calc_task_timing(self): self.runtime = self.timer.elapsed self.total_time = self.runtime # For direct launched tasks - def check_poll(self): + def _check_poll(self): """Check whether polling this task makes sense.""" jassert(self.process is not None, "Polled task {} has no process ID - check tasks been launched". format(self.name)) if self.finished: - logger.warning("Polled task {} has already finished. " - "Not re-polling. Status is {}". 
- format(self.name, self.state)) + logger.debug("Polled task {} has already finished. " + "Not re-polling. Status is {}". + format(self.name, self.state)) return False return True + def _set_complete(self, dry_run=False): + """Set task as complete""" + self.finished = True + if dry_run: + self.success = True + self.state = 'FINISHED' + else: + self.calc_task_timing() + self.errcode = self.process.returncode + self.success = (self.errcode == 0) + self.state = 'FINISHED' if self.success else 'FAILED' + logger.info("Task {} finished with errcode {} ({})". + format(self.name, self.errcode, self.state)) + def poll(self): """Polls and updates the status attributes of the task""" if self.dry_run: return - if not self.check_poll(): + if not self._check_poll(): return # Poll the task @@ -201,15 +222,31 @@ def poll(self): self.runtime = self.timer.elapsed return - self.finished = True - self.calc_task_timing() + self._set_complete() - # Want to be more fine-grained about non-success (fail vs user kill?) - self.errcode = self.process.returncode - self.success = (self.errcode == 0) - self.state = 'FINISHED' if self.success else 'FAILED' - logger.info("Task {} finished with errcode {} ({})". - format(self.name, self.errcode, self.state)) + def wait(self, timeout=None): + """Waits on completion of the task or raises TimeoutExpired exception + + Status attributes of task are updated on completion. 
+ + Parameters + ---------- + + timeout: + Time in seconds after which a TimeoutExpired exception is raised""" + + if self.dry_run: + return + + if not self._check_poll(): + return + + # Wait on the task + rc = launcher.wait(self.process, timeout) + if rc is None: + raise TimeoutExpired(self.name, timeout) + + self._set_complete() def kill(self, wait_time=60): """Kills or cancels the supplied task @@ -295,6 +332,7 @@ def __init__(self): self.top_level_dir = os.getcwd() self.manager_signal = 'none' self.default_apps = {'sim': None, 'gen': None} + self.apps = {} self.wait_time = 60 self.list_of_tasks = [] @@ -314,36 +352,53 @@ def gen_default_app(self): """Returns the default generator app""" return self.default_apps['gen'] + def get_app(self, app_name): + """Gets the app for a given app_name or raise exception""" + try: + app = self.apps[app_name] + except KeyError: + app_keys = list(self.apps.keys()) + raise ExecutorException("Application {} not found in registry".format(app_name), + "Registered applications: {}".format(app_keys)) + return app + def default_app(self, calc_type): - "Gets the default app for a given calc type." + """Gets the default app for a given calc type""" app = self.default_apps.get(calc_type) jassert(calc_type in ['sim', 'gen'], "Unrecognized calculation type", calc_type) jassert(app, "Default {} app is not set".format(calc_type)) return app - def register_calc(self, full_path, calc_type='sim', desc=None): + def register_calc(self, full_path, app_name=None, calc_type=None, desc=None): """Registers a user application to libEnsemble Parameters ---------- + app_name: String + Name to identify this application. + full_path: String The full path of the user application to be registered calc_type: String - Calculation type: Is this application part of a 'sim' - or 'gen' function + Calculation type: Set this application as the default 'sim' + or 'gen' function. 
desc: String, optional Description of this application """ - jassert(calc_type in self.default_apps, - "Unrecognized calculation type", calc_type) - jassert(self.default_apps[calc_type] is None, - "Default {} app already set".format(calc_type)) - self.default_apps[calc_type] = Application(full_path, calc_type, desc) + if not app_name: + app_name = os.path.split(full_path)[1] + self.apps[app_name] = Application(full_path, app_name, calc_type, desc) + + # Default sim/gen apps will be deprecated. Just use names. + if calc_type is not None: + jassert(calc_type in self.default_apps, + "Unrecognized calculation type", calc_type) + self.default_apps[calc_type] = self.apps[app_name] def manager_poll(self, comm): """ Polls for a manager signal @@ -351,6 +406,7 @@ def manager_poll(self, comm): The executor manager_signal attribute will be updated. """ + self.manager_signal = 'none' # Reset # Check for messages; disregard anything but a stop signal if not comm.mail_flag(): diff --git a/libensemble/executors/mpi_executor.py b/libensemble/executors/mpi_executor.py index f4b93796e..e3354c855 100644 --- a/libensemble/executors/mpi_executor.py +++ b/libensemble/executors/mpi_executor.py @@ -14,7 +14,7 @@ import libensemble.utils.launcher as launcher from libensemble.resources.mpi_resources import MPIResources -from libensemble.executors.executor import Executor, Task +from libensemble.executors.executor import Executor, Task, ExecutorException from libensemble.executors.mpi_runner import MPIRunner logger = logging.getLogger(__name__) @@ -29,6 +29,7 @@ class MPIExecutor(Executor): def __init__(self, auto_resources=True, allow_oversubscribe=True, central_mode=False, + zero_resource_workers=[], nodelist_env_slurm=None, nodelist_env_cobalt=None, nodelist_env_lsf=None, @@ -63,6 +64,9 @@ def __init__(self, auto_resources=True, grouped together and do not share nodes with applications. Distributed mode means workers share nodes with applications. 
+ zero_resource_workers: list of ints, optional + List of workers that require no resources. + nodelist_env_slurm: String, optional The environment variable giving a node list in Slurm format (Default: Uses SLURM_NODELIST). Note: This is queried only if @@ -118,6 +122,7 @@ def __init__(self, auto_resources=True, self.resources = \ MPIResources(top_level_dir=self.top_level_dir, central_mode=central_mode, + zero_resource_workers=zero_resource_workers, allow_oversubscribe=allow_oversubscribe, launcher=self.mpi_runner.run_command, cores_on_node=cores_on_node, @@ -175,9 +180,9 @@ def _launch_with_retries(self, task, runline, subgroup_launch, wait_on_run): else: break - def submit(self, calc_type, num_procs=None, num_nodes=None, - ranks_per_node=None, machinefile=None, app_args=None, - stdout=None, stderr=None, stage_inout=None, + def submit(self, calc_type=None, app_name=None, num_procs=None, + num_nodes=None, ranks_per_node=None, machinefile=None, + app_args=None, stdout=None, stderr=None, stage_inout=None, hyperthreads=False, dry_run=False, wait_on_run=False, extra_args=None): """Creates a new task, and either executes or schedules execution. @@ -187,8 +192,12 @@ def submit(self, calc_type, num_procs=None, num_nodes=None, Parameters ---------- - calc_type: String + calc_type: String, optional The calculation type: 'sim' or 'gen' + Only used if app_name is not supplied. Uses default sim or gen application. + + app_name: String, optional + The application name. Must be supplied if calc_type is not. num_procs: int, optional The total number of MPI tasks on which to submit the task @@ -247,7 +256,13 @@ def submit(self, calc_type, num_procs=None, num_nodes=None, then the available resources will be divided among workers. 
""" - app = self.default_app(calc_type) + if app_name is not None: + app = self.get_app(app_name) + elif calc_type is not None: + app = self.default_app(calc_type) + else: + raise ExecutorException("Either app_name or calc_type must be set") + default_workdir = os.getcwd() task = Task(app, app_args, default_workdir, stdout, stderr, self.workerID) @@ -274,7 +289,7 @@ def submit(self, calc_type, num_procs=None, num_nodes=None, if dry_run: task.dry_run = True logger.info('Test (No submit) Runline: {}'.format(' '.join(runline))) - task.set_as_complete() + task._set_complete(dry_run=True) else: # Launch Task self._launch_with_retries(task, runline, sglaunch, wait_on_run) diff --git a/libensemble/gen_funcs/aposmm_localopt_support.py b/libensemble/gen_funcs/aposmm_localopt_support.py index 2727edbb6..d277550dd 100644 --- a/libensemble/gen_funcs/aposmm_localopt_support.py +++ b/libensemble/gen_funcs/aposmm_localopt_support.py @@ -5,11 +5,12 @@ __all__ = ['LocalOptInterfacer', 'run_local_nlopt', 'run_local_tao', 'run_local_dfols', 'run_local_scipy_opt', 'run_external_localopt'] +import psutil import numpy as np from libensemble.message_numbers import STOP_TAG, EVAL_GEN_TAG # Only used to simulate receiving from manager from multiprocessing import Event, Process, Queue - import libensemble.gen_funcs + optimizer_list = ['petsc', 'nlopt', 'dfols', 'scipy', 'external'] optimizers = libensemble.gen_funcs.rc.aposmm_optimizers @@ -83,8 +84,8 @@ def __init__(self, user_specs, x0, f0, grad0=None): immediately after creating the class. """ - self.parent_can_read = Event() + self.parent_can_read = Event() self.comm_queue = Queue() self.child_can_read = Event() @@ -133,6 +134,7 @@ def iterate(self, data): :param grad: A numpy array of the function's gradient. :param fvec: A numpy array of the function's component values. 
""" + self.parent_can_read.clear() if 'grad' in data.dtype.names: @@ -149,29 +151,23 @@ def iterate(self, data): if isinstance(x_new, ErrorMsg): raise APOSMMException(x_new.x) elif isinstance(x_new, ConvergedMsg): - self.process.join() - self.comm_queue.close() - self.comm_queue.join_thread() - self.is_running = False + self.close() else: x_new = np.atleast_2d(x_new) return x_new - def destroy(self, previous_x): - - while not isinstance(previous_x, ConvergedMsg): - self.parent_can_read.clear() - if self.grad0 is None: - self.comm_queue.put((previous_x, 0*np.ones_like(self.f0),)) - else: - self.comm_queue.put((previous_x, 0*np.ones_like(self.f0), np.zeros_like(self.grad0))) - - self.child_can_read.set() - self.parent_can_read.wait() - - previous_x = self.comm_queue.get() - assert isinstance(previous_x, ConvergedMsg) + def destroy(self): + """Recursively kill any optimizer processes still running""" + if self.process.is_alive(): + process = psutil.Process(self.process.pid) + for child in process.children(recursive=True): + child.kill() + process.kill() + self.close() + + def close(self): + """Join process and close queue""" self.process.join() self.comm_queue.close() self.comm_queue.join_thread() @@ -230,13 +226,13 @@ def run_local_nlopt(user_specs, comm_queue, x0, f0, child_can_read, parent_can_r # https://nlopt.readthedocs.io/en/latest/NLopt_Reference/#return-values opt_flag = 1 elif return_val >= 5: - print("The run started from " + str(x0) + " reached it maximum number " + print("[APOSMM] The run started from " + str(x0) + " reached its maximum number " "of function evaluations: " + str(run_max_eval) + ". No point from " "this run will be ruled as a minimum! 
APOSMM may start a new run " "from some point in this run.") opt_flag = 0 else: - print("NLopt returned with a negative return value, which indicates an error") + print("[APOSMM] NLopt returned with a negative return value, which indicates an error") opt_flag = 0 if user_specs.get('periodic'): @@ -274,7 +270,7 @@ def run_local_scipy_opt(user_specs, comm_queue, x0, f0, child_can_read, parent_c if res['status'] in user_specs['opt_return_codes']: opt_flag = 1 else: - print("The SciPy localopt run started from " + str(x0) + " stopped" + print("[APOSMM] The SciPy localopt run started from " + str(x0) + " stopped" " without finding a local min.\nThe 'status' of the run is " + str(res['status']) + " and the message is: \"" + res['message'] + "\".\nNo point from this run will be ruled as a minimum! APOSMM may " @@ -367,7 +363,7 @@ def run_local_dfols(user_specs, comm_queue, x0, f0, child_can_read, parent_can_r if soln.flag == soln.EXIT_SUCCESS: opt_flag = 1 else: - print("The DFO-LS run started from " + str(x0) + " stopped with an exit " + print("[APOSMM] The DFO-LS run started from " + str(x0) + " stopped with an exit " "flag of " + str(soln.flag) + ". No point from this run will be " "ruled as a minimum! APOSMM may start a new run from some point " "in this run.") @@ -446,7 +442,7 @@ def run_local_tao(user_specs, comm_queue, x0, f0, child_can_read, parent_can_rea opt_flag = 1 else: # https://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/Tao/TaoGetConvergedReason.html - print("The run started from " + str(x0) + " exited with a nonpositive reason. No point from " + print("[APOSMM] The run started from " + str(x0) + " exited with a nonpositive reason. No point from " "this run will be ruled as a minimum! 
APOSMM may start a new run from some point in this run.") opt_flag = 0 @@ -524,7 +520,7 @@ def tao_callback_fun_grad(tao, x, g, comm_queue, child_can_read, parent_can_read def finish_queue(x_opt, opt_flag, comm_queue, parent_can_read, user_specs): if user_specs.get('print') and opt_flag: - print('Local optimum on the [0,1]^n domain', x_opt, flush=True) + print('[APOSMM] Local optimum on the [0,1]^n domain', x_opt, flush=True) comm_queue.put(ConvergedMsg(x_opt, opt_flag)) parent_can_read.set() diff --git a/libensemble/gen_funcs/persistent_aposmm.py b/libensemble/gen_funcs/persistent_aposmm.py index 9f44c439e..510bb5909 100644 --- a/libensemble/gen_funcs/persistent_aposmm.py +++ b/libensemble/gen_funcs/persistent_aposmm.py @@ -10,7 +10,8 @@ import numpy as np from scipy.spatial.distance import cdist -from math import log, gamma, pi, sqrt +from math import log, pi, sqrt +from mpmath import gamma from libensemble.gen_funcs.aposmm_localopt_support import LocalOptInterfacer, ConvergedMsg, simulate_recv_from_manager from libensemble.message_numbers import STOP_TAG, PERSIS_STOP, FINISHED_PERSISTENT_GEN_TAG @@ -45,6 +46,7 @@ def aposmm(H, persis_info, gen_specs, libE_info): - ``'grad' [n floats]``: The gradient (if available) of the objective with respect to `x`. Note: + - If any of the above fields are desired after a libEnsemble run, name them in ``gen_specs['out']``. - If intitializing APOSMM with past function values, make sure to include @@ -57,7 +59,12 @@ def aposmm(H, persis_info, gen_specs, libE_info): - ``'lb' [n floats]``: Lower bound on search domain - ``'ub' [n floats]``: Upper bound on search domain - ``'localopt_method' [str]``: Name of an NLopt, PETSc/TAO, or SciPy method - (see 'advance_local_run' below for supported methods) + (see 'advance_local_run' below for supported methods). 
When using a SciPy + method, must supply ``'opt_return_codes'``, a list of integers that will + be used to determine if the x produced by the localopt method should be + ruled a local minimum. (For example, SciPy's COBYLA has a 'status' of 1 if + at an optimum, but SciPy's Nelder-Mead and BFGS have a 'status' of 0 if at + an optimum.) - ``'initial_sample_size' [int]``: Number of uniformly sampled points must be returned (non-nan value) before a local opt run is started. Can be zero if no additional sampling is desired, but if zero there must be past @@ -133,107 +140,116 @@ def aposmm(H, persis_info, gen_specs, libE_info): persis_info['old_runs']: Sequence of indices of points in finished runs """ - user_specs = gen_specs['user'] - - n, n_s, rk_const, ld, mu, nu, comm, local_H = initialize_APOSMM(H, user_specs, libE_info) - local_opters, sim_id_to_child_inds, run_order, run_pts, total_runs, fields_to_pass = initialize_children(user_specs) - - if user_specs['initial_sample_size'] != 0: - # Send our initial sample. We don't need to check that n_s is large enough: - # the alloc_func only returns when the initial sample has function values. - persis_info = add_k_sample_points_to_local_H(user_specs['initial_sample_size'], user_specs, - persis_info, n, comm, local_H, - sim_id_to_child_inds) - if not user_specs.get('standalone'): - send_mgr_worker_msg(comm, local_H[-user_specs['initial_sample_size']:][[i[0] for i in gen_specs['out']]]) - something_sent = True - else: - something_sent = False - tag = None - first_pass = True - while 1: - new_opt_inds_to_send_mgr = [] - new_inds_to_send_mgr = [] + try: + user_specs = gen_specs['user'] + n, n_s, rk_const, ld, mu, nu, comm, local_H = initialize_APOSMM(H, user_specs, libE_info) + local_opters, sim_id_to_child_inds, run_order, run_pts, total_runs, fields_to_pass = initialize_children(user_specs) + if user_specs['initial_sample_size'] != 0: + # Send our initial sample. 
We don't need to check that n_s is large enough: + # the alloc_func only returns when the initial sample has function values. + persis_info = add_k_sample_points_to_local_H(user_specs['initial_sample_size'], user_specs, + persis_info, n, comm, local_H, + sim_id_to_child_inds) + if not user_specs.get('standalone'): + send_mgr_worker_msg(comm, local_H[-user_specs['initial_sample_size']:][[i[0] for i in gen_specs['out']]]) + something_sent = True + else: + something_sent = False - if something_sent: - if user_specs.get('standalone'): - tag, Work, calc_in = simulate_recv_from_manager(local_H, gen_specs) - else: - tag, Work, calc_in = get_mgr_worker_msg(comm) - - if tag in [STOP_TAG, PERSIS_STOP]: - clean_up_and_stop(local_H, local_opters, run_order) - persis_info['run_order'] = run_order - break - - n_s, n_r = update_local_H_after_receiving(local_H, n, n_s, user_specs, Work, calc_in, fields_to_pass) - - for row in calc_in: - if sim_id_to_child_inds.get(row['sim_id']): - # Point came from a child local opt run - for child_idx in sim_id_to_child_inds[row['sim_id']]: - x_new = local_opters[child_idx].iterate(row[fields_to_pass]) - if isinstance(x_new, ConvergedMsg): - x_opt = x_new.x - opt_flag = x_new.opt_flag - opt_ind = update_history_optimal(x_opt, opt_flag, local_H, run_order[child_idx]) - new_opt_inds_to_send_mgr.append(opt_ind) - local_opters.pop(child_idx) - else: - add_to_local_H(local_H, x_new, user_specs, local_flag=1, on_cube=True) - new_inds_to_send_mgr.append(len(local_H)-1) - - run_order[child_idx].append(local_H[-1]['sim_id']) - run_pts[child_idx].append(x_new) - if local_H[-1]['sim_id'] in sim_id_to_child_inds: - sim_id_to_child_inds[local_H[-1]['sim_id']] += (child_idx, ) + tag = None + first_pass = True + while 1: + new_opt_inds_to_send_mgr = [] + new_inds_to_send_mgr = [] + + if something_sent: + if user_specs.get('standalone'): + tag, Work, calc_in = simulate_recv_from_manager(local_H, gen_specs) + else: + tag, Work, calc_in = get_mgr_worker_msg(comm) 
+ + if tag in [STOP_TAG, PERSIS_STOP]: + clean_up_and_stop(local_opters) + persis_info['run_order'] = run_order + break + + if np.sum(local_H['local_min']) >= user_specs.get('stop_after_this_many_minima', np.inf): + # This break happens here so the manager can be informed about the last minima. + break + + n_s, n_r = update_local_H_after_receiving(local_H, n, n_s, user_specs, Work, calc_in, fields_to_pass) + + for row in calc_in: + if sim_id_to_child_inds.get(row['sim_id']): + # Point came from a child local opt run + for child_idx in sim_id_to_child_inds[row['sim_id']]: + x_new = local_opters[child_idx].iterate(row[fields_to_pass]) + if isinstance(x_new, ConvergedMsg): + x_opt = x_new.x + opt_flag = x_new.opt_flag + opt_ind = update_history_optimal(x_opt, opt_flag, local_H, run_order[child_idx]) + new_opt_inds_to_send_mgr.append(opt_ind) + local_opters.pop(child_idx) else: - sim_id_to_child_inds[local_H[-1]['sim_id']] = (child_idx, ) + add_to_local_H(local_H, x_new, user_specs, local_flag=1, on_cube=True) + new_inds_to_send_mgr.append(len(local_H)-1) - starting_inds = decide_where_to_start_localopt(local_H, n, n_s, rk_const, ld, mu, nu) + run_order[child_idx].append(local_H[-1]['sim_id']) + run_pts[child_idx].append(x_new) + if local_H[-1]['sim_id'] in sim_id_to_child_inds: + sim_id_to_child_inds[local_H[-1]['sim_id']] += (child_idx, ) + else: + sim_id_to_child_inds[local_H[-1]['sim_id']] = (child_idx, ) - for ind in starting_inds: - if len([p for p in local_opters.values() if p.is_running]) < user_specs.get('max_active_runs', np.inf): - local_H['started_run'][ind] = 1 + starting_inds = decide_where_to_start_localopt(local_H, n, n_s, rk_const, ld, mu, nu) - # Initialize a local opt run - local_opter = LocalOptInterfacer(user_specs, local_H[ind]['x_on_cube'], - local_H[ind]['f'] if 'f' in fields_to_pass else local_H[ind]['fvec'], - local_H[ind]['grad'] if 'grad' in fields_to_pass else None) + for ind in starting_inds: + if len([p for p in local_opters.values() if 
p.is_running]) < user_specs.get('max_active_runs', np.inf): + local_H['started_run'][ind] = 1 - local_opters[total_runs] = local_opter + # Initialize a local opt run + local_opter = LocalOptInterfacer(user_specs, local_H[ind]['x_on_cube'], + local_H[ind]['f'] if 'f' in fields_to_pass else local_H[ind]['fvec'], + local_H[ind]['grad'] if 'grad' in fields_to_pass else None) - x_new = local_opter.iterate(local_H[ind][fields_to_pass]) # Assuming the second point can't be ruled optimal + local_opters[total_runs] = local_opter - add_to_local_H(local_H, x_new, user_specs, local_flag=1, on_cube=True) - new_inds_to_send_mgr.append(len(local_H)-1) + x_new = local_opter.iterate(local_H[ind][fields_to_pass]) # Assuming the second point can't be ruled optimal - run_order[total_runs] = [ind, local_H[-1]['sim_id']] - run_pts[total_runs] = [local_H['x_on_cube'], x_new] + add_to_local_H(local_H, x_new, user_specs, local_flag=1, on_cube=True) + new_inds_to_send_mgr.append(len(local_H)-1) - if local_H[-1]['sim_id'] in sim_id_to_child_inds: - sim_id_to_child_inds[local_H[-1]['sim_id']] += (total_runs, ) - else: - sim_id_to_child_inds[local_H[-1]['sim_id']] = (total_runs, ) + run_order[total_runs] = [ind, local_H[-1]['sim_id']] + run_pts[total_runs] = [local_H['x_on_cube'], x_new] - total_runs += 1 + if local_H[-1]['sim_id'] in sim_id_to_child_inds: + sim_id_to_child_inds[local_H[-1]['sim_id']] += (total_runs, ) + else: + sim_id_to_child_inds[local_H[-1]['sim_id']] = (total_runs, ) - if first_pass: - num_samples_needed = persis_info['nworkers'] - 1 - len(new_inds_to_send_mgr) - first_pass = False - else: - num_samples_needed = n_r-len(new_inds_to_send_mgr) + total_runs += 1 + + if first_pass: + num_samples_needed = persis_info['nworkers'] - 1 - len(new_inds_to_send_mgr) + first_pass = False + else: + num_samples_needed = n_r-len(new_inds_to_send_mgr) - if num_samples_needed > 0: - persis_info = add_k_sample_points_to_local_H(num_samples_needed, user_specs, persis_info, n, comm, local_H, 
sim_id_to_child_inds) - new_inds_to_send_mgr = new_inds_to_send_mgr + list(range(len(local_H)-num_samples_needed, len(local_H))) + if num_samples_needed > 0: + persis_info = add_k_sample_points_to_local_H(num_samples_needed, user_specs, persis_info, n, comm, local_H, sim_id_to_child_inds) + new_inds_to_send_mgr = new_inds_to_send_mgr + list(range(len(local_H)-num_samples_needed, len(local_H))) - if not user_specs.get('standalone'): - send_mgr_worker_msg(comm, local_H[new_inds_to_send_mgr + new_opt_inds_to_send_mgr][[i[0] for i in gen_specs['out']]]) - something_sent = True + if not user_specs.get('standalone'): + send_mgr_worker_msg(comm, local_H[new_inds_to_send_mgr + new_opt_inds_to_send_mgr][[i[0] for i in gen_specs['out']]]) + something_sent = True - return local_H, persis_info, FINISHED_PERSISTENT_GEN_TAG + return local_H, persis_info, FINISHED_PERSISTENT_GEN_TAG + finally: + try: + clean_up_and_stop(local_opters) + except NameError: + pass def update_local_H_after_receiving(local_H, n, n_s, user_specs, Work, calc_in, fields_to_pass): @@ -397,19 +413,19 @@ def update_history_optimal(x_opt, opt_flag, H, run_inds): # assert dists[ind] <= tol_x1, "Closest point to x_opt not within {}?".format(tol_x1) if dists[ind] > tol_x1: - print("Dist from reported x_opt to closest evaluated point is: " + str(dists[ind]) + "\n" + - "Check that the local optimizer is working correctly\n", x_opt, run_inds, flush=True) + print("[APOSMM] Dist from reported x_opt to closest evaluated point is: " + str(dists[ind]) + "\n" + + "[APOSMM] Check that the local optimizer is working correctly\n", x_opt, run_inds, flush=True) tol_x2 = 1e-8 failsafe = np.logical_and(H['f'][run_inds] < H['f'][opt_ind], dists < tol_x2) if opt_flag: if np.any(failsafe): - print("This run has {} point(s) with smaller 'f' value within {} of " + print("[APOSMM] This run has {} point(s) with smaller 'f' value within {} of " "the point ruled to be the run minimum. 
\nMarking all as being " "a 'local_min' to prevent APOSMM from starting another run " "immediately from these points.".format(sum(failsafe), tol_x2)) - print("Sim_ids to be marked optimal: ", opt_ind, run_inds[failsafe]) - print("Check that the local optimizer is working correctly", flush=True) + print("[APOSMM] Sim_ids to be marked optimal: ", opt_ind, run_inds[failsafe]) + print("[APOSMM] Check that the local optimizer is working correctly", flush=True) H['local_min'][run_inds[failsafe]] = 1 H['local_min'][opt_ind] = 1 @@ -611,7 +627,7 @@ def initialize_APOSMM(H, user_specs, libE_info): over_written_fields = ['dist_to_unit_bounds', 'dist_to_better_l', 'dist_to_better_s', 'ind_of_better_l', 'ind_of_better_s'] if any([i in H.dtype.names for i in over_written_fields]): - print("\n persistent_aposmm ignores any given values in these fields: " + str(over_written_fields) + "\n") + print("\n[APOSMM] persistent_aposmm ignores any given values in these fields: " + str(over_written_fields) + "\n") initialize_dists_and_inds(local_H, len(H)) @@ -666,14 +682,15 @@ def add_k_sample_points_to_local_H(k, user_specs, persis_info, n, comm, local_H, return persis_info -def clean_up_and_stop(local_H, local_opters, run_order): +# def clean_up_and_stop(local_H, local_opters): +def clean_up_and_stop(local_opters): # FIXME: This has to be a clean exit. 
# print('[Parent]: The optimal points and values are:\n', # local_H[np.where(local_H['local_min'])][['x', 'f']], flush=True) for i, p in local_opters.items(): - p.destroy(local_H['x_on_cube'][run_order[i][-1]]) + p.destroy() # def display_exception(e): diff --git a/libensemble/gen_funcs/persistent_deap_nsga2.py b/libensemble/gen_funcs/persistent_deap_nsga2.py index a71ee20fb..d448b986b 100644 --- a/libensemble/gen_funcs/persistent_deap_nsga2.py +++ b/libensemble/gen_funcs/persistent_deap_nsga2.py @@ -16,11 +16,8 @@ from libensemble.tools.gen_support import sendrecv_mgr_worker_msg -def uniform(low, up, size=None): - try: - return [np.random.uniform(a, b) for a, b in zip(low, up)] - except TypeError: - return [np.random.uniform(a, b) for a, b in zip([low] * size, [up] * size)] +def uniform(low, up): + return [np.random.uniform(a, b) for a, b in zip(low, up)] def nsga2_toolbox(gen_specs): @@ -33,13 +30,22 @@ def nsga2_toolbox(gen_specs): inp = gen_specs['user']['indpb'] lb = gen_specs['user']['lb'] ub = gen_specs['user']['ub'] - dim = gen_specs['user']['indiv_size'] + + try: + del creator.MyFitness + except Exception: + pass + + try: + del creator.Individual + except Exception: + pass creator.create('MyFitness', base.Fitness, weights=w) creator.create('Individual', array.array, typecode='d', fitness=creator.MyFitness) toolbox = base.Toolbox() - toolbox.register('attr_float', uniform, lb, ub, dim) + toolbox.register('attr_float', uniform, lb, ub) toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attr_float) toolbox.register('population', tools.initRepeat, list, toolbox.individual) @@ -68,11 +74,8 @@ def evaluate_pop(g, deap_object, Out, comm): for i, ind in enumerate(deap_object): # Attaching fitness values from sim to population # i.e. 
replacing values with those generated by the sim - - if isinstance(calc_in['fitness_values'][i], tuple): - ind.fitness.values = calc_in['fitness_values'][i] - else: - ind.fitness.values = (calc_in['fitness_values'][i],) + fvals = calc_in['fitness_values'][i] + ind.fitness.values = [fvals] if isinstance(fvals, float) else list(fvals) return deap_object, tag @@ -82,27 +85,36 @@ def deap_nsga2(H, persis_info, gen_specs, libE_info): An implementation of the NSGA2 evolutionary algorithm. ''' # Check to make sure boundaries are list, not array - if isinstance(gen_specs['user']['lb'], list): - if isinstance(gen_specs['user']['ub'], list): - pass - else: - print('Lower or Upper bound is not a list') - print('This will break DEAP crossover function') - assert isinstance(gen_specs['user']['lb'], list), "lb is wrong type" - assert isinstance(gen_specs['user']['ub'], list), "ub is wrong type" + assert isinstance(gen_specs['user']['lb'], list), "lb is wrong type, must be a list!" + assert isinstance(gen_specs['user']['ub'], list), "ub is wrong type, must be a list!" 
# Initialize NSGA2 DEAP toolbox toolbox = nsga2_toolbox(gen_specs) - - g = 0 # generation count + pop_size = gen_specs['user']['pop_size'] # CXPB is the probability with which two individuals are crossed - MU, CXPB = gen_specs['user']['pop_size'], gen_specs['user']['cxpb'] - pop = toolbox.population(n=MU) # MU is Population size ( # of individuals) + MU, CXPB = pop_size, gen_specs['user']['cxpb'] comm = libE_info['comm'] - - # Running fitness calc for first generation - Out = np.zeros(gen_specs['user']['pop_size'], dtype=gen_specs['out']) - pop, tag = evaluate_pop(g, pop, Out, comm) + pop = toolbox.population(n=MU) # MU is Population size ( # of individuals) + Out = np.zeros(pop_size, dtype=gen_specs['out']) + + if len(H): + tag = None + g = max(H['generation']) + individuals = H['individual'][-pop_size:] + fvals = H['fitness_values'][-pop_size:] + print("Loading initial collection of points as generation ", g, '.') + + for i, ind in enumerate(pop): + # Fill in first pop and output with provided points + ind[:] = array.array('d', individuals[i]) + ind.fitness.values = [fvals[i]] if isinstance(fvals[i], float) else list(fvals[i]) + Out['individual'][i] = individuals[i] + Out['generation'][i] = g + else: + print('No initial sample provided, starting from scratch.') + g = 0 # generation count + # Running fitness calc for first generation + pop, tag = evaluate_pop(g, pop, Out, comm) # This is just to assign the crowding distance to the individuals # no actual selection is done diff --git a/libensemble/gen_funcs/vtmop/vtmop_initializer.f90 b/libensemble/gen_funcs/vtmop/vtmop_initializer.f90 index ab41dc41b..217d97fde 100644 --- a/libensemble/gen_funcs/vtmop/vtmop_initializer.f90 +++ b/libensemble/gen_funcs/vtmop/vtmop_initializer.f90 @@ -40,7 +40,7 @@ PROGRAM VTMOP_INITIALIZER CLOSE(12) ! Initialize the VTMOP status object. 
-CALL VTMOP_INIT( VTMOP, D, P, LB, UB, IERR, TRUST_RAD=TRUST_RAD, ICHKPT=1 ) +CALL VTMOP_INIT( VTMOP, D, P, LB, UB, IERR, TRUST_RADF=TRUST_RAD, ICHKPT=1 ) IF (IERR .NE. 0) THEN WRITE(ERROR_UNIT, "(A,I4)") & "An error occurred while initializing. Error code: ", IERR diff --git a/libensemble/libE.py b/libensemble/libE.py index daee5b55b..ef1f990e7 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -309,6 +309,15 @@ def libE_local(sim_specs, gen_specs, exit_criteria, hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0) + # On Python 3.8 on macOS, the default start method for new processes was + # switched to 'spawn' by default due to 'fork' potentially causing crashes. + # These crashes haven't yet been observed with libE, but with 'spawn' runs, + # warnings about leaked semaphore objects are displayed instead. + # The next several statements enforce 'fork' on macOS (Python 3.8) + if os.uname().sysname == 'Darwin': + from multiprocessing import set_start_method + set_start_method('fork', force=True) + # Launch worker team and set up logger wcomms = start_proc_team(nworkers, sim_specs, gen_specs, libE_specs) diff --git a/libensemble/libE_manager.py b/libensemble/libE_manager.py index 0d9bed2a5..452c37798 100644 --- a/libensemble/libE_manager.py +++ b/libensemble/libE_manager.py @@ -20,11 +20,14 @@ MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL from libensemble.comms.comms import CommFinishedException from libensemble.libE_worker import WorkerErrMsg -from libensemble.tools.tools import _USER_SIM_DIR_WARNING -from libensemble.tools.fields_keys import libE_spec_calc_dir_keys +from libensemble.tools.tools import _USER_CALC_DIR_WARNING +from libensemble.tools.fields_keys import libE_spec_calc_dir_combined import cProfile import pstats +if tuple(np.__version__.split('.')) >= ('1', '15'): + from numpy.lib.recfunctions import repack_fields + logger = logging.getLogger(__name__) # For debug messages - uncomment # logger.setLevel(logging.DEBUG) @@ -145,7 +148,7 @@ 
def __init__(self, hist, libE_specs, alloc_specs, (1, 'gen_max', self.term_test_gen_max), (1, 'stop_val', self.term_test_stop_val)] - if any([setting in self.libE_specs for setting in libE_spec_calc_dir_keys]): + if any([setting in self.libE_specs for setting in libE_spec_calc_dir_combined]): self.check_ensemble_dir(libE_specs) if libE_specs.get('ensemble_copy_back', False): Manager.make_copyback_dir(libE_specs) @@ -165,7 +168,7 @@ def check_ensemble_dir(self, libE_specs): except FileNotFoundError: # Ensemble dir doesn't exist. pass except OSError as e: # Ensemble dir exists and isn't empty. - logger.manager_warning(_USER_SIM_DIR_WARNING.format(prefix)) + logger.manager_warning(_USER_CALC_DIR_WARNING.format(prefix)) self._kill_workers() raise ManagerException('Manager errored on initialization', 'Ensemble directory already existed and wasn\'t empty.', e) @@ -258,7 +261,10 @@ def _send_work_order(self, Work, w): self.wcomms[w-1].send(Work['tag'], Work) work_rows = Work['libE_info']['H_rows'] if len(work_rows): - self.wcomms[w-1].send(0, self.hist.H[Work['H_fields']][work_rows]) + if 'repack_fields' in dir(): + self.wcomms[w-1].send(0, repack_fields(self.hist.H[Work['H_fields']][work_rows])) + else: + self.wcomms[w-1].send(0, self.hist.H[Work['H_fields']][work_rows]) def _update_state_on_alloc(self, Work, w): """Updates a workers' active/idle status following an allocation order""" diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index 0f327e837..2d48ef597 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -7,6 +7,7 @@ import logging import os import shutil +import re import logging.handlers from itertools import count, groupby from operator import itemgetter @@ -19,7 +20,7 @@ UNSET_TAG, STOP_TAG, PERSIS_STOP, CALC_EXCEPTION from libensemble.message_numbers import MAN_SIGNAL_FINISH from libensemble.message_numbers import calc_type_strings, calc_status_strings -from libensemble.tools.fields_keys import 
libE_spec_calc_dir_keys +from libensemble.tools.fields_keys import libE_spec_sim_dir_keys, libE_spec_gen_dir_keys from libensemble.utils.loc_stack import LocationStack from libensemble.utils.timer import Timer @@ -148,24 +149,30 @@ def __init__(self, comm, dtypes, workerID, sim_specs, gen_specs, libE_specs): def _make_calc_dir(libE_specs, workerID, H_rows, calc_str, locs): "Create calc dirs and intermediate dirs, copy inputs, based on libE_specs" - sim_input_dir = libE_specs.get('sim_input_dir', '').rstrip('/') + if calc_str == 'sim': + calc_input_dir = libE_specs.get('sim_input_dir', '').rstrip('/') + do_calc_dirs = libE_specs.get('sim_dirs_make', True) + copy_files = libE_specs.get('sim_dir_copy_files', []) + symlink_files = libE_specs.get('sim_dir_symlink_files', []) + else: # calc_str is 'gen' + calc_input_dir = libE_specs.get('gen_input_dir', '').rstrip('/') + do_calc_dirs = libE_specs.get('gen_dirs_make', True) + copy_files = libE_specs.get('gen_dir_copy_files', []) + symlink_files = libE_specs.get('gen_dir_symlink_files', []) - do_sim_dirs = libE_specs.get('sim_dirs_make', True) prefix = libE_specs.get('ensemble_dir_path', './ensemble') - copy_files = libE_specs.get('sim_dir_copy_files', []) - symlink_files = libE_specs.get('sim_dir_symlink_files', []) do_work_dirs = libE_specs.get('use_worker_dirs', False) - # If using sim_input_dir, set of files to copy is contents of provided dir - if sim_input_dir: - copy_files = set(copy_files + [os.path.join(sim_input_dir, i) for i in os.listdir(sim_input_dir)]) + # If using calc_input_dir, set of files to copy is contents of provided dir + if calc_input_dir: + copy_files = set(copy_files + [os.path.join(calc_input_dir, i) for i in os.listdir(calc_input_dir)]) # If identical paths to copy and symlink, remove those paths from symlink_files if len(symlink_files): symlink_files = [i for i in symlink_files if i not in copy_files] # Cases where individual sim_dirs not created. 
- if not do_sim_dirs: + if not do_calc_dirs: if do_work_dirs: # Each worker does work in worker dirs key = workerID dir = "worker" + str(workerID) @@ -257,41 +264,71 @@ def _extract_H_ranges(Work): return '_'.join(ranges) def _copy_back(self): - """ Cleanup indication file & copy output to init dir, if specified""" + """Copy back all ensemble dir contents to launch location""" if os.path.isdir(self.prefix) and self.libE_specs.get('ensemble_copy_back', False): + no_calc_dirs = not self.libE_specs.get('sim_dirs_make', True) or \ + not self.libE_specs.get('gen_dirs_make', True) + ensemble_dir_path = self.libE_specs.get('ensemble_dir_path', './ensemble') copybackdir = os.path.basename(ensemble_dir_path) + if os.path.relpath(ensemble_dir_path) == os.path.relpath(copybackdir): copybackdir += '_back' + for dir in self.loc_stack.dirs.values(): - try: - shutil.copytree(dir, os.path.join(copybackdir, os.path.basename(dir)), symlinks=True) - if os.path.basename(dir).startswith('worker'): - break # Worker dir (with all sim_dirs) copied. - except FileExistsError: - if not self.libE_specs.get('sim_dirs_make', True): + dest_path = os.path.join(copybackdir, os.path.basename(dir)) + if dir == self.prefix: # occurs when no_calc_dirs is True + continue # otherwise, entire ensemble dir copied into copyback dir + + shutil.copytree(dir, dest_path, symlinks=True) + if os.path.basename(dir).startswith('worker'): + return # Worker dir (with all contents) has been copied. 
+ + # If not using calc dirs, likely miscellaneous files to copy back + if no_calc_dirs: + p = re.compile(r"((^sim)|(^gen))\d+_worker\d+") + for file in [i for i in os.listdir(self.prefix) if not p.match(i)]: # each non-calc_dir file + source_path = os.path.join(self.prefix, file) + dest_path = os.path.join(copybackdir, file) + try: + if os.path.isdir(source_path): + shutil.copytree(source_path, dest_path, symlinks=True) + else: + shutil.copy(source_path, dest_path, follow_symlinks=False) + except FileExistsError: + continue + except shutil.SameFileError: # creating an identical symlink continue - else: - raise def _determine_dir_then_calc(self, Work, calc_type, calc_in, calc): - "Determines choice for sim_dir structure, then performs calculation." + "Determines choice for calc_dir structure, then performs calculation." if not self.loc_stack: self.loc_stack = LocationStack() - H_rows = Worker._extract_H_ranges(Work) + if calc_type == EVAL_SIM_TAG: + H_rows = Worker._extract_H_ranges(Work) + else: + H_rows = str(self.calc_iter[calc_type]) + calc_str = calc_type_strings[calc_type] - if any([setting in self.libE_specs for setting in libE_spec_calc_dir_keys]): - calc_dir = Worker._make_calc_dir(self.libE_specs, self.workerID, - H_rows, calc_str, self.loc_stack) + calc_dir = Worker._make_calc_dir(self.libE_specs, self.workerID, + H_rows, calc_str, self.loc_stack) + + with self.loc_stack.loc(calc_dir): # Switching to calc_dir + return calc(calc_in, Work['persis_info'], Work['libE_info']) - with self.loc_stack.loc(calc_dir): # Switching to calc_dir - return calc(calc_in, Work['persis_info'], Work['libE_info']) + def _use_calc_dirs(self, type): + "Determines calc_dirs enabling for each calc type" + + if type == EVAL_SIM_TAG: + dir_type_keys = libE_spec_sim_dir_keys + else: + dir_type_keys = libE_spec_gen_dir_keys - return calc(calc_in, Work['persis_info'], Work['libE_info']) + return any([setting in self.libE_specs for setting in dir_type_keys]) def _handle_calc(self, 
Work, calc_in): """Runs a calculation on this worker object. @@ -322,7 +359,7 @@ def _handle_calc(self, Work, calc_in): with timer: logger.debug("Calling calc {}".format(calc_type)) - if calc_type == EVAL_SIM_TAG: + if self._use_calc_dirs(calc_type): out = self._determine_dir_then_calc(Work, calc_type, calc_in, calc) else: out = calc(calc_in, Work['persis_info'], Work['libE_info']) diff --git a/libensemble/resources/env_resources.py b/libensemble/resources/env_resources.py index 333e114eb..22608a5a0 100644 --- a/libensemble/resources/env_resources.py +++ b/libensemble/resources/env_resources.py @@ -23,8 +23,8 @@ class EnvResources: These are set on initialization. - :ivar dict nodelists: Environment variable names to query for nodelists by schedular - :ivar dict ndlist_funcs: Functions to extract nodelists from environment by schedular + :ivar dict nodelists: Environment variable names to query for nodelists by scheduler + :ivar dict ndlist_funcs: Functions to extract nodelists from environment by scheduler """ default_nodelist_env_slurm = 'SLURM_NODELIST' @@ -63,7 +63,7 @@ def __init__(self, Note: This is queried only if a node_list file is not provided and auto_resources=True. 
""" - self.schedular = None + self.scheduler = None self.nodelists = {} self.nodelists['Slurm'] = nodelist_env_slurm or EnvResources.default_nodelist_env_slurm self.nodelists['Cobalt'] = nodelist_env_cobalt or EnvResources.default_nodelist_env_cobalt @@ -76,23 +76,41 @@ def __init__(self, self.ndlist_funcs['LSF'] = EnvResources.get_lsf_nodelist self.ndlist_funcs['LSF_shortform'] = EnvResources.get_lsf_nodelist_frm_shortform - def get_nodelist(self): - """Returns nodelist from environment or an empty list""" for env, env_var in self.nodelists.items(): if os.environ.get(env_var): - self.schedular = env - logger.debug("{0} env found - getting nodelist from {0}".format(env)) - get_list_func = self.ndlist_funcs[env] - global_nodelist = get_list_func(env_var) - return global_nodelist + self.scheduler = env + break + + def get_nodelist(self): + """Returns nodelist from environment or an empty list""" + if self.scheduler: + env = self.scheduler + env_var = self.nodelists[env] + logger.debug("{} env found - getting nodelist from {}".format(env, env_var)) + get_list_func = self.ndlist_funcs[env] + global_nodelist = get_list_func(env_var) + return global_nodelist return [] - def abbrev_nodenames(self, node_list): + @staticmethod + def abbrev_nodenames(node_list, prefix=None): + """Returns nodelist with only string upto first dot""" + newlist = [s.split(".", 1)[0] for s in node_list] + return newlist + + @staticmethod + def cobalt_abbrev_nodenames(node_list, prefix='nid'): + """Returns nodelist with prefix and leading zeros stripped""" + newlist = [s.lstrip(prefix) for s in node_list] + newlist = [s.lstrip('0') for s in newlist] + return newlist + + def shortnames(self, node_list): """Returns nodelist with entries in abbreviated form""" - if self.schedular == 'Slurm': - return EnvResources.slurm_abbrev_nodenames(node_list) - if self.schedular == 'Cobalt': + if self.scheduler == 'Cobalt': return EnvResources.cobalt_abbrev_nodenames(node_list) + elif self.scheduler is not None: 
+ return EnvResources.abbrev_nodenames(node_list) return node_list @staticmethod @@ -156,19 +174,6 @@ def get_cobalt_nodelist(node_list_env): nidlst.append(str(nid)) return sorted(nidlst, key=int) - @staticmethod - def slurm_abbrev_nodenames(node_list, prefix=None): - """Returns nodelist with only string upto first dot""" - newlist = [s.split(".", 1)[0] for s in node_list] - return newlist - - @staticmethod - def cobalt_abbrev_nodenames(node_list, prefix='nid'): - """Returns nodelist with prefix and leading zeros stripped""" - newlist = [s.lstrip(prefix) for s in node_list] - newlist = [s.lstrip('0') for s in newlist] - return newlist - @staticmethod def get_lsf_nodelist(node_list_env): """Gets global libEnsemble nodelist from the LSF environment""" diff --git a/libensemble/resources/node_resources.py b/libensemble/resources/node_resources.py index ba95149a6..e0a116682 100644 --- a/libensemble/resources/node_resources.py +++ b/libensemble/resources/node_resources.py @@ -4,6 +4,7 @@ """ import os +import psutil import logging import collections @@ -12,36 +13,6 @@ REMOTE_LAUNCH_LIST = ['aprun', 'jsrun', 'srun'] # Move to feature of mpi_runner -def _open_binary(fname, **kwargs): - return open(fname, "rb", **kwargs) - - -def _cpu_count_physical(): - """Returns the number of physical cores on the node""" - mapping = {} - current_info = {} - # macOS method for physical cpus - if os.uname().sysname == 'Darwin': - return int(os.popen('sysctl -n hw.physicalcpu').read().strip()) - else: - with _open_binary('/proc/cpuinfo') as f: - for line in f: - line = line.strip().lower() - if not line: - # new section - if (b'physical id' in current_info and - b'cpu cores' in current_info): - mapping[current_info[b'physical id']] = current_info[b'cpu cores'] - current_info = {} - else: - if (line.startswith(b'physical id') or - line.startswith(b'cpu cores')): - key, value = line.split(b'\t:', 1) - current_info[key] = int(value) - - return sum(mapping.values()) or None - - def 
get_cpu_cores(hyperthreads=False): """Returns the number of cores on the node. @@ -51,23 +22,7 @@ def get_cpu_cores(hyperthreads=False): Note: This returns cores available on the current node. It will not work for systems of multiple node types """ - try: - import psutil - ranks_per_node = psutil.cpu_count(logical=hyperthreads) - except ImportError: - # logger - if hyperthreads: - import multiprocessing - ranks_per_node = multiprocessing.cpu_count() - else: - try: - ranks_per_node = _cpu_count_physical() - except Exception as e: - logger.warning("Could not detect physical cores - Logical cores (with hyperthreads) " - "returned - " "specify ranks_per_node to override. Exception {}".format(e)) - import multiprocessing - ranks_per_node = multiprocessing.cpu_count() - return ranks_per_node # This is ranks available per node + return psutil.cpu_count(logical=hyperthreads) # This is ranks available per node def _get_local_cpu_resources(): diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index 5882a1421..249da5220 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -43,6 +43,7 @@ class Resources: def __init__(self, top_level_dir=None, central_mode=False, + zero_resource_workers=[], allow_oversubscribe=False, launcher=None, cores_on_node=None, @@ -69,6 +70,9 @@ def __init__(self, top_level_dir=None, do not share nodes with applications. Distributed mode means Workers share nodes with applications. + zero_resource_workers: list of ints, optional + List of workers that require no resources. 
+ allow_oversubscribe: boolean, optional If false, then resources will raise an error if task process counts exceed the CPUs available to the worker, as detected by @@ -119,15 +123,15 @@ def __init__(self, top_level_dir=None, nodelist_env_lsf_shortform=nodelist_env_lsf_shortform) # This is global nodelist avail to workers - may change to global_worker_nodelist + self.local_host = self.env_resources.shortnames([socket.gethostname()])[0] if node_file is None: node_file = Resources.DEFAULT_NODEFILE self.global_nodelist = Resources.get_global_nodelist(node_file=node_file, rundir=self.top_level_dir, env_resources=self.env_resources) - self.launcher = launcher remote_detect = False - if socket.gethostname() not in self.global_nodelist: + if self.local_host not in self.global_nodelist: remote_detect = True if not cores_on_node: @@ -139,13 +143,14 @@ def __init__(self, top_level_dir=None, self.logical_cores_avail_per_node = cores_on_node[1] self.libE_nodes = None self.worker_resources = None + self.zero_resource_workers = zero_resource_workers def add_comm_info(self, libE_nodes): """Adds comms-specific information to resources Removes libEnsemble nodes from nodelist if in central_mode. 
""" - self.libE_nodes = self.env_resources.abbrev_nodenames(libE_nodes) + self.libE_nodes = self.env_resources.shortnames(libE_nodes) libE_nodes_in_list = list(filter(lambda x: x in self.libE_nodes, self.global_nodelist)) if libE_nodes_in_list: if self.central_mode and len(self.global_nodelist) > 1: @@ -232,6 +237,8 @@ def get_global_nodelist(node_file=DEFAULT_NODEFILE, with open(node_filepath, 'r') as f: for line in f: global_nodelist.append(line.rstrip()) + if env_resources: + global_nodelist = env_resources.shortnames(global_nodelist) else: logger.debug("No node_file found - searching for nodelist in environment") if env_resources: @@ -240,7 +247,7 @@ def get_global_nodelist(node_file=DEFAULT_NODEFILE, if not global_nodelist: # Assume a standalone machine logger.info("Can not find nodelist from environment. Assuming standalone") - global_nodelist.append(socket.gethostname()) + global_nodelist.append(env_resources.shortnames([socket.gethostname()])[0]) if global_nodelist: return global_nodelist @@ -294,6 +301,19 @@ def get_workers_on_a_node(num_workers, resources): workers_per_node = num_workers//num_nodes + (num_workers % num_nodes > 0) return workers_per_node + @staticmethod + def map_workerid_to_index(num_workers, workerID, zero_resource_list): + """Map WorkerID to index into a nodelist""" + index = workerID - 1 + if zero_resource_list: + for i in range(1, num_workers+1): + if i in zero_resource_list: + index -= 1 + if index < i: + return index + raise ResourcesException("Error mapping workerID {} to nodelist index {}".format(workerID, index)) + return index + @staticmethod def get_local_nodelist(num_workers, workerID, resources): """Returns the list of nodes available to the current worker @@ -304,37 +324,42 @@ def get_local_nodelist(num_workers, workerID, resources): global_nodelist = resources.global_nodelist num_nodes = len(global_nodelist) + zero_resource_list = resources.zero_resource_workers + num_workers_2assign2 = num_workers - len(zero_resource_list) 
# Check if current host in nodelist - if it is then in distributed mode. - local_host = socket.gethostname() - distrib_mode = local_host in global_nodelist + distrib_mode = resources.local_host in global_nodelist # If multiple workers per node - create global node_list with N duplicates (for N workers per node) - sub_node_workers = (num_workers >= num_nodes) + sub_node_workers = (num_workers_2assign2 >= num_nodes) if sub_node_workers: - workers_per_node = num_workers//num_nodes + workers_per_node = num_workers_2assign2//num_nodes dup_list = itertools.chain.from_iterable(itertools.repeat(x, workers_per_node) for x in global_nodelist) global_nodelist = list(dup_list) # Currently require even split for distrib mode - to match machinefile - throw away remainder if distrib_mode and not sub_node_workers: - # Could just read in the libe machinefile and use that - but this should match - # Alt. create machinefile/host-list with same algorithm as best_split - future soln. - nodes_per_worker, remainder = divmod(num_nodes, num_workers) + nodes_per_worker, remainder = divmod(num_nodes, num_workers_2assign2) if remainder != 0: # Worker node may not be at head of list after truncation - should perhaps be warning or enforced logger.warning("Nodes to workers not evenly distributed. Wasted nodes. 
" - "{} workers and {} nodes".format(num_workers, num_nodes)) + "{} workers and {} nodes".format(num_workers_2assign2, num_nodes)) num_nodes = num_nodes - remainder global_nodelist = global_nodelist[0:num_nodes] # Divide global list between workers - split_list = list(Resources.best_split(global_nodelist, num_workers)) + split_list = list(Resources.best_split(global_nodelist, num_workers_2assign2)) logger.debug("split_list is {}".format(split_list)) if workerID is None: raise ResourcesException("Worker has no workerID - aborting") - local_nodelist = split_list[workerID - 1] - logger.debug("local_nodelist is {}".format(local_nodelist)) + if workerID in zero_resource_list: + local_nodelist = [] + logger.debug("Worker is a zero-resource worker") + else: + index = WorkerResources.map_workerid_to_index(num_workers, workerID, zero_resource_list) + local_nodelist = split_list[index] + logger.debug("Worker's local_nodelist is {}".format(local_nodelist)) + return local_nodelist diff --git a/libensemble/sim_funcs/executor_hworld.py b/libensemble/sim_funcs/executor_hworld.py index 08ec89adf..079d3eb21 100644 --- a/libensemble/sim_funcs/executor_hworld.py +++ b/libensemble/sim_funcs/executor_hworld.py @@ -71,26 +71,57 @@ def executor_hworld(H, persis_info, sim_specs, libE_info): cores = sim_specs['user']['cores'] comm = libE_info['comm'] + use_balsam = 'balsam_test' in sim_specs['user'] + args_for_sim = 'sleep 1' # pref send this in X as a sim_in from calling script global sim_count sim_count += 1 timeout = 6.0 + wait = False + launch_shc = False if sim_count == 1: args_for_sim = 'sleep 1' # Should finish elif sim_count == 2: args_for_sim = 'sleep 1 Error' # Worker kill on error - elif sim_count == 3: + if sim_count == 3: + wait = True + args_for_sim = 'sleep 1' # Should finish + launch_shc = True + elif sim_count == 4: args_for_sim = 'sleep 3' # Worker kill on timeout timeout = 1.0 - elif sim_count == 4: - args_for_sim = 'sleep 1 Fail' # Manager kill - if signal received else 
completes elif sim_count == 5: - args_for_sim = 'sleep 18' # Manager kill - if signal received else completes - timeout = 20.0 + args_for_sim = 'sleep 1 Fail' # Manager kill - if signal received else completes + elif sim_count == 6: + args_for_sim = 'sleep 60' # Manager kill - if signal received else completes + timeout = 65.0 + + if use_balsam: + task = exctr.submit(calc_type='sim', num_procs=cores, app_args=args_for_sim, + hyperthreads=True, machinefile='notused', stdout='notused', + wait_on_run=True) + else: + task = exctr.submit(calc_type='sim', num_procs=cores, app_args=args_for_sim, hyperthreads=True) + + if wait: + task.wait() + if not task.finished: + calc_status = UNSET_TAG + if task.state == 'FINISHED': + calc_status = WORKER_DONE + elif task.state == 'FAILED': + calc_status = TASK_FAILED + + else: + task, calc_status = polling_loop(comm, exctr, task, timeout) - task = exctr.submit(calc_type='sim', num_procs=cores, app_args=args_for_sim, hyperthreads=True) - task, calc_status = polling_loop(comm, exctr, task, timeout) + if use_balsam: + task.read_file_in_workdir('ensemble.log') + try: + task.read_stderr() + except ValueError: + pass # assert task.finished, "task.finished should be True. Returned " + str(task.finished) # assert task.state == 'FINISHED', "task.state should be FINISHED. Returned " + str(task.state) @@ -100,6 +131,13 @@ def executor_hworld(H, persis_info, sim_specs, libE_info): H_o = np.zeros(batch, dtype=sim_specs['out']) for i, x in enumerate(H['x']): H_o['f'][i] = six_hump_camel_func(x) + if launch_shc: + # Test launching a named app. 
+ app_args = ' '.join(str(val) for val in list(x[:])) + task = exctr.submit(app_name='six_hump_camel', num_procs=1, app_args=app_args) + task.wait() + output = np.float64(task.read_stdout()) + assert np.isclose(H_o['f'][i], output) # This is just for testing at calling script level - status of each task H_o['cstat'] = calc_status diff --git a/libensemble/sim_funcs/six_hump_camel.py b/libensemble/sim_funcs/six_hump_camel.py index d992f4db4..5c96fd47d 100644 --- a/libensemble/sim_funcs/six_hump_camel.py +++ b/libensemble/sim_funcs/six_hump_camel.py @@ -5,6 +5,7 @@ # import subprocess import os +import sys import numpy as np import time from libensemble.executors.executor import Executor @@ -156,3 +157,9 @@ def six_hump_camel_grad(x): grad[1] = x1 + 16*x2**3 - 8*x2 return grad + + +if __name__ == "__main__": + x = (float(sys.argv[1]), float(sys.argv[2])) + result = six_hump_camel_func(x) + print(result) diff --git a/libensemble/tests/.coveragerc b/libensemble/tests/.coveragerc index cd55b1320..bed6acb7b 100644 --- a/libensemble/tests/.coveragerc +++ b/libensemble/tests/.coveragerc @@ -14,7 +14,6 @@ omit = */__init__.py */.tox/* */setup.py - */forkpdb.py */unit_tests/* */unit_tests_nompi/* */unit_tests_logger/* diff --git a/libensemble/tests/regression_tests/script_test_balsam_hworld.py b/libensemble/tests/regression_tests/script_test_balsam_hworld.py index 8fc9295c4..fa15257a4 100644 --- a/libensemble/tests/regression_tests/script_test_balsam_hworld.py +++ b/libensemble/tests/regression_tests/script_test_balsam_hworld.py @@ -12,6 +12,7 @@ from libensemble.sim_funcs.executor_hworld import executor_hworld from libensemble.gen_funcs.sampling import uniform_random_sample from libensemble.tools import add_unique_random_streams +import libensemble.sim_funcs.six_hump_camel as six_hump_camel mpi4py.rc.recv_mprobe = False # Disable matching probes @@ -38,14 +39,18 @@ def build_simfunc(): sim_app = './my_simtask.x' if not os.path.isfile(sim_app): build_simfunc() +sim_app2 = 
six_hump_camel.__file__ -exctr = BalsamMPIExecutor(auto_resources=False) -exctr.register_calc(full_path=sim_app, calc_type='sim') +exctr = BalsamMPIExecutor(auto_resources=False, central_mode=False, custom_info={'not': 'used'}) +exctr.register_calc(full_path=sim_app, calc_type='sim') # Default 'sim' app - backward compatible +exctr.register_calc(full_path=sim_app2, app_name='six_hump_camel') # Named app sim_specs = {'sim_f': executor_hworld, 'in': ['x'], 'out': [('f', float), ('cstat', int)], - 'user': {'cores': cores_per_task}} + 'user': {'cores': cores_per_task, + 'balsam_test': True} + } gen_specs = {'gen_f': uniform_random_sample, 'in': ['sim_id'], @@ -57,7 +62,7 @@ def build_simfunc(): persis_info = add_unique_random_streams({}, nworkers + 1) -exit_criteria = {'elapsed_wallclock_time': 35} +exit_criteria = {'elapsed_wallclock_time': 60} # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, @@ -66,7 +71,7 @@ def build_simfunc(): if is_master: print('\nChecking expected task status against Workers ...\n') calc_status_list_in = np.asarray([WORKER_DONE, WORKER_KILL_ON_ERR, - WORKER_KILL_ON_TIMEOUT, + WORKER_DONE, WORKER_KILL_ON_TIMEOUT, TASK_FAILED, 0]) calc_status_list = np.repeat(calc_status_list_in, nworkers) @@ -78,11 +83,15 @@ def build_simfunc(): # Check summary file: print('Checking expected task status against task summary file ...\n') - calc_desc_list_in = ['Completed', 'Worker killed task on Error', + calc_desc_list_in = ['Completed', 'Worker killed task on Error', 'Completed', 'Worker killed task on Timeout', 'Task Failed', 'Manager killed on finish'] # Repeat N times for N workers and insert Completed at start for generator calc_desc_list = ['Completed'] + calc_desc_list_in*nworkers + # Cleanup (maybe cover del_apps() and del_tasks()) + exctr.del_apps() + exctr.del_tasks() + print("\n\n\nRun completed.") diff --git a/libensemble/tests/regression_tests/support.py b/libensemble/tests/regression_tests/support.py index 
446682446..9423443c8 100644 --- a/libensemble/tests/regression_tests/support.py +++ b/libensemble/tests/regression_tests/support.py @@ -17,14 +17,26 @@ def nan_func(calc_in, persis_info, sim_specs, libE_info): return (H, persis_info) -def write_func(calc_in, persis_info, sim_specs, libE_info): +def write_sim_func(calc_in, persis_info, sim_specs, libE_info): out = np.zeros(1, dtype=sim_specs['out']) out['f'] = calc_in['x'] - with open('test_out.txt', 'a') as f: + with open('test_sim_out.txt', 'a') as f: f.write('sim_f received: {}\n'.format(out['f'])) return out, persis_info +def write_uniform_gen_func(H, persis_info, gen_specs, _): + ub = gen_specs['user']['ub'] + lb = gen_specs['user']['lb'] + n = len(lb) + b = gen_specs['user']['gen_batch_size'] + H_o = np.zeros(b, dtype=gen_specs['out']) + H_o['x'] = persis_info['rand_stream'].uniform(lb, ub, (b, n)) + with open('test_gen_out.txt', 'a') as f: + f.write('gen_f produced: {}\n'.format(H_o['x'])) + return H_o, persis_info + + uniform_or_localopt_gen_out = [('priority', float), ('local_pt', bool), ('known_to_aposmm', bool), diff --git a/libensemble/tests/regression_tests/test_deap_nsga2.py b/libensemble/tests/regression_tests/test_deap_nsga2.py index 210399e47..54e99f6c0 100644 --- a/libensemble/tests/regression_tests/test_deap_nsga2.py +++ b/libensemble/tests/regression_tests/test_deap_nsga2.py @@ -4,7 +4,7 @@ # """ # Do not change these lines - they are parsed by run-tests.sh -# TESTSUITE_COMMS: mpi local tcp +# TESTSUITE_COMMS: mpi local # TESTSUITE_NPROCS: 3 4 import numpy as np @@ -52,7 +52,7 @@ def deap_six_hump(H, persis_info, sim_specs, _): # State the generating function, its arguments, output, and necessary parameters. 
gen_specs = {'gen_f': gen_f, - 'in': ['sim_id'], + 'in': ['sim_id', 'generation', 'individual', 'fitness_values'], 'out': [('individual', float, ind_size), ('generation', int)], 'user': {'lb': lb, 'ub': ub, @@ -66,19 +66,45 @@ def deap_six_hump(H, persis_info, sim_specs, _): # libE Allocation function alloc_specs = {'out': [('given_back', bool)], 'alloc_f': alloc_f} -persis_info = add_unique_random_streams({}, nworkers + 1) # Tell libEnsemble when to stop # 'sim_max' = number of simulation calls # For deap, this should be pop_size*number of generations+1 exit_criteria = {'sim_max': pop_size*(ngen+1)} +for run in range(2): -# Perform the run -H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) + persis_info = add_unique_random_streams({}, nworkers + 1) -if is_master: - script_name = os.path.splitext(os.path.basename(__file__))[0] - assert flag == 0, script_name + " didn't exit correctly" - assert sum(H['returned']) >= exit_criteria['sim_max'], script_name + " didn't evaluate the sim_max points." - assert min(H['fitness_values']) <= -1.0315, script_name + " didn't find the global minimum for this problem." + if run == 1: + # Test loading in a previous set of (x,f)-pairs, or (individual, fitness_values)-pairs + + # Number of points in the sample + num_samp = 100 + + H0 = np.zeros(num_samp, dtype=[('individual', float, ind_size), ('generation', int), ('fitness_values', float), + ('sim_id', int), ('returned', bool), ('given_back', bool), ('given', bool)]) + + # Mark these points as already have been given to be evaluated, and returned, but not given_back. + H0[['given', 'given_back', 'returned']] = True + H0['generation'][:] = 1 + # Give these points sim_ids + H0['sim_id'] = range(num_samp) + + # "Load in" the points and their function values. 
(In this script, we are + # actually evaluating them, but in many cases, they are available from past + # evaluations + H0['individual'] = np.random.uniform(lb, ub, (num_samp, len(lb))) + for i, x in enumerate(H0['individual']): + H0['fitness_values'][i] = six_hump_camel_func(x) + else: + H0 = None + + # Perform the run + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0=H0) + + if is_master: + script_name = os.path.splitext(os.path.basename(__file__))[0] + assert flag == 0, script_name + " didn't exit correctly" + assert sum(H['returned']) >= exit_criteria['sim_max'], script_name + " didn't evaluate the sim_max points." + assert min(H['fitness_values']) <= -1.0315, script_name + " didn't find the global minimum for this problem." diff --git a/libensemble/tests/regression_tests/test_executor_hworld.py b/libensemble/tests/regression_tests/test_executor_hworld.py index fc7d5a66b..7c5c1f154 100644 --- a/libensemble/tests/regression_tests/test_executor_hworld.py +++ b/libensemble/tests/regression_tests/test_executor_hworld.py @@ -20,6 +20,7 @@ from libensemble.message_numbers import WORKER_DONE, WORKER_KILL_ON_ERR, WORKER_KILL_ON_TIMEOUT, TASK_FAILED from libensemble.libE import libE from libensemble.sim_funcs.executor_hworld import executor_hworld as sim_f +import libensemble.sim_funcs.six_hump_camel as six_hump_camel from libensemble.gen_funcs.sampling import uniform_random_sample as gen_f from libensemble.tools import parse_args, add_unique_random_streams from libensemble.tests.regression_tests.common import build_simfunc @@ -52,6 +53,7 @@ sim_app = './my_simtask.x' if not os.path.isfile(sim_app): build_simfunc() +sim_app2 = six_hump_camel.__file__ if USE_BALSAM: from libensemble.executors.balsam_executor import BalsamMPIExecutor @@ -59,7 +61,8 @@ else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor(auto_resources=use_auto_resources) -exctr.register_calc(full_path=sim_app, 
calc_type='sim') +exctr.register_calc(full_path=sim_app, calc_type='sim') # Default 'sim' app - backward compatible +exctr.register_calc(full_path=sim_app2, app_name='six_hump_camel') # Named app # if nworkers == 3: # CalcInfo.keep_worker_stat_files = True # Testing this functionality @@ -83,7 +86,7 @@ persis_info = add_unique_random_streams({}, nworkers + 1) -exit_criteria = {'elapsed_wallclock_time': 10} +exit_criteria = {'elapsed_wallclock_time': 20} # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, @@ -96,7 +99,8 @@ # manager kill - but should show in the summary file. # Repeat expected lists nworkers times and compare with list of status's # received from workers - calc_status_list_in = np.asarray([WORKER_DONE, WORKER_KILL_ON_ERR, WORKER_KILL_ON_TIMEOUT, TASK_FAILED, 0]) + calc_status_list_in = np.asarray([WORKER_DONE, WORKER_KILL_ON_ERR, WORKER_DONE, + WORKER_KILL_ON_TIMEOUT, TASK_FAILED, 0]) calc_status_list = np.repeat(calc_status_list_in, nworkers) # For debug @@ -108,7 +112,7 @@ # Check summary file: print('Checking expected task status against task summary file ...\n') - calc_desc_list_in = ['Completed', 'Worker killed task on Error', + calc_desc_list_in = ['Completed', 'Worker killed task on Error', 'Completed', 'Worker killed task on Timeout', 'Task Failed', 'Manager killed on finish'] diff --git a/libensemble/tests/regression_tests/test_fast_alloc.py b/libensemble/tests/regression_tests/test_fast_alloc.py index 78a68e503..35099f37d 100644 --- a/libensemble/tests/regression_tests/test_fast_alloc.py +++ b/libensemble/tests/regression_tests/test_fast_alloc.py @@ -28,7 +28,7 @@ num_pts = 30*(nworkers - 1) -sim_specs = {'sim_f': sim_f, 'in': ['x'], 'out': [('f', float)], 'user': {}} +sim_specs = {'sim_f': sim_f, 'in': ['x'], 'out': [('f', float), ('large', float, 1000000)], 'user': {}} gen_specs = {'gen_f': gen_f, 'in': ['sim_id'], @@ -40,7 +40,7 @@ persis_info = add_unique_random_streams({}, nworkers + 1) 
-exit_criteria = {'sim_max': num_pts, 'elapsed_wallclock_time': 300} +exit_criteria = {'sim_max': 2*num_pts, 'elapsed_wallclock_time': 300} if libE_specs['comms'] == 'tcp': # Can't use the same interface for manager and worker if we want @@ -69,4 +69,4 @@ if is_master: assert flag == 0 - assert len(H) == num_pts + assert len(H) == 2*num_pts diff --git a/libensemble/tests/regression_tests/test_mpi_runners.py b/libensemble/tests/regression_tests/test_mpi_runners.py index 26ec1a92a..0dcffe808 100644 --- a/libensemble/tests/regression_tests/test_mpi_runners.py +++ b/libensemble/tests/regression_tests/test_mpi_runners.py @@ -69,13 +69,13 @@ def runline_check(H, persis_info, sim_specs, libE_info): dry_run=True) outline = task.runline - exp_list[i] = exp_nodelist_for_worker(exp_list[i], libE_info['workerID']) + new_exp_list = exp_nodelist_for_worker(exp_list[i], libE_info['workerID']) - if outline != exp_list[i]: + if outline != new_exp_list: print('outline is: {}'.format(outline), flush=True) - print('exp is: {}'.format(exp_list[i]), flush=True) + print('exp is: {}'.format(new_exp_list), flush=True) - assert(outline == exp_list[i]) + assert(outline == new_exp_list) calc_status = WORKER_DONE output = np.zeros(1, dtype=sim_specs['out']) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py index 26379b6f8..06b006a56 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py @@ -66,7 +66,7 @@ persis_info = add_unique_random_streams({}, nworkers + 1) -exit_criteria = {'sim_max': 10000} +exit_criteria = {'sim_max': 1000} # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_scipy.py b/libensemble/tests/regression_tests/test_persistent_aposmm_scipy.py index 9017c7e98..516739296 
100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_scipy.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_scipy.py @@ -94,3 +94,22 @@ assert min_found >= 4, "Found {} minima".format(min_found) save_libE_output(H, persis_info, __file__, nworkers) + +# Now let's run on the same problem with a really large n (but we won't test +# convergence to all local min). Note that sim_f uses only entries x[0:2] +n = 400 +persis_info = add_unique_random_streams({}, nworkers + 1) +gen_specs['out'][0:2] = [('x', float, n), ('x_on_cube', float, n)] +gen_specs['user']['lb'] = np.zeros(n) +gen_specs['user']['ub'] = np.ones(n) +gen_specs['user']['lb'][:2] = [-3, -2] +gen_specs['user']['ub'][:2] = [3, 2] +gen_specs['user']['rk_const'] = 4.90247 +gen_specs['user'].pop('sample_points') +gen_specs['user']['localopt_method'] = 'scipy_Nelder-Mead' +sim_specs['out'] = [('f', float)] + +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) + +if is_master: + assert np.sum(H['returned']) >= exit_criteria['sim_max'], "Run didn't finish" diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py index ecfc32b86..7a42cc9de 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py @@ -43,6 +43,7 @@ 'localopt_method': 'LN_BOBYQA', 'xtol_abs': 1e-8, 'ftol_abs': 1e-8, + 'run_max_eval': 30, 'lb': np.array([0, -np.pi/2]), 'ub': np.array([2*np.pi, 3*np.pi/2]), 'periodic': True, diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py index bd40c371f..7ad11bb2a 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py @@ -53,6 
+53,7 @@ 'user': {'initial_sample_size': 0, # Don't need to do evaluations because the sampling already done below 'localopt_method': 'LD_MMA', 'rk_const': 0.5*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi), + 'stop_after_this_many_minima': 25, 'xtol_rel': 1e-6, 'ftol_rel': 1e-6, 'max_active_runs': 6, @@ -99,4 +100,6 @@ print(np.min(np.sum((H[H['local_min']]['x'] - m)**2, 1)), flush=True) assert np.min(np.sum((H[H['local_min']]['x'] - m)**2, 1)) < tol + assert len(H) < exit_criteria['sim_max'], "Test should have stopped early" + save_libE_output(H, persis_info, __file__, nworkers) diff --git a/libensemble/tests/regression_tests/test_sim_dirs_per_calc.py b/libensemble/tests/regression_tests/test_sim_dirs_per_calc.py index 545d4f851..a2746940c 100644 --- a/libensemble/tests/regression_tests/test_sim_dirs_per_calc.py +++ b/libensemble/tests/regression_tests/test_sim_dirs_per_calc.py @@ -18,7 +18,7 @@ import os from libensemble.libE import libE -from libensemble.tests.regression_tests.support import write_func as sim_f +from libensemble.tests.regression_tests.support import write_sim_func as sim_f from libensemble.gen_funcs.sampling import uniform_random_sample as gen_f from libensemble.tools import parse_args, add_unique_random_streams diff --git a/libensemble/tests/regression_tests/test_sim_dirs_per_worker.py b/libensemble/tests/regression_tests/test_sim_dirs_per_worker.py index dbe017391..01301254b 100644 --- a/libensemble/tests/regression_tests/test_sim_dirs_per_worker.py +++ b/libensemble/tests/regression_tests/test_sim_dirs_per_worker.py @@ -18,7 +18,7 @@ import os from libensemble.libE import libE -from libensemble.tests.regression_tests.support import write_func as sim_f +from libensemble.tests.regression_tests.support import write_sim_func as sim_f from libensemble.gen_funcs.sampling import uniform_random_sample as gen_f from libensemble.tools import parse_args, add_unique_random_streams diff --git a/libensemble/tests/regression_tests/test_sim_dirs_with_exception.py 
b/libensemble/tests/regression_tests/test_sim_dirs_with_exception.py index 3d9d594ff..2af0c849c 100644 --- a/libensemble/tests/regression_tests/test_sim_dirs_with_exception.py +++ b/libensemble/tests/regression_tests/test_sim_dirs_with_exception.py @@ -18,7 +18,7 @@ import os from libensemble.libE import libE -from libensemble.tests.regression_tests.support import write_func as sim_f +from libensemble.tests.regression_tests.support import write_sim_func as sim_f from libensemble.gen_funcs.sampling import uniform_random_sample as gen_f from libensemble.tools import parse_args, add_unique_random_streams from libensemble.libE_manager import ManagerException diff --git a/libensemble/tests/regression_tests/test_sim_dirs_with_gen_dirs.py b/libensemble/tests/regression_tests/test_sim_dirs_with_gen_dirs.py new file mode 100644 index 000000000..7759d9e12 --- /dev/null +++ b/libensemble/tests/regression_tests/test_sim_dirs_with_gen_dirs.py @@ -0,0 +1,102 @@ +# """ +# Runs libEnsemble with uniform random sampling and writes results into sim dirs. +# tests per-calculation sim_dir capabilities +# +# Execute via one of the following commands (e.g. 3 workers): +# mpiexec -np 4 python3 test_worker_exceptions.py +# python3 test_worker_exceptions.py --nworkers 3 --comms local +# python3 test_worker_exceptions.py --nworkers 3 --comms tcp +# +# The number of concurrent evaluations of the objective function will be 4-1=3. 
+# """ + +# Do not change these lines - they are parsed by run-tests.sh +# TESTSUITE_COMMS: mpi local tcp +# TESTSUITE_NPROCS: 2 4 + +import numpy as np +import os + +from libensemble.libE import libE +from libensemble.tests.regression_tests.support import write_sim_func as sim_f +from libensemble.tests.regression_tests.support import write_uniform_gen_func as gen_f +from libensemble.tools import parse_args, add_unique_random_streams + +nworkers, is_master, libE_specs, _ = parse_args() + +sim_input_dir = './sim_input_dir' +dir_to_copy_sim = sim_input_dir + '/copy_this_sim' +dir_to_symlink_sim = sim_input_dir + '/symlink_this_sim' + +gen_input_dir = './gen_input_dir' +dir_to_copy_gen = gen_input_dir + '/copy_this_gen' +dir_to_symlink_gen = gen_input_dir + '/symlink_this_gen' + +c_ensemble = './ensemble_combined_calcdirs_w' + str(nworkers) + '_' + libE_specs.get('comms') +print('creating ensemble dir: ', c_ensemble, flush=True) + +for dir in [sim_input_dir, dir_to_copy_sim, dir_to_symlink_sim, + gen_input_dir, dir_to_copy_gen, dir_to_symlink_gen]: + if is_master and not os.path.isdir(dir): + os.makedirs(dir, exist_ok=True) + +libE_specs['sim_dirs_make'] = True +libE_specs['gen_dirs_make'] = True + +libE_specs['ensemble_dir_path'] = c_ensemble +libE_specs['use_worker_dirs'] = False + +libE_specs['sim_dir_copy_files'] = [dir_to_copy_sim] +libE_specs['sim_dir_symlink_files'] = [dir_to_symlink_sim] + +libE_specs['gen_dir_copy_files'] = [dir_to_copy_gen] +libE_specs['gen_dir_symlink_files'] = [dir_to_symlink_gen] + +libE_specs['ensemble_copy_back'] = True + +sim_specs = {'sim_f': sim_f, 'in': ['x'], 'out': [('f', float)]} + +gen_specs = {'gen_f': gen_f, + 'out': [('x', float, (1,))], + 'user': {'gen_batch_size': 20, + 'lb': np.array([-3]), + 'ub': np.array([3]), + } + } + +persis_info = add_unique_random_streams({}, nworkers + 1) + +exit_criteria = {'sim_max': 21} + +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, + persis_info, libE_specs=libE_specs) + + 
+def check_copied(type): + input_copied = [] + for base, files, _ in os.walk(c_ensemble): + basedir = base.split('/')[-1] + if basedir.startswith(type): + input_copied.append(all([os.path.basename(j) in files for j in + libE_specs[type + '_dir_copy_files'] + + libE_specs[type + '_dir_symlink_files']])) + + assert all(input_copied), \ + 'All input files not copied or symlinked to each {} calc dir'.format(type) + + +if is_master: + assert os.path.isdir(c_ensemble), 'Ensemble directory {} not created.'.format(c_ensemble) + sim_dir_sum = sum(['sim' in i for i in os.listdir(c_ensemble)]) + assert sim_dir_sum == exit_criteria['sim_max'], \ + 'Number of sim directories ({}) does not match sim_max ({}).' \ + .format(sim_dir_sum, exit_criteria['sim_max']) + + assert any(['gen' in i for i in os.listdir(c_ensemble)]), \ + 'No gen directories created.' + + check_copied('sim') + check_copied('gen') + + assert all([i in os.listdir(c_ensemble) for i in os.listdir(c_ensemble+'_back')]), \ + "Copyback dir doesn't contain the same contents as ensemble dir" diff --git a/libensemble/tests/regression_tests/test_sim_input_dir_option.py b/libensemble/tests/regression_tests/test_sim_input_dir_option.py index aef950256..95e2dfaa4 100644 --- a/libensemble/tests/regression_tests/test_sim_input_dir_option.py +++ b/libensemble/tests/regression_tests/test_sim_input_dir_option.py @@ -18,7 +18,7 @@ import os from libensemble.libE import libE -from libensemble.tests.regression_tests.support import write_func as sim_f +from libensemble.tests.regression_tests.support import write_sim_func as sim_f from libensemble.gen_funcs.sampling import uniform_random_sample as gen_f from libensemble.tools import parse_args, add_unique_random_streams @@ -36,6 +36,7 @@ libE_specs['ensemble_dir_path'] = o_ensemble libE_specs['sim_dirs_make'] = False libE_specs['sim_dir_symlink_files'] = ['./test_sim_input_dir_option.py'] # to cover FileExistsError catch +libE_specs['ensemble_copy_back'] = True sim_specs = {'sim_f': 
sim_f, 'in': ['x'], 'out': [('f', float)]} @@ -59,7 +60,7 @@ 'Ensemble directory {} not created.'.format(o_ensemble) assert os.path.basename(dir_to_copy) in os.listdir(o_ensemble), \ 'Input file not copied over.' - with open(os.path.join(o_ensemble, 'test_out.txt'), 'r') as f: + with open(os.path.join(o_ensemble, 'test_sim_out.txt'), 'r') as f: lines = f.readlines() assert len(lines) == exit_criteria['sim_max'], \ diff --git a/libensemble/tests/regression_tests/test_vtmop.py b/libensemble/tests/regression_tests/test_vtmop.py index 35d03fe5e..994e29bb4 100644 --- a/libensemble/tests/regression_tests/test_vtmop.py +++ b/libensemble/tests/regression_tests/test_vtmop.py @@ -23,6 +23,9 @@ # TESTSUITE_NPROCS: import numpy as np +import os +import time +from libensemble.utils.timer import Timer # Import libEnsemble items for this test from libensemble.libE import libE @@ -32,6 +35,8 @@ from libensemble.alloc_funcs.only_one_gen_alloc import ensure_one_active_gen as alloc_f from libensemble.tools import parse_args, save_libE_output, add_unique_random_streams +timer = Timer() + # Set the problem dimensions here num_dims = 5 num_objs = 3 @@ -66,18 +71,18 @@ def sim_f(H, *unused): 'in': ['x', 'f'], 'out': [('x', float, num_dims)], 'user': { - # Set the number of objectives. The number of design variables is - # inferred based on the length of lb. + # Set the number of objectives. The number of design variables + # is inferred based on the length of lb. 'num_obj': num_objs, # Set the bound constraints. 'lb': lower_bounds, 'ub': upper_bounds, # search_batch_size is the number of points used to search # each local trust region (using Latin hypercube design). 
- # This should be a multiple of the number of concurrent function - # evaluations and on the order of 2*d (where d is the number of - # design variables) - 'search_batch_size': int(np.ceil(2*num_dims/nworkers)*nworkers), + # This should be a multiple of the number of concurrent + # function evaluations and on the order of 4*d (where d is + # the number of design variables) + 'search_batch_size': int(np.ceil(4*num_dims/nworkers)*nworkers), # opt_batch_size is the preferred number of candidate designs. # When the actual number of candidates is not a multiple of # opt_batch_size, additional candidates are randomly generated @@ -92,55 +97,101 @@ def sim_f(H, *unused): # initial database will cause an error since the surrogates # cannot be fit without sufficient data. 'first_batch_size': 1000, - # Set the trust region radius. This setting is problem - # dependent. A good starting place would be between 10% and - # 25% of the median edge length of the bounding box (err on - # the smaller side when the number of design variables is - # greater than 5 or 6). - 'trust_rad': np.median(upper_bounds - lower_bounds)*0.1, - # Are you reloading from a checkpoint + # Set the trust region radius as a fraction of ub[:]-lb[:]. + # This setting is problem dependent. A good starting place + # would be between 0.1 and 0.2. + 'trust_rad': 0.1, + # Are you reloading from a checkpoint? 
'use_chkpt': False}, } # Set up the allocator alloc_specs = {'alloc_f': alloc_f, 'out': []} -for run in range(2): - if run == 1: - # In the second run, we initialize VTMOP with an initial sample: +s1 = [] +H = [] + +for run in range(3): + if run == 0: + # Run for 1100 evaluations or 300 seconds + H0 = None + exit_criteria = {'sim_max': 1100, 'elapsed_wallclock_time': 300} + + elif run == 1: + # In the second run, we initialize VTMOP with an initial sample of previously evaluated points np.random.seed(0) - sample_size = 1000 - X = np.random.uniform(gen_specs['user']['lb'], gen_specs['user']['ub'], (sample_size, num_dims)) - f = np.zeros((sample_size, num_objs)) + size = 1000 - H0 = np.zeros(sample_size, dtype=[('x', float, num_dims), ('f', float, num_objs), ('sim_id', int), - ('returned', bool), ('given', bool)]) + # Generate the sample + X = np.random.uniform(gen_specs['user']['lb'], gen_specs['user']['ub'], (size, num_dims)) + f = np.zeros((size, num_objs)) + + # Initialize H0 + H0 = np.zeros(size, dtype=[('x', float, num_dims), ('f', float, num_objs), ('sim_id', int), + ('returned', bool), ('given', bool)]) H0['x'] = X - H0['sim_id'] = range(sample_size) + H0['sim_id'] = range(size) H0[['given', 'returned']] = True - for i in range(sample_size): + # Perform objective function evaluations + for i in range(size): Out, _ = sim_f(H0[[i]]) H0['f'][i] = Out['f'] - gen_specs['user']['use_chkpt'] = True + # Run for 200 more evaluations or 300 seconds + exit_criteria = {'sim_max': 200, 'elapsed_wallclock_time': 300} + gen_specs['user']['first_batch_size'] = 0 - else: - H0 = None + gen_specs['user']['use_chkpt'] = False # Need to set this as it can be overwritten within the libE call. 
+ + elif run == 2: + # In the third run, we restart VTMOP by loading in the history array saved in run==1 + gen_specs['user']['use_chkpt'] = True + + # Inelegant way to have the manager copy over the VTMOP checkpoint + # file, and have every worker get the H value from the run==1 case to + # use in the restart. + try: + os.remove('manager_done_file') + except OSError: + pass + + if is_master: + os.rename('vtmop.chkpt_finishing_' + s1, 'vtmop.chkpt') + np.save('H_for_vtmop_restart.npy', H) + open('manager_done_file', 'w').close() + else: + while not os.path.isfile('manager_done_file'): + time.sleep(0.1) + H = np.load('H_for_vtmop_restart.npy') + + # Initialize H0 with values from H (from the run==1 case) + size = sum(H['returned']) + H0 = np.zeros(size, dtype=[('x', float, num_dims), ('f', float, num_objs), ('sim_id', int), + ('returned', bool), ('given', bool)]) + H0['x'] = H['x'][:size] + H0['sim_id'] = range(size) + H0[['given', 'returned']] = True + H0['f'] = H['f'][:size] + + # Run for 200 more evaluations or 300 seconds + exit_criteria = {'sim_max': 200, 'elapsed_wallclock_time': 300} # Persistent info between iterations persis_info = add_unique_random_streams({}, nworkers + 1) - persis_info['next_to_give'] = 0 + persis_info['next_to_give'] = 0 if H0 is None else len(H0) persis_info['total_gen_calls'] = 0 - # Run for 2000 evaluations or 300 seconds - exit_criteria = {'sim_max': 1100, 'elapsed_wallclock_time': 300} - # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs=alloc_specs, libE_specs=libE_specs, H0=H0) - # The master takes care of checkpointint/output + # The master takes care of checkpointing/output if is_master: + # Renaming vtmop checkpointing file, if needed for later use. 
+ timer.start() + s1 = timer.date_start.replace(' ', '_') + os.rename('vtmop.chkpt', 'vtmop.chkpt_finishing_' + s1) + assert flag == 0 save_libE_output(H, persis_info, __file__, nworkers) diff --git a/libensemble/tests/regression_tests/test_zero_resource_workers.py b/libensemble/tests/regression_tests/test_zero_resource_workers.py new file mode 100644 index 000000000..0410e50ef --- /dev/null +++ b/libensemble/tests/regression_tests/test_zero_resource_workers.py @@ -0,0 +1,170 @@ +# """ +# Runs libEnsemble testing the zero_resource_workers argument. +# +# Execute via one of the following commands (e.g. 3 workers): +# mpiexec -np 4 python3 test_zero_resource_workers.py +# python3 test_zero_resource_workers.py --nworkers 3 --comms local +# python3 test_zero_resource_workers.py --nworkers 3 --comms tcp +# +# The number of concurrent evaluations of the objective function will be 4-1=3. +# """ + +import os +import sys +import numpy as np + +from libensemble.message_numbers import WORKER_DONE +from libensemble.libE import libE +from libensemble.gen_funcs.persistent_uniform_sampling import persistent_uniform as gen_f +from libensemble.alloc_funcs.start_only_persistent import only_persistent_gens as alloc_f +from libensemble.tools import parse_args, add_unique_random_streams +from libensemble.executors.mpi_executor import MPIExecutor +from libensemble import libE_logger + +# libE_logger.set_level('DEBUG') # For testing the test +libE_logger.set_level('INFO') + +# Do not change these lines - they are parsed by run-tests.sh +# TESTSUITE_COMMS: mpi local +# TESTSUITE_NPROCS: 3 4 + +nodes_per_worker = 2 + + +def exp_nodelist_for_worker(exp_list, workerID): + """Modify expected node-lists based on workerID""" + comps = exp_list.split() + new_line = [] + for comp in comps: + if comp.startswith('node-'): + new_node_list = [] + node_list = comp.split(',') + for node in node_list: + node_name, node_num = node.split('-') + new_num = int(node_num) + nodes_per_worker*(workerID - 2) # 
For 1 persistent gen + new_node = '-'.join([node_name, str(new_num)]) + new_node_list.append(new_node) + new_list = ','.join(new_node_list) + new_line.append(new_list) + else: + new_line.append(comp) + return ' '.join(new_line) + + +def runline_check(H, persis_info, sim_specs, libE_info): + """Check run-lines produced by executor provided by a list""" + calc_status = 0 + x = H['x'][0][0] + exctr = MPIExecutor.executor + test_list = sim_specs['user']['tests'] + exp_list = sim_specs['user']['expect'] + + for i, test in enumerate(test_list): + task = exctr.submit(calc_type='sim', + num_procs=test.get('nprocs', None), + num_nodes=test.get('nnodes', None), + ranks_per_node=test.get('ppn', None), + extra_args=test.get('e_args', None), + app_args='--testid ' + test.get('testid', None), + stdout='out.txt', + stderr='err.txt', + hyperthreads=test.get('ht', None), + dry_run=True) + + outline = task.runline + new_exp_list = exp_nodelist_for_worker(exp_list[i], libE_info['workerID']) + + if outline != new_exp_list: + print('outline is: {}\nexp is: {}'.format(outline, new_exp_list), flush=True) + + assert(outline == new_exp_list) + + calc_status = WORKER_DONE + output = np.zeros(1, dtype=sim_specs['out']) + output['f'][0] = np.linalg.norm(x) + return output, persis_info, calc_status + +# -------------------------------------------------------------------- + + +nworkers, is_master, libE_specs, _ = parse_args() +rounds = 1 +sim_app = '/path/to/fakeapp.x' +comms = libE_specs['comms'] +libE_specs['zero_resource_workers'] = [1] + + +# To allow visual checking - log file not used in test +log_file = 'ensemble_zrw_comms_' + str(comms) + '_wrks_' + str(nworkers) + '.log' +libE_logger.set_filename(log_file) + +# For varying size test - relate node count to nworkers +in_place = libE_specs['zero_resource_workers'] +nsim_workers = nworkers-len(in_place) +comms = libE_specs['comms'] +nodes_per_worker = 2 +node_file = 'nodelist_zero_resource_workers_' + str(comms) + '_wrks_' + str(nworkers) 
+if is_master: + if os.path.exists(node_file): + os.remove(node_file) + with open(node_file, 'w') as f: + for i in range(1, (nsim_workers)*nodes_per_worker + 1): + f.write('node-' + str(i) + '\n') + f.flush() + os.fsync(f) +if comms == 'mpi': + libE_specs['comm'].Barrier() + + +# Mock up system +customizer = {'mpi_runner': 'mpich', # Select runner: mpich, openmpi, aprun, srun, jsrun + 'runner_name': 'mpirun', # Runner name: Replaces run command if not None + 'cores_on_node': (16, 64), # Tuple (physical cores, logical cores) + 'node_file': node_file} # Name of file containing a node-list + +# Create executor and register sim to it. +exctr = MPIExecutor(zero_resource_workers=in_place, central_mode=True, auto_resources=True, custom_info=customizer) +exctr.register_calc(full_path=sim_app, calc_type='sim') + + +if nworkers < 2: + sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") + +n = 2 +sim_specs = {'sim_f': runline_check, + 'in': ['x'], + 'out': [('f', float)], + } + +gen_specs = {'gen_f': gen_f, + 'in': [], + 'out': [('x', float, (n,))], + 'user': {'gen_batch_size': 20, + 'lb': np.array([-3, -2]), + 'ub': np.array([3, 2])} + } + +alloc_specs = {'alloc_f': alloc_f, 'out': [('given_back', bool)]} +persis_info = add_unique_random_streams({}, nworkers + 1) +exit_criteria = {'sim_max': (nsim_workers)*rounds} + +# Each worker has 2 nodes. 
Basic test list for portable options +test_list_base = [{'testid': 'base1', 'nprocs': 2, 'nnodes': 1, 'ppn': 2, 'e_args': '--xarg 1'}, # Under use + {'testid': 'base2'}, # Give no config and no extra_args + ] + +exp_mpich = \ + ['mpirun -hosts node-1 -np 2 --ppn 2 --xarg 1 /path/to/fakeapp.x --testid base1', + 'mpirun -hosts node-1,node-2 -np 32 --ppn 16 /path/to/fakeapp.x --testid base2', + ] + +test_list = test_list_base +exp_list = exp_mpich +sim_specs['user'] = {'tests': test_list, 'expect': exp_list} + + +# Perform the run +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, + alloc_specs, libE_specs) + +# All asserts are in sim func diff --git a/libensemble/tests/run-tests.sh b/libensemble/tests/run-tests.sh index c500b08fc..f63d94fa8 100755 --- a/libensemble/tests/run-tests.sh +++ b/libensemble/tests/run-tests.sh @@ -9,10 +9,11 @@ export RUN_UNIT_TESTS=true #Recommended for pre-push / CI tests export RUN_COV_TESTS=true #Provide coverage report export RUN_REG_TESTS=true #Recommended for pre-push / CI tests export RUN_PEP_TESTS=false #Code syle conventions +export PYTHON_FLAGS='' #Flags for PYTHON_RUN # Regression test options #export REG_TEST_LIST='test_number1.py test_number2.py' #selected/ordered -export REG_TEST_LIST=test_*.py #unordered +export REG_TEST_LIST=test_*.py #unordered # override with -y # export REG_TEST_PROCESS_COUNT_LIST='2 4' export REG_USE_PYTEST=false export REG_TEST_OUTPUT_EXT=std.out #/dev/null @@ -127,39 +128,42 @@ cleanup() { THISDIR=${PWD} cd $ROOT_DIR/$TESTING_DIR filelist=(.cov_merge_out*); [ -e ${filelist[0]} ] && rm .cov_merge_out* + filelist=(ensemble_*); [ -e ${filelist[0]} ] && rm -r ensemble_* for DIR in $UNIT_TEST_SUBDIR $UNIT_TEST_NOMPI_SUBDIR $UNIT_TEST_LOGGER_SUBDIR ; do cd $ROOT_DIR/$DIR - filelist=(libE_history_at_abort_*.npy); [ -e ${filelist[0]} ] && rm libE_history_at_abort_*.npy - filelist=(*.out); [ -e ${filelist[0]} ] && rm *.out - filelist=(*.err); [ -e ${filelist[0]} ] && rm *.err - 
filelist=(*.pickle); [ -e ${filelist[0]} ] && rm *.pickle - filelist=(.cov_unit_out*); [ -e ${filelist[0]} ] && rm .cov_unit_out* - filelist=(my_simtask.x); [ -e ${filelist[0]} ] && rm my_simtask.x - filelist=(task_my_simtask.x*.out); [ -e ${filelist[0]} ] && rm task_my_simtask.x*.out - filelist=(*libe_summary.txt*); [ -e ${filelist[0]} ] && rm *libe_summary.txt* - filelist=(*libE_stats.txt*); [ -e ${filelist[0]} ] && rm *libE_stats.txt* - filelist=(my_machinefile); [ -e ${filelist[0]} ] && rm my_machinefile - filelist=(libe_stat_files); [ -e ${filelist[0]} ] && rm -r libe_stat_files - filelist=(ensemble.log); [ -e ${filelist[0]} ] && rm ensemble.log + filelist=(libE_history_at_abort_*.npy); [ -e ${filelist[0]} ] && rm libE_history_at_abort_*.npy + filelist=(*.out); [ -e ${filelist[0]} ] && rm *.out + filelist=(*.err); [ -e ${filelist[0]} ] && rm *.err + filelist=(*.pickle); [ -e ${filelist[0]} ] && rm *.pickle + filelist=(.cov_unit_out*); [ -e ${filelist[0]} ] && rm .cov_unit_out* + filelist=(my_simtask.x); [ -e ${filelist[0]} ] && rm my_simtask.x + filelist=(libe_task_*.out); [ -e ${filelist[0]} ] && rm libe_task_*.out + filelist=(*libE_stats.txt*); [ -e ${filelist[0]} ] && rm *libE_stats.txt* + filelist=(my_machinefile); [ -e ${filelist[0]} ] && rm my_machinefile + filelist=(libe_stat_files); [ -e ${filelist[0]} ] && rm -r libe_stat_files + filelist=(ensemble.log); [ -e ${filelist[0]} ] && rm ensemble.log + filelist=(H_test.npy); [ -e ${filelist[0]} ] && rm H_test.npy done cd $ROOT_DIR/$REG_TEST_SUBDIR - filelist=(*.$REG_TEST_OUTPUT_EXT); [ -e ${filelist[0]} ] && rm *.$REG_TEST_OUTPUT_EXT - filelist=(*.npy); [ -e ${filelist[0]} ] && rm *.npy - filelist=(*.pickle); [ -e ${filelist[0]} ] && rm *.pickle - filelist=(.cov_reg_out*); [ -e ${filelist[0]} ] && rm .cov_reg_out* - filelist=(*active_runs.txt); [ -e ${filelist[0]} ] && rm *active_runs.txt - filelist=(*.err); [ -e ${filelist[0]} ] && rm *.err - filelist=(outfile*.txt); [ -e ${filelist[0]} ] && rm 
outfile*.txt - filelist=(machinefile*); [ -e ${filelist[0]} ] && rm machinefile* - filelist=(task_my_simtask.x.*.out); [ -e ${filelist[0]} ] && rm task_my_simtask.x.*.out - filelist=(*libe_summary.txt*); [ -e ${filelist[0]} ] && rm *libe_summary.txt* - filelist=(*libE_stats.txt*); [ -e ${filelist[0]} ] && rm *libE_stats.txt* - filelist=(my_simtask.x); [ -e ${filelist[0]} ] && rm my_simtask.x - filelist=(libe_stat_files); [ -e ${filelist[0]} ] && rm -r libe_stat_files - filelist=(ensemble.log); [ -e ${filelist[0]} ] && rm ensemble.log - filelist=(ensemble_*); [ -e ${filelist[0]} ] && rm -r ensemble_* - filelist=(sim_*); [ -e ${filelist[0]} ] && rm -r sim_* - filelist=(nodelist_*); [ -e ${filelist[0]} ] && rm nodelist_* + filelist=(*.$REG_TEST_OUTPUT_EXT); [ -e ${filelist[0]} ] && rm *.$REG_TEST_OUTPUT_EXT + filelist=(*.npy); [ -e ${filelist[0]} ] && rm *.npy + filelist=(*.pickle); [ -e ${filelist[0]} ] && rm *.pickle + filelist=(.cov_reg_out*); [ -e ${filelist[0]} ] && rm .cov_reg_out* + filelist=(*active_runs.txt); [ -e ${filelist[0]} ] && rm *active_runs.txt + filelist=(*.err); [ -e ${filelist[0]} ] && rm *.err + filelist=(outfile*.txt); [ -e ${filelist[0]} ] && rm outfile*.txt + filelist=(machinefile*); [ -e ${filelist[0]} ] && rm machinefile* + filelist=(libe_task_*.out); [ -e ${filelist[0]} ] && rm libe_task_*.out + filelist=(*libE_stats.txt*); [ -e ${filelist[0]} ] && rm *libE_stats.txt* + filelist=(my_simtask.x); [ -e ${filelist[0]} ] && rm my_simtask.x + filelist=(libe_stat_files); [ -e ${filelist[0]} ] && rm -r libe_stat_files + filelist=(ensemble.log); [ -e ${filelist[0]} ] && rm ensemble.log + filelist=(ensemble_*); [ -e ${filelist[0]} ] && rm -r ensemble_* + filelist=(sim_*); [ -e ${filelist[0]} ] && rm -r sim_* + filelist=(gen_*); [ -e ${filelist[0]} ] && rm -r gen_* + filelist=(nodelist_*); [ -e ${filelist[0]} ] && rm nodelist_* + filelist=(x_*.txt y_*.txt); [ -e ${filelist[0]} ] && rm x_*.txt y_*.txt + filelist=(opt_*.txt_flag); [ -e ${filelist[0]} ] 
&& rm opt_*.txt_flag cd $THISDIR } @@ -199,15 +203,17 @@ usage() { echo " -t Run the regression tests using TCP comms" echo " -p {version} Select a version of python. E.g. -p 2 will run with the python2 exe" echo " Note: This will literally run the python2/python3 exe. Default runs python" + echo " -A {-flag arg} Supply arguments to python" echo " -n {name} Supply a name to this test run" echo " -a {args} Supply a string of args to add to mpiexec line" + echo " -y {args} Supply a list of regression tests as a reg. expression e.g. '-y test_persistent_aposmm*'" echo "" echo "Note: If none of [-mlt] are given, the default is to run tests for all comms" echo "" exit 1 } -while getopts ":p:n:a:hcszurmlt" opt; do +while getopts ":p:n:a:y:A:hcszurmlt" opt; do case $opt in p) echo "Parameter supplied for Python version: $OPTARG" >&2 @@ -253,6 +259,14 @@ while getopts ":p:n:a:hcszurmlt" opt; do echo "Running only the MPI regression tests" export RUN_MPI=true ;; + y) + echo "Running with user supplied test list" + export REG_TEST_LIST="$OPTARG" + ;; + A) + echo "Python arguments passed: $OPTARG" >&2 + PYTHON_FLAGS="$PYTHON_FLAGS $OPTARG" + ;; h) usage ;; @@ -312,14 +326,14 @@ if [ $CLEAN_ONLY = "true" ]; then fi; #If not supplied will go to just python (no number) - eg. 
with tox/virtual envs -PYTHON_RUN=python$PYTHON_VER +PYTHON_RUN="python$PYTHON_VER $PYTHON_FLAGS" echo -e "Python run: $PYTHON_RUN" textreset=$(tput sgr0) fail_color=$(tput bold;tput setaf 1) #red pass_color=$(tput bold;tput setaf 2) #green -titl_colour=$(tput bold;tput setaf 6) #cyan -hint_colour=$(tput bold;tput setaf 4) #blue +titl_color=$(tput bold;tput setaf 6) #cyan +hint_color=$(tput bold;tput setaf 4) #blue # Note - pytest exit codes # Exit code 0: All tests were collected and passed successfully @@ -469,6 +483,8 @@ if [ "$root_found" = true ]; then test_num=$((test_num+1)) test_start=$(current_time) + echo -e "\n ${titl_color}---Test $test_num: $TEST_SCRIPT starting with $LAUNCHER on $NPROCS processes ${textreset}" + if [ "$REG_USE_PYTEST" = true ]; then if [ "$LAUNCHER" = mpi ]; then mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN -m pytest $TEST_SCRIPT >> $TEST_SCRIPT.$NPROCS'procs'.$REG_TEST_OUTPUT_EXT 2>test.err diff --git a/libensemble/tests/scaling_tests/forces/theta_submit_balsam.sh b/libensemble/tests/scaling_tests/forces/theta_submit_balsam.sh index c91614316..87775bbd2 100755 --- a/libensemble/tests/scaling_tests/forces/theta_submit_balsam.sh +++ b/libensemble/tests/scaling_tests/forces/theta_submit_balsam.sh @@ -1,12 +1,13 @@ #!/bin/bash -x #COBALT -t 00:30:00 +#COBALT -O libE_forces_MPI_balsam #COBALT -n 129 #COBALT -q default #COBALT -A -# Script to launch libEnsemble using Balsam within Conda. Conda environment must be set up. - -# Requires Balsam is installed and a database initialized (this can be the default database). +# Script to launch libEnsemble using Balsam. +# Assumes Conda environment is set up. +# Requires Balsam is installed and a database initialized. # To be run with central job management # - Manager and workers run on one node (or a dedicated set of nodes). @@ -18,43 +19,48 @@ export EXE=run_libe_forces.py # Number of workers. 
export NUM_WORKERS=127 +# Number of nodes to run libE +export LIBE_NODES=2 + # Wallclock for libE job in minutes (supplied to Balsam - make at least several mins smaller than wallclock for this submission to ensure job is launched) export LIBE_WALLCLOCK=25 # Name of working directory where Balsam places running jobs/output (inside the database directory) export WORKFLOW_NAME=libe_workflow -# export SCRIPT_ARGS='' #Default No args -# export SCRIPT_ARGS=$(($LIBE_WALLCLOCK-5)) -export SCRIPT_ARGS="--comms mpi --nworkers $NUM_WORKERS" +# If user script takes ``elapsed_wallclock_time`` argument. +# export SCRIPT_ARGS=$(($LIBE_WALLCLOCK-3)) +export SCRIPT_ARGS="" # Name of Conda environment export CONDA_ENV_NAME= # Name of database -export DBASE_NAME= - -# Conda location - theta specific -# export PATH=/opt/intel/python/2017.0.035/intelpython35/bin:$PATH -# export LD_LIBRARY_PATH=~/.conda/envs/$CONDA_ENV_NAME/lib:$LD_LIBRARY_PATH - -export PYTHONNOUSERSITE=1 #Ensure environment isolated - -export PMI_NO_FORK=1 # Required for python kills on Theta +export BALSAM_DB_NAME= export LIBE_PLOTS=true # Require plot scripts (see at end) export BALSAM_PLOTS=true # Require plot scripts (see at end) export PLOT_DIR=.. -# Activate conda environment -. activate $CONDA_ENV_NAME +# Required for killing tasks from workers on Theta +export PMI_NO_FORK=1 -# Unload Theta modules that may interfere with job monitoring/kills +# Unload Theta modules that may interfere with task monitoring/kills module unload trackdeps module unload darshan module unload xalt -. 
balsamactivate $DBASE_NAME +# Obtain Conda PATH from miniconda-3/latest module +CONDA_DIR=/soft/datascience/conda/miniconda3/latest/bin + +# Ensure environment isolated +export PYTHONNOUSERSITE=1 + +# Activate conda environment +source $CONDA_DIR/activate $CONDA_ENV_NAME + +# Activate Balsam database +source balsamactivate $BALSAM_DB_NAME # Make sure no existing apps/jobs balsam rm apps --all --force @@ -63,27 +69,30 @@ wait sleep 3 # Add calling script to Balsam database as app and job. -THIS_DIR=$PWD -SCRIPT_BASENAME=${EXE%.*} - -# Running libE on one node - one manager and upto 63 workers -# NUM_NODES=1 -# RANKS_PER_NODE=$((NUM_WORKERS+1)) # One node auto +export THIS_DIR=$PWD +export SCRIPT_BASENAME=${EXE%.*} # Multiple nodes -NUM_NODES=2 -RANKS_PER_NODE=64 - -# All tasks -OUT_FILES_TO_RETURN="*.out *.txt *.log" +export LIBE_PROCS=$((NUM_WORKERS+1)) # Manager and workers +export PROCS_PER_NODE=$((LIBE_PROCS/LIBE_NODES)) # Must divide evenly balsam app --name $SCRIPT_BASENAME.app --exec $EXE --desc "Run $SCRIPT_BASENAME" -balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes $NUM_NODES --ranks-per-node $RANKS_PER_NODE --url-out="local:/$THIS_DIR" --stage-out-files="${OUT_FILES_TO_RETURN}" --url-in="local:/$THIS_DIR/*" --yes +balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME \ + --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS \ + --wall-time-minutes $LIBE_WALLCLOCK \ + --num-nodes $LIBE_NODES --ranks-per-node $PROCS_PER_NODE \ + --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" \ + --url-in="local:/$THIS_DIR/*" --yes # Hyper-thread libE (note this will not affect HT status of user calcs - only libE itself) # E.g. Running 255 workers and one manager on one libE node. 
-# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes $NUM_NODES --ranks-per-node $RANKS_PER_NODE --threads-per-core 4 --url-out="local:/$THIS_DIR" --stage-out-files="${OUT_FILES_TO_RETURN}" --url-in="local:/$THIS_DIR/*" --yes +# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME \ +# --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS \ +# --wall-time-minutes $LIBE_WALLCLOCK \ +# --num-nodes 1 --ranks-per-node 256 --threads-per-core 4 \ +# --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" \ +# --url-in="local:/$THIS_DIR/*" --yes #Run job balsam launcher --consume-all --job-mode=mpi --num-transition-threads=1 @@ -94,10 +103,10 @@ if [[ $LIBE_PLOTS = "true" ]]; then python $PLOT_DIR/plot_libe_histogram.py if [[ $BALSAM_PLOTS = "true" ]]; then -# export MPLBACKEND=TkAgg python $PLOT_DIR/plot_util_v_time.py python $PLOT_DIR/plot_jobs_v_time.py python $PLOT_DIR/plot_waiting_v_time.py fi -. balsamdeactivate +wait +source balsamdeactivate diff --git a/libensemble/tests/scaling_tests/warpx/readme.txt b/libensemble/tests/scaling_tests/warpx/readme.txt index 2a87c5317..84e29d9dc 100644 --- a/libensemble/tests/scaling_tests/warpx/readme.txt +++ b/libensemble/tests/scaling_tests/warpx/readme.txt @@ -3,8 +3,10 @@ # with the input file in sim/inputs on Summit. ##################################################### -A convergence study, starting from an input file similar to ./sim/inputs (with a different set of physical parameters generated the APOSMM generator in a LibEnsemble run) is presented below. -The only parameter changed to go from one resolution to the next is amr.n_cell. +A convergence study, starting from an input file similar to ./sim/inputs (with +a different set of physical parameters generated by the APOSMM generator in a +libEnsemble run) is presented below. 
The only parameter changed to go from one +resolution to the next is amr.n_cell. resolution amr.n_cell duration f res0 16 1024 10s 2.6340181298685122e-05 @@ -13,14 +15,21 @@ res2 64 4096 54s 4.557730935993e-06 res3 128 8192 214s 5.683672880520962e-06 res4 256 16384 1092s 5.667607648747023e-06 -I ran the last run on 2 GPUs instead of 1, and it took 780s instead of 1092s, meaning we can run it on either 1 GPU or 2. -That way, we can explore it either having all resolutions on the same number of GPUs, or on different number of GPUs. -In the end, we would probably be interested in the former, which can also be further investigating my going to higher resolution. +I ran the last run on 2 GPUs instead of 1, and it took 780s instead of 1092s, +meaning we can run it on either 1 GPU or 2. That way, we can explore it either +having all resolutions on the same number of GPUs, or on different numbers of +GPUs. In the end, we would probably be interested in the former, which can also +be further investigated by going to higher resolution. The simulations can be run with the same input file, with the following changes: -- The timeout in the batch script should be larger for higher resolutions (see numbers above) -- The jsrun command should be modified as, e.g., "jsrun ... warpx.exe inputs amr.n_cell=128 8192" for resolution res3 +- The timeout in the batch script should be larger for higher resolutions (see + numbers above) +- The jsrun command should be modified as, e.g., "jsrun ... warpx.exe inputs + amr.n_cell=128 8192" for resolution res3 Note that this test shows a nice converge for a case with relatively high emittance. -In the optimization process, some runs will have a luwer emittance (potentially 10x), and they may require a higher resolution to reach the same level of convergence. -Let us keep this in mind, and maybe add a res5 to the list above. This high-resolution one should also fit on 1 GPU, but it will be worth testing. 
+In the optimization process, some runs will have a lower emittance (potentially +10x), and they may require a higher resolution to reach the same level of +convergence. Let us keep this in mind, and maybe add a res5 to the list above. +This high-resolution one should also fit on 1 GPU, but it will be worth +testing. diff --git a/libensemble/tests/unit_tests/test_env_resources.py b/libensemble/tests/unit_tests/test_env_resources.py index 2f72c8e13..9bdffa255 100644 --- a/libensemble/tests/unit_tests/test_env_resources.py +++ b/libensemble/tests/unit_tests/test_env_resources.py @@ -176,44 +176,44 @@ def test_lsf_nodelist_shortform_seq(): assert nodelist == exp_out, "Nodelist returned does not match expected" -def test_abbrev_nodenames_nochange_slurm(): +def test_shortnames_nochange_slurm(): env_resources = EnvResources() # Test Slurm abbrev exp_names = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-2345'] - env_resources.schedular = 'Slurm' - abbrev_names = env_resources.abbrev_nodenames(exp_names) + env_resources.scheduler = 'Slurm' + abbrev_names = env_resources.shortnames(exp_names) assert abbrev_names == exp_names, "Abbreviated names returned do not match expected" del env_resources -def test_abbrev_nodenames_slurm(): +def test_shortnames_slurm(): env_resources = EnvResources() # Test Slurm abbrev exp_names = ['knl-0019', 'knl-0021', 'knl-0022'] full_names = ['knl-0019.some.suffix', 'knl-0021.some.suffix', 'knl-0022.diff_suffix'] - env_resources.schedular = 'Slurm' - abbrev_names = env_resources.abbrev_nodenames(full_names) + env_resources.scheduler = 'Slurm' + abbrev_names = env_resources.shortnames(full_names) assert abbrev_names == exp_names, "Abbreviated names returned do not match expected" del env_resources -def test_abbrev_nodenames_nochange_cobalt(): +def test_shortnames_nochange_cobalt(): env_resources = EnvResources() # Test Cobalt abbrev exp_names = ['21', '22', '137', '138', '1234', '11234'] - env_resources.schedular = 
'Cobalt' - abbrev_names = env_resources.abbrev_nodenames(exp_names) + env_resources.scheduler = 'Cobalt' + abbrev_names = env_resources.shortnames(exp_names) assert abbrev_names == exp_names, "Abbreviated names returned do not match expected" del env_resources -def test_abbrev_nodenames_cobalt(): +def test_shortnames_cobalt(): env_resources = EnvResources() # Test Cobalt abbrev exp_names = ['20', '21', '22', '137', '138', '1234', '11234'] full_names = ['nid00020', 'nid00021', 'nid00022', 'nid00137', 'nid00138', 'nid01234', 'nid11234'] - env_resources.schedular = 'Cobalt' - abbrev_names = env_resources.abbrev_nodenames(full_names) + env_resources.scheduler = 'Cobalt' + abbrev_names = env_resources.shortnames(full_names) assert abbrev_names == exp_names, "Abbreviated names returned do not match expected" del env_resources @@ -246,9 +246,9 @@ def test_abbrev_nodenames_cobalt(): test_lsf_nodelist_shortform_single() test_lsf_nodelist_shortform_seq() - test_abbrev_nodenames_nochange_slurm() - test_abbrev_nodenames_slurm() - test_abbrev_nodenames_nochange_cobalt() - test_abbrev_nodenames_cobalt() + test_shortnames_nochange_slurm() + test_shortnames_slurm() + test_shortnames_nochange_cobalt() + test_shortnames_cobalt() teardown_standalone_run() diff --git a/libensemble/tests/unit_tests/test_executor.py b/libensemble/tests/unit_tests/test_executor.py index a3a8d8cc1..edea3d94c 100644 --- a/libensemble/tests/unit_tests/test_executor.py +++ b/libensemble/tests/unit_tests/test_executor.py @@ -2,12 +2,13 @@ # Integration Test of executor module for libensemble # Test does not require running full libensemble import os +import re import sys import time import pytest import socket from libensemble.resources.resources import ResourcesException -from libensemble.executors.executor import Executor, ExecutorException +from libensemble.executors.executor import Executor, ExecutorException, TimeoutExpired from libensemble.executors.executor import NOT_STARTED_STATES @@ -195,6 +196,39 
@@ def test_launch_and_poll(): assert task.run_attempts == 1, "task.run_attempts should be 1. Returned " + str(task.run_attempts) +def test_launch_and_wait(): + """ Test of launching and waiting on task""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + setup_executor() + exctr = Executor.executor + cores = NCORES + args_for_sim = 'sleep 1' + task = exctr.submit(calc_type='sim', num_procs=cores, app_args=args_for_sim) + task.wait() + assert task.finished, "task.finished should be True. Returned " + str(task.finished) + assert task.state == 'FINISHED', "task.state should be FINISHED. Returned " + str(task.state) + task.wait() # Already complete + assert task.finished, "task.finished should be True. Returned " + str(task.finished) + assert task.state == 'FINISHED', "task.state should be FINISHED. Returned " + str(task.state) + + +def test_launch_and_wait_timeout(): + """ Test of launching and waiting on task timeout (and kill)""" + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + setup_executor() + exctr = Executor.executor + cores = NCORES + args_for_sim = 'sleep 5' + task = exctr.submit(calc_type='sim', num_procs=cores, app_args=args_for_sim) + try: + task.wait(timeout=0.5) + except TimeoutExpired: + assert not task.finished, "task.finished should be False. Returned " + str(task.finished) + task.kill() + assert task.finished, "task.finished should be True. Returned " + str(task.finished) + assert task.state == 'USER_KILLED', "task.state should be USER_KILLED. 
Returned " + str(task.state) + + def test_launch_wait_on_run(): """ Test of launching task with wait_on_run """ print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) @@ -473,6 +507,12 @@ def test_launch_no_app(): assert e.args[0] == 'Default sim app is not set' else: assert 0 + try: + _ = exctr.submit(num_procs=cores, app_args=args_for_sim) + except ExecutorException as e: + assert e.args[0] == 'Either app_name or calc_type must be set' + else: + assert 0 def test_kill_task_with_no_submit(): @@ -490,13 +530,17 @@ def test_kill_task_with_no_submit(): assert 0 # Create a task directly with no submit (Not supported for users) + # Debatably make taskID 0 as executor should be deleted if use setup function. + # But this allows any task ID. + exp_msg = ('Attempting to kill task libe_task_my_simtask.x_.+that has ' + 'no process ID - check tasks been launched') + exp_re = re.compile(exp_msg) myapp = exctr.sim_default_app task1 = Task(app=myapp, stdout='stdout.txt') try: exctr.kill(task1) except ExecutorException as e: - assert e.args[0][:50] == 'Attempting to kill task task_my_simtask.x.simfunc_' - assert e.args[0][52:] == ' that has no process ID - check tasks been launched' + assert bool(re.match(exp_re, e.args[0])) else: assert 0 @@ -508,13 +552,15 @@ def test_poll_task_with_no_submit(): exctr = Executor.executor # Create a task directly with no submit (Not supported for users) + exp_msg = ('Polled task libe_task_my_simtask.x_.+ ' + 'has no process ID - check tasks been launched') + exp_re = re.compile(exp_msg) myapp = exctr.sim_default_app task1 = Task(app=myapp, stdout='stdout.txt') try: task1.poll() except ExecutorException as e: - assert e.args[0][:38] == 'Polled task task_my_simtask.x.simfunc_' - assert e.args[0][40:] == ' has no process ID - check tasks been launched' + assert bool(re.match(exp_re, e.args[0])) else: assert 0 @@ -556,9 +602,45 @@ def test_retries_run_fail(): assert task.run_attempts == 5, "task.run_attempts should be 5. 
Returned " + str(task.run_attempts) +def test_register_apps(): + print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) + setup_executor() # This registers an app my_simtask.x (default sim) + exctr = Executor.executor + exctr.register_calc(full_path='/path/to/fake_app1.x', app_name='fake_app1') + exctr.register_calc(full_path='/path/to/fake_app2.py', app_name='fake_app2') + + # Check selected attributes + app = exctr.get_app('my_simtask.x') + assert app.name == 'my_simtask.x' + assert app.gname == 'libe_app_my_simtask.x' + + app = exctr.get_app('fake_app1') + assert app.name == 'fake_app1' + assert app.gname == 'libe_app_fake_app1' + assert app.exe == 'fake_app1.x' + assert app.calc_dir == '/path/to' + + app = exctr.get_app('fake_app2') + assert app.name == 'fake_app2' + assert app.gname == 'libe_app_fake_app2' + + py_exe, app_exe = app.full_path.split() + assert os.path.split(py_exe)[1].startswith('python') + assert app_exe == '/path/to/fake_app2.py' + + try: + app = exctr.get_app('fake_app3') + except ExecutorException as e: + assert e.args[0] == 'Application fake_app3 not found in registry' + # Ordering of dictionary may vary + # assert e.args[1] == "Registered applications: ['my_simtask.x', 'fake_app1', 'fake_app2']" + + if __name__ == "__main__": # setup_module(__file__) test_launch_and_poll() + test_launch_and_wait() + test_launch_and_wait_timeout() test_launch_wait_on_run() test_kill_on_file() test_kill_on_timeout() @@ -576,4 +658,5 @@ def test_retries_run_fail(): test_task_failure() test_retries_launch_fail() test_retries_run_fail() + test_register_apps() # teardown_module(__file__) diff --git a/libensemble/tests/unit_tests/test_persistent_aposmm.py b/libensemble/tests/unit_tests/test_persistent_aposmm.py index 01939aa41..0f9bf8d98 100644 --- a/libensemble/tests/unit_tests/test_persistent_aposmm.py +++ b/libensemble/tests/unit_tests/test_persistent_aposmm.py @@ -5,7 +5,7 @@ libE_specs = {'comm': {}} -def test_persis_apossm_localopt_test(): +def 
test_persis_aposmm_localopt_test(): _, _, gen_specs_0, _, _ = setup.hist_setup1() H = np.zeros(4, dtype=[('f', float), ('sim_id', bool), ('dist_to_unit_bounds', float), ('returned', bool)]) @@ -107,6 +107,6 @@ def combined_func(x): if __name__ == "__main__": - test_persis_apossm_localopt_test() + test_persis_aposmm_localopt_test() test_update_history_optimal() test_standalone_persistent_aposmm() diff --git a/libensemble/tests/unit_tests/test_resources.py b/libensemble/tests/unit_tests/test_resources.py index fc18af1fc..0a610b590 100644 --- a/libensemble/tests/unit_tests/test_resources.py +++ b/libensemble/tests/unit_tests/test_resources.py @@ -38,6 +38,11 @@ def teardown_function(function): os.remove('node_list') +def sname(name): + print('sname being set') + return name.split(".", 1)[0] + + # Tests ======================================================================================== # Tests Resources.get_global_nodelist (This requires above tests to work) @@ -87,12 +92,13 @@ def test_get_global_nodelist_frm_lsf_shortform(): def test_get_global_nodelist_standalone(): mynode = socket.gethostname() + exp_node = mynode # sname(mynode) env_resources = EnvResources(nodelist_env_slurm="THIS_ENV_VARIABLE_IS_DEF_NOT_SET", nodelist_env_cobalt="THIS_ENV_VARIABLE_IS_DEF_NOT_SET", nodelist_env_lsf="THIS_ENV_VARIABLE_IS_DEF_NOT_SET", nodelist_env_lsf_shortform="THIS_ENV_VARIABLE_IS_DEF_NOT_SET") global_nodelist = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources) - assert global_nodelist == [mynode], "global_nodelist returned does not match expected" + assert global_nodelist == [exp_node], "global_nodelist returned does not match expected" def test_get_global_nodelist_frm_wrklst_file(): @@ -196,7 +202,7 @@ def test_get_local_nodelist_central_mode_remove_libE_proc(): f.write(mynode + '\n') resources = Resources(central_mode=True) - resources.add_comm_info(libE_nodes=mynode) + resources.add_comm_info(libE_nodes=[mynode]) # Now mock up some more 
stuff - so consistent @@ -283,19 +289,20 @@ def test_get_local_nodelist_distrib_mode(): # assert 0 workerID = 5 - exp_out = [mynode] + exp_node = mynode # sname(mynode) + exp_out = [exp_node] local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources) assert local_nodelist == exp_out, "local_nodelist returned does not match expected" num_workers = 1 workerID = 1 - exp_out = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', mynode, 'knl-0137', 'knl-0138', 'knl-0139'] + exp_out = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', exp_node, 'knl-0137', 'knl-0138', 'knl-0139'] local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources) assert local_nodelist == exp_out, "local_nodelist returned does not match expected" num_workers = 4 workerID = 3 - exp_out = [mynode, 'knl-0137'] + exp_out = [exp_node, 'knl-0137'] local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources) assert local_nodelist == exp_out, "local_nodelist returned does not match expected" @@ -303,12 +310,12 @@ def test_get_local_nodelist_distrib_mode(): num_workers = 16 workerID = 9 - exp_out = [mynode] + exp_out = [exp_node] local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources) assert local_nodelist == exp_out, "local_nodelist returned does not match expected" workerID = 10 - exp_out = [mynode] + exp_out = [exp_node] local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources) assert local_nodelist == exp_out, "local_nodelist returned does not match expected" @@ -317,6 +324,7 @@ def test_get_local_nodelist_distrib_mode(): def test_get_local_nodelist_distrib_mode_uneven_split(): mynode = socket.gethostname() + exp_node = mynode # sname(mynode) nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234'] with open('node_list', 'w') as f: for i, node in enumerate(nodelist_in): @@ -329,7 +337,7 @@ def 
test_get_local_nodelist_distrib_mode_uneven_split(): # May not be at head of list - should perhaps be warning or enforced workerID = 2 - exp_out = ['knl-0137', mynode, 'knl-0138', 'knl-0139'] + exp_out = ['knl-0137', exp_node, 'knl-0138', 'knl-0139'] local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources) assert local_nodelist == exp_out, "local_nodelist returned does not match expected" os.remove('node_list') @@ -387,6 +395,34 @@ def test_worker_resources(): assert worker.workers_per_node == 2, 'worker.workers_per_node does not match' +def test_map_workerid_to_index(): + num_workers = 4 + zero_resource_list = [] + + for workerID in range(1, num_workers+1): + index = WorkerResources.map_workerid_to_index(num_workers, workerID, zero_resource_list) + assert index == workerID - 1, "index incorrect. Received: " + str(index) + + zero_resource_list = [1] + for workerID in range(2, num_workers+1): + index = WorkerResources.map_workerid_to_index(num_workers, workerID, zero_resource_list) + assert index == workerID - 2, "index incorrect. Received: " + str(index) + + zero_resource_list = [1, 2] + for workerID in range(3, num_workers+1): + index = WorkerResources.map_workerid_to_index(num_workers, workerID, zero_resource_list) + assert index == workerID - 3, "index incorrect. Received: " + str(index) + + zero_resource_list = [1, 3] + workerID = 2 + index = WorkerResources.map_workerid_to_index(num_workers, workerID, zero_resource_list) + assert index == 0, "index incorrect. Received: " + str(index) + + workerID = 4 + index = WorkerResources.map_workerid_to_index(num_workers, workerID, zero_resource_list) + assert index == 1, "index incorrect. 
Received: " + str(index) + + if __name__ == "__main__": setup_standalone_run() @@ -405,5 +441,6 @@ def test_worker_resources(): test_get_local_nodelist_distrib_mode_uneven_split() test_worker_resources() + test_map_workerid_to_index() teardown_standalone_run() diff --git a/libensemble/tests/unit_tests/test_sim_dir_properties.py b/libensemble/tests/unit_tests/test_sim_dir_properties.py index 7b3c345e2..204af4ba5 100644 --- a/libensemble/tests/unit_tests/test_sim_dir_properties.py +++ b/libensemble/tests/unit_tests/test_sim_dir_properties.py @@ -76,54 +76,6 @@ def __init__(self, libE_specs, prefix, startdir, loc_stack): shutil.rmtree(dir) -def test_copy_back_exception(): - """ Test _copy_back handling of FileExistsError with certain - settings""" - class FakeWorker: - """ Enough information to test _copy_back() """ - def __init__(self, libE_specs, prefix, startdir, loc_stack): - self.libE_specs = libE_specs - self.prefix = prefix - self.startdir = startdir - self.loc_stack = loc_stack - - inputdir = './calc' - copybackdir = './calc_back' - inputfile = './calc/file' - - for dir in [inputdir, copybackdir]: - os.makedirs(dir, exist_ok=True) - - libE_specs = {'sim_dirs_make': False, 'ensemble_dir_path': inputdir, - 'ensemble_copy_back': True} - - ls = LocationStack() - ls.register_loc('test', inputfile) - fake_worker = FakeWorker(libE_specs, inputdir, '.', ls) - - # Testing catch and continue - for i in range(2): - Worker._copy_back(fake_worker) - assert 'file' in os.listdir(copybackdir), \ - 'File not copied back to starting dir' - - libE_specs = {'sim_dirs_make': True, 'ensemble_dir_path': inputdir, - 'ensemble_copy_back': True} - fake_worker = FakeWorker(libE_specs, inputdir, '.', ls) - - flag = 1 - - # Testing catch and raise - try: - Worker._copy_back(fake_worker) - except FileExistsError: - flag = 0 - assert flag == 0 - - for dir in [inputdir, copybackdir]: - shutil.rmtree(dir) - - def test_worker_dirs_but_no_sim_dirs(): """Test Worker._make_calc_dir() directory 
structure without sim_dirs""" inputdir = './calc' @@ -204,6 +156,5 @@ def test_loc_stack_FileExists_exceptions(): test_range_two_ranges() test_range_mixes() test_copy_back() - test_copy_back_exception() test_worker_dirs_but_no_sim_dirs() test_loc_stack_FileExists_exceptions() diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index 3a39692a8..19e5718b5 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -3,7 +3,11 @@ def avail_worker_ids(W, persistent=None): - "Get available workers (active == 0), filtered by persis_state." + """Returns available workers (``active == 0``), as an array, filtered by ``persis_state``. + + :param W: :doc:`Worker array<../data_structures/worker_array>` + :param persistent: Optional Boolean. If specified, also return workers with given persis_state. + """ if persistent is None: return W['worker_id'][W['active'] == 0] if persistent: @@ -14,22 +18,39 @@ def avail_worker_ids(W, persistent=None): def count_gens(W): - "Return the number of generators in a set of workers." + """Return the number of active generators in a set of workers. + + :param W: :doc:`Worker array<../data_structures/worker_array>` + """ return sum(W['active'] == EVAL_GEN_TAG) def test_any_gen(W): - "Return True if a generator worker is active." + """Return True if a generator worker is active. + + :param W: :doc:`Worker array<../data_structures/worker_array>` + """ return any(W['active'] == EVAL_GEN_TAG) def count_persis_gens(W): - "Return the number of persistent generators in a set of workers." + """Return the number of active persistent generators in a set of workers. + + :param W: :doc:`Worker array<../data_structures/worker_array>` + """ return sum(W['persis_state'] == EVAL_GEN_TAG) def sim_work(Work, i, H_fields, H_rows, persis_info, **libE_info): - "Add sim work record to work array." + """Add sim work record to given Work array. 
+ + :param Work: Work container (indexed by worker ID) that the new record is added to + :param i: Worker ID. + :param H_fields: Which fields from H to send + :param persis_info: current persis_info dictionary + + :returns: None + """ libE_info['H_rows'] = H_rows Work[i] = {'H_fields': H_fields, 'persis_info': persis_info, @@ -38,7 +59,15 @@ def sim_work(Work, i, H_fields, H_rows, persis_info, **libE_info): def gen_work(Work, i, H_fields, H_rows, persis_info, **libE_info): - "Add gen work record to work array." + """Add gen work record to given Work array. + + :param Work: Work container (indexed by worker ID) that the new record is added to + :param i: Worker ID. + :param H_fields: Which fields from H to send + :param persis_info: current persis_info dictionary + + :returns: None + """ libE_info['H_rows'] = H_rows Work[i] = {'H_fields': H_fields, 'persis_info': persis_info, diff --git a/libensemble/tools/fields_keys.py b/libensemble/tools/fields_keys.py index b54e74432..0c4549704 100644 --- a/libensemble/tools/fields_keys.py +++ b/libensemble/tools/fields_keys.py @@ -27,13 +27,23 @@ 'out', # 'user'] # -libE_spec_calc_dir_keys = ['sim_dirs_make', - 'ensemble_copy_back', - 'sim_dir_copy_files', +libE_spec_calc_dir_misc = ['ensemble_copy_back', 'ensemble_dir_path', - 'use_worker_dirs', - 'sim_dir_symlink_files', - 'sim_input_dir'] + 'use_worker_dirs'] + +libE_spec_sim_dir_keys = ['sim_dirs_make', + 'sim_dir_copy_files', + 'sim_dir_symlink_files', + 'sim_input_dir'] + +libE_spec_gen_dir_keys = ['gen_dirs_make', + 'gen_dir_copy_files', + 'gen_dir_symlink_files', + 'gen_input_dir'] + +libE_spec_calc_dir_combined = libE_spec_calc_dir_misc + \ + libE_spec_sim_dir_keys + \ + libE_spec_gen_dir_keys allowed_libE_spec_keys = ['abort_on_exception', # 'authkey', # @@ -49,4 +59,5 @@ 'save_H_and_persis_on_abort', # 'workerID', # 'worker_timeout', # - 'worker_cmd'] + libE_spec_calc_dir_keys + 'zero_resource_workers', # + 'worker_cmd'] + libE_spec_calc_dir_combined diff --git a/libensemble/tools/gen_support.py 
b/libensemble/tools/gen_support.py index 344468f7b..2f129f133 100644 --- a/libensemble/tools/gen_support.py +++ b/libensemble/tools/gen_support.py @@ -1,15 +1,23 @@ from libensemble.message_numbers import STOP_TAG, PERSIS_STOP, UNSET_TAG, EVAL_GEN_TAG -def sendrecv_mgr_worker_msg(comm, output, status=None): +def sendrecv_mgr_worker_msg(comm, output): """Send message from worker to manager and receive response. + + :param comm: libEnsemble communicator object + :param output: Output array to be sent to manager + :returns: message tag, Work dictionary, calc_in array """ send_mgr_worker_msg(comm, output) - return get_mgr_worker_msg(comm, status=status) + return get_mgr_worker_msg(comm) def send_mgr_worker_msg(comm, output): """Send message from worker to manager. + + :param comm: libEnsemble communicator object + :param output: Output array to be sent to manager + :returns: None """ D = {'calc_out': output, 'libE_info': {'persistent': True}, @@ -19,8 +27,11 @@ def send_mgr_worker_msg(comm, output): comm.send(EVAL_GEN_TAG, D) -def get_mgr_worker_msg(comm, status=None): +def get_mgr_worker_msg(comm): """Get message to worker from manager. + + :param comm: libEnsemble communicator object + :returns: message tag, Work dictionary, calc_in array """ tag, Work = comm.recv() if tag in [STOP_TAG, PERSIS_STOP]: diff --git a/libensemble/tools/tools.py b/libensemble/tools/tools.py index 3909b84a2..96cfa3bd9 100644 --- a/libensemble/tools/tools.py +++ b/libensemble/tools/tools.py @@ -35,15 +35,14 @@ ('\n' + 79*'*' + '\n' + "User generator script will be creating sim_id.\n" + "Take care to do this sequentially.\n" + - "Also, any information given back for existing sim_id values will be overwritten!\n" + - "So everything in gen_specs['out'] should be in gen_specs['in']!" 
+ + "Information given back to the gen_f for existing sim_id values may be overwritten!\n" + '\n' + 79*'*' + '\n\n') # ==================== Ensemble directory re-use error ========================= -_USER_SIM_DIR_WARNING = \ +_USER_CALC_DIR_WARNING = \ ('\n' + 79*'*' + '\n' + - "libEnsemble attempted to reuse {} as a parent directory for sim_dirs.\n" + + "libEnsemble attempted to reuse {} as a parent directory for calc dirs.\n" + "If allowed to continue, previous results may have been overwritten!\n" + "Resolve this by ensuring libE_specs['ensemble_dir_path'] is unique for each run." + '\n' + 79*'*' + '\n\n') diff --git a/postproc_scripts/compare_npy.py b/postproc_scripts/compare_npy.py index c408552ce..b1e38ffae 100755 --- a/postproc_scripts/compare_npy.py +++ b/postproc_scripts/compare_npy.py @@ -2,34 +2,58 @@ '''Script to compare libEnsemble history arrays in files. +E.g., ./compare_npy.py out1.npy out2.npy + If two *.npy files are provided they are compared with each other. If one *.npy file is provided if is compared with a hard-coded expected file (by default located at ../expected.npy) -Default NumPy tolerances are used for comparison (rtol=1e-05, atol=1e-08) and + +Default tolerances used for comparison are (rtol=1e-05, atol=1e-08). These +can be overwritten with -r (--rtol) and -a (--atol) flags. + +E.g., ./compare_npy.py out1.npy out2.npy -r 1e-03 + Nans compare as equal. Variable fields (such as those containing a time) -are ignored. -''' +are ignored. 
In some cases you may have to ignore further user-defined fields +''' import sys import numpy as np +import argparse -if len(sys.argv) > 2: - results = np.load(sys.argv[1]) - exp_results = np.load(sys.argv[2]) -elif len(sys.argv) > 1: - results = np.load(sys.argv[1]) - exp_results_file = "../expected.npy" - exp_results = np.load(exp_results_file) -else: - print('You need to supply an .npy file - aborting') - sys.exit() +desc = "Script to compare libEnsemble history arrays in files" +exmple = '''examples: + + ./compare_npy.py out1.npy out2.npy + ./compare_npy.py out1.npy out2.npy --rtol 1e-03 --atol 1e-06 + ''' exclude_fields = ['gen_worker', 'sim_worker', 'gen_time', 'given_time'] # list of fields to ignore locate_mismatch = True +parser = argparse.ArgumentParser(description=desc, epilog=exmple, + formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument('-r', '--rtol', dest='rtol', type=float, default=1e-05, help='rel. tolerance') +parser.add_argument('-a', '--atol', dest='atol', type=float, default=1e-08, help='abs. 
tolerance') +parser.add_argument('args', nargs='*', help='*.npy files to compare') +args = parser.parse_args() + +rtol = args.rtol +atol = args.atol +files = args.args + +if len(files) >= 1: + results = np.load(files[0]) + exp_results = np.load(files[1]) if len(files) >= 2 else np.load("../expected.npy") +else: + parser.print_help() + sys.exit() + compare_fields = tuple(filter(lambda x: x not in exclude_fields, exp_results.dtype.names)) +match = all([np.allclose(exp_results[name], results[name], + rtol=rtol, atol=atol, equal_nan=True) + for name in compare_fields]) -match = all([np.allclose(exp_results[name], results[name], equal_nan=True) for name in compare_fields]) print('Compare results: {}\n'.format(match)) if not locate_mismatch: @@ -38,6 +62,7 @@ if not match: for name in compare_fields: for i in range(len(results)): - assert np.isclose(exp_results[name][i], results[name][i], equal_nan=True), \ + assert np.allclose(exp_results[name][i], + results[name][i], rtol=rtol, atol=atol, equal_nan=True), \ 'Mismatch in row ' + str(i) + ' field: ' + name + '. ' \ + str(exp_results[name][i]) + ' ' + str(results[name][i]) diff --git a/postproc_scripts/plot_pareto_3d.py b/postproc_scripts/plot_pareto_3d.py index 0c1fb33b8..6795896db 100755 --- a/postproc_scripts/plot_pareto_3d.py +++ b/postproc_scripts/plot_pareto_3d.py @@ -3,6 +3,11 @@ import sys import matplotlib.pyplot as plt +# The following is not explicitly called but is needed for 3d plotting to work +# with older versions of python/matplotlib. It is not needed for python3.8 with +# matplotlib version 3.2.1. +from mpl_toolkits import mplot3d # noqa + # Loop through objective points in f and extract the Pareto front. # input: f is a list (dimensions n X p) of n p-dimensional objective points. 
diff --git a/postproc_scripts/readme.rst b/postproc_scripts/readme.rst index e800478e0..a6a447bf9 100644 --- a/postproc_scripts/readme.rst +++ b/postproc_scripts/readme.rst @@ -25,7 +25,7 @@ Results analysis scripts * ``print_npy.py``: Prints to screen from a given ``*.npy`` file containing a NumPy structured array. Use ``done`` to print only the lines containing - ``''returned'`` points. Example:: + ``'returned'`` points. Example:: ./print_npy.py run_libe_forces_results_History_length=1000_evals=8.npy done diff --git a/setup.py b/setup.py index de05e315b..df23f3c5e 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ def run_tests(self): setup( name='libensemble', - version='0.7.0', + version='0.7.1', description='Library to coordinate the concurrent evaluation of dynamic ensembles of calculations', url='https://github.com/Libensemble/libensemble', author='Jeffrey Larson, Stephen Hudson, Stefan M. Wild, David Bindel and John-Luke Navarro', @@ -53,7 +53,7 @@ def run_tests(self): package_data={'libensemble.sim_funcs.branin': ['known_minima_and_func_values']}, - install_requires=['numpy'], + install_requires=['numpy', 'psutil'], # If run tests through setup.py - downloads these but does not install tests_require=['pytest>=3.1', @@ -64,8 +64,8 @@ def run_tests(self): ], extras_require={ - 'extras': ['scipy', 'nlopt', 'mpi4py', 'petsc', 'petsc4py', 'DFO-LS', 'deap'], - 'docs': ['sphinxcontrib.bibtex']}, + 'extras': ['scipy', 'nlopt', 'mpi4py', 'petsc', 'petsc4py', 'DFO-LS', 'deap', 'mpmath'], + 'docs': ['sphinx', 'sphinxcontrib.bibtex', 'sphinx_rtd_theme']}, classifiers=[ 'Development Status :: 4 - Beta',